@lukaszraczylo/cloudflare-crawl-mcp 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example ADDED
@@ -0,0 +1,9 @@
1
+ # Cloudflare API Token (get from https://dash.cloudflare.com/profile/api-tokens)
2
+ # Required permissions: Account > Browser Rendering > Edit
3
+ CF_API_TOKEN=your_cloudflare_api_token
4
+
5
+ # Cloudflare Account ID (get from https://dash.cloudflare.com/_/account)
6
+ CF_ACCOUNT_ID=your_cloudflare_account_id
7
+
8
+ # Rate limit: REST API requests per minute (default: 6 for Free, 600 for Paid)
9
+ # CF_RATE_LIMIT=6
package/FUNDING.yml ADDED
@@ -0,0 +1,2 @@
1
+ github: lukaszraczylo
2
+ custom: https://github.com/sponsors/lukaszraczylo
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Lukasz Raczylo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,302 @@
1
+ # @lukaszraczylo/cloudflare-crawl-mcp
2
+
3
+ <p align="center">
4
+ <a href="https://www.npmjs.com/package/@lukaszraczylo/cloudflare-crawl-mcp">
5
+ <img src="https://img.shields.io/npm/v/@lukaszraczylo/cloudflare-crawl-mcp" alt="NPM Version">
6
+ </a>
7
+ <a href="https://github.com/lukaszraczylo/cloudflare-crawl-mcp/blob/main/LICENSE">
8
+ <img src="https://img.shields.io/badge/license-MIT-blue.svg" alt="License">
9
+ </a>
10
+ </p>
11
+
12
+ MCP server for crawling websites using Cloudflare Browser Rendering API. Supports multiple output formats including Markdown, HTML, and JSON.
13
+
14
+ ## Features
15
+
16
+ - **Multiple Output Formats**: Choose between Markdown, HTML, or JSON output
17
+ - **Configurable Crawling**: Control depth, page limits, and link following
18
+ - **Pattern Filtering**: Include/exclude URLs using wildcard patterns
19
+ - **JavaScript Rendering**: Execute JavaScript for dynamic content (or disable for static content)
20
+ - **Environment-Based Secrets**: Securely manage credentials via environment variables
21
+
22
+ ## Prerequisites
23
+
24
+ - Node.js 18+
25
+ - Cloudflare account with Browser Rendering API access
26
+ - Cloudflare API Token with `Browser Rendering` permissions
27
+ - Cloudflare Account ID
28
+
29
+ ## Quick Start
30
+
31
+ ```bash
32
+ # Clone and setup
33
+ npm install
34
+ npm run build
35
+
36
+ # Run with environment variables
37
+ CF_API_TOKEN=your_token CF_ACCOUNT_ID=your_account_id npm start
38
+ ```
39
+
40
+ ## Installation
41
+
42
+ ### 1. Clone the Repository
43
+
44
+ ```bash
45
+ git clone https://github.com/lukaszraczylo/cloudflare-crawl-mcp.git
46
+ cd cloudflare-crawl-mcp
47
+ ```
48
+
49
+ ### 2. Install Dependencies
50
+
51
+ ```bash
52
+ npm install
53
+ ```
54
+
55
+ ### 3. Build the Server
56
+
57
+ ```bash
58
+ npm run build
59
+ ```
60
+
61
+ ### 4. Configure Environment Variables
62
+
63
+ Copy the example environment file and add your credentials:
64
+
65
+ ```bash
66
+ cp .env.example .env
67
+ ```
68
+
69
+ Edit `.env` with your Cloudflare credentials:
70
+
71
+ ```
72
+ CF_API_TOKEN=your_cloudflare_api_token
73
+ CF_ACCOUNT_ID=your_cloudflare_account_id
74
+ ```
75
+
76
+ #### Getting Cloudflare Credentials
77
+
78
+ 1. **Account ID**: Find it at https://dash.cloudflare.com/_/account
79
+ 2. **API Token**: Create one at https://dash.cloudflare.com/profile/api-tokens with these permissions:
80
+ - `Account` > `Browser Rendering` > `Edit`
81
+
82
+ ## MCP Configuration
83
+
84
+ ### Claude Desktop (macOS)
85
+
86
+ Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
87
+
88
+ ```json
89
+ {
90
+ "mcpServers": {
91
+ "cloudflare-crawl": {
92
+ "command": "npm",
93
+ "args": ["start"],
94
+ "env": {
95
+ "CF_API_TOKEN": "your_api_token",
96
+ "CF_ACCOUNT_ID": "your_account_id"
97
+ },
98
+ "path": "/path/to/cloudflare-crawl-mcp"
99
+ }
100
+ }
101
+ }
102
+ ```
103
+
104
+ ### Claude Code (CLI)
105
+
106
+ ```json
107
+ {
108
+ "mcpServers": {
109
+ "cloudflare-crawl": {
110
+ "command": "npm",
111
+ "args": ["start"],
112
+ "env": {
113
+ "CF_API_TOKEN": "your_api_token",
114
+ "CF_ACCOUNT_ID": "your_account_id"
115
+ }
116
+ }
117
+ }
118
+ }
119
+ ```
120
+
121
+ ### Cursor
122
+
123
+ Add to `~/.cursor/settings.json` (MCP configuration):
124
+
125
+ ```json
126
+ {
127
+ "mcpServers": {
128
+ "cloudflare-crawl": {
129
+ "command": "npm",
130
+ "args": ["start"],
131
+ "env": {
132
+ "CF_API_TOKEN": "your_api_token",
133
+ "CF_ACCOUNT_ID": "your_account_id"
134
+ },
135
+ "path": "/path/to/cloudflare-crawl-mcp"
136
+ }
137
+ }
138
+ }
139
+ ```
140
+
141
+ ## Available Tools
142
+
143
+ ### crawl_url_markdown
144
+
145
+ Crawl a website and return content in **Markdown** format.
146
+
147
+ ```typescript
148
+ {
149
+ "name": "crawl_url_markdown",
150
+ "arguments": {
151
+ "url": "https://example.com/docs",
152
+ "limit": 50,
153
+ "depth": 2,
154
+ "includePatterns": ["https://example.com/docs/**"],
155
+ "excludePatterns": ["https://example.com/docs/archive/**"],
156
+ "render": true
157
+ }
158
+ }
159
+ ```
160
+
161
+ ### crawl_url_html
162
+
163
+ Crawl a website and return content in **HTML** format.
164
+
165
+ ```typescript
166
+ {
167
+ "name": "crawl_url_html",
168
+ "arguments": {
169
+ "url": "https://example.com",
170
+ "limit": 10
171
+ }
172
+ }
173
+ ```
174
+
175
+ ### crawl_url_json
176
+
177
+ Crawl a website and return content in **JSON** format (uses Workers AI for data extraction).
178
+
179
+ ```typescript
180
+ {
181
+ "name": "crawl_url_json",
182
+ "arguments": {
183
+ "url": "https://example.com/products",
184
+ "limit": 20
185
+ }
186
+ }
187
+ ```
188
+
189
+ ## Parameters
190
+
191
+ | Parameter | Type | Default | Description |
192
+ |-----------|------|---------|-------------|
193
+ | `url` | string | required | Starting URL to crawl |
194
+ | `limit` | number | 10 | Maximum pages to crawl (max: 100,000) |
195
+ | `depth` | number | 1 | Maximum link depth from starting URL |
196
+ | `includeSubdomains` | boolean | false | Follow links to subdomains |
197
+ | `includeExternalLinks` | boolean | false | Follow links to external domains |
198
+ | `includePatterns` | string[] | [] | Wildcard patterns to include |
199
+ | `excludePatterns` | string[] | [] | Wildcard patterns to exclude |
200
+ | `render` | boolean | true | Execute JavaScript (false = faster static fetch) |
201
+
202
+ ### Pattern Syntax
203
+
204
+ - `*` - Matches any characters except `/`
205
+ - `**` - Matches any characters including `/`
206
+
207
+ Examples:
208
+ - `https://example.com/docs/**` - All URLs under /docs
209
+ - `https://example.com/*.html` - All HTML files directly in root
210
+
211
+ ## Development
212
+
213
+ ### Commands
214
+
215
+ ```bash
216
+ npm install # Install dependencies
217
+ npm run typecheck # Type check with tsc
218
+ npm run lint # Lint with ESLint
219
+ npm run build # Build TypeScript
220
+ npm start # Run server
221
+ npm test # Run tests
222
+ npm run test:watch # Run tests in watch mode
223
+ ```
224
+
225
+ CI runs typecheck, lint, build and test.
226
+
227
+ ### Testing
228
+
229
+ The project includes comprehensive tests covering:
230
+
231
+ - Environment variable handling
232
+ - Crawl options building
233
+ - Result formatting (Markdown, HTML, JSON)
234
+ - Error handling
235
+ - API integration
236
+
237
+ Run tests:
238
+ ```bash
239
+ npm test
240
+ ```
241
+
242
+ ## Architecture
243
+
244
+ ```
245
+ src/
246
+ ├── index.ts # Main MCP server implementation
247
+
248
+ ├── API Layer
249
+ │ ├── initiateCrawl() # POST to /crawl endpoint
250
+ │ ├── waitForCrawl() # Poll for job completion
251
+ │ └── getCrawlResults() # Fetch final results
252
+
253
+ ├── Formatters
254
+ │ ├── formatMarkdownResult()
255
+ │ ├── formatHtmlResult()
256
+ │ └── formatJsonResult()
257
+
258
+ └── MCP Handlers
259
+ ├── ListToolsRequestSchema # Tool registration
260
+ └── CallToolRequestSchema # Tool execution
261
+ ```
262
+
263
+ ## Cloudflare Limits
264
+
265
+ - **Max crawl duration**: 7 days
266
+ - **Results available**: 14 days after completion
267
+ - **Max pages per job**: 100,000
268
+ - **Free plan**: 10 minutes of browser time per day
269
+
270
+ See [Cloudflare Browser Rendering Limits](https://developers.cloudflare.com/browser-rendering/limits/) for details.
271
+
272
+ ## Troubleshooting
273
+
274
+ ### Crawl returns no results
275
+
276
+ - Check `robots.txt` blocking (use `render: false` to bypass)
277
+ - Verify `includePatterns` match actual URLs
278
+ - Try increasing `depth` or disabling pattern filters
279
+
280
+ ### Job cancelled due to limits
281
+
282
+ - Upgrade to Workers Paid plan
283
+ - Use `render: false` for static content
284
+ - Reduce `limit` parameter
285
+
286
+ ### Authentication errors
287
+
288
+ - Verify API Token has Browser Rendering permissions
289
+ - Confirm Account ID is correct
290
+
291
+ ## License
292
+
293
+ MIT License - see [LICENSE](LICENSE) file.
294
+
295
+ ## Contributing
296
+
297
+ Contributions are welcome! Please read our contributing guidelines before submitting PRs at https://github.com/lukaszraczylo/cloudflare-crawl-mcp.
298
+
299
+ ## Support
300
+
301
+ - Open an issue at https://github.com/lukaszraczylo/cloudflare-crawl-mcp/issues
302
+ - Check Cloudflare's [Browser Rendering Docs](https://developers.cloudflare.com/browser-rendering/) for API details
@@ -0,0 +1 @@
1
+ export {};
package/dist/index.js ADDED
@@ -0,0 +1,339 @@
1
+ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
2
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
3
+ import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
4
// Root of the Cloudflare REST API; all Browser Rendering calls hang off this.
const API_BASE = "https://api.cloudflare.com/client/v4";
// Maximum attempts fetchWithRetry() makes for rate-limited requests.
const MAX_RETRIES = 3;
// Minimum spacing between consecutive REST requests (ms). 10s corresponds to
// the Free-plan limit of 6 requests per minute.
const RATE_LIMIT_DELAY_MS = 10000;
// Shared rate-limiter state, mutated by enforceRateLimit().
let lastRequestTime = 0; // timestamp (ms) of the most recent request
let requestCount = 0; // requests issued in the current one-minute window
let windowStart = Date.now(); // start timestamp of the current window
10
/**
 * Read a required environment variable.
 * An empty string is treated the same as unset: both are configuration errors.
 * @param {string} key - Name of the environment variable.
 * @returns {string} The variable's value.
 * @throws {Error} If the variable is missing or empty.
 */
function getEnv(key) {
  const value = process.env[key];
  if (value) {
    return value;
  }
  throw new Error(`Missing required environment variable: ${key}`);
}
17
/**
 * Cooperative client-side rate limiter for Cloudflare REST calls.
 *
 * Enforces at most CF_RATE_LIMIT requests per rolling one-minute window
 * (default 6/min, the Free-plan limit) and spreads requests evenly across
 * that window. Mutates the module-level counters lastRequestTime,
 * requestCount and windowStart.
 *
 * Fixes vs. previous version:
 * - Inter-request spacing is derived from CF_RATE_LIMIT (60s / limit)
 *   instead of a hard-coded 10s, so a Paid-plan setting of 600/min is no
 *   longer throttled down to 6/min. For the default of 6/min the spacing
 *   is still 10s, so default behavior is unchanged.
 * - The current time is re-read after awaiting the window rollover, so the
 *   spacing check no longer uses a stale timestamp.
 */
async function enforceRateLimit() {
  const windowDuration = 60000;
  const requestsPerMinute = Number.parseInt(process.env.CF_RATE_LIMIT || "6", 10);
  let now = Date.now();
  // Start a fresh window once the previous one has fully elapsed.
  if (now - windowStart >= windowDuration) {
    requestCount = 0;
    windowStart = now;
  }
  // Window exhausted: sleep until it rolls over, then reset.
  if (requestCount >= requestsPerMinute) {
    const waitTime = windowDuration - (now - windowStart);
    console.error(`Rate limit reached (${requestsPerMinute}/min). Waiting ${waitTime}ms...`);
    await new Promise((resolve) => setTimeout(resolve, waitTime));
    requestCount = 0;
    windowStart = Date.now();
    now = windowStart;
  }
  // Space requests evenly: 60s / limit between consecutive calls.
  const minSpacingMs = Math.ceil(windowDuration / requestsPerMinute);
  const timeSinceLastRequest = now - lastRequestTime;
  if (requestCount > 0 && timeSinceLastRequest < minSpacingMs) {
    await new Promise((resolve) => setTimeout(resolve, minSpacingMs - timeSinceLastRequest));
  }
  lastRequestTime = Date.now();
  requestCount++;
}
40
/**
 * Run an async operation, retrying with backoff when it fails because of
 * rate limiting (an error message containing "429" or "Rate limit").
 * Honors a "Retry-After: <seconds>" hint embedded in the error message;
 * otherwise falls back to exponential backoff (1s, 2s, 4s, ... capped at 30s).
 * Non-rate-limit errors are rethrown immediately.
 * @param {() => Promise<any>} fn - Operation to attempt.
 * @param {number} [retries] - Maximum attempts (defaults to MAX_RETRIES).
 */
async function fetchWithRetry(fn, retries = MAX_RETRIES) {
  let lastError = null;
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;
      const errorStr = error.message || "";
      const isRateLimit = errorStr.includes("429") || errorStr.includes("Rate limit");
      const isFinalAttempt = attempt === retries - 1;
      if (!isRateLimit || isFinalAttempt) {
        throw error;
      }
      // Prefer the server-supplied Retry-After hint when present.
      const retryAfterMatch = errorStr.match(/Retry-After[:\s]*(\d+)/i);
      let delay;
      if (retryAfterMatch) {
        delay = Number.parseInt(retryAfterMatch[1], 10) * 1000;
      } else {
        delay = Math.min(1000 * 2 ** attempt, 30000);
      }
      console.error(`Rate limited. Retrying in ${delay}ms...`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
  throw lastError;
}
64
/**
 * Start a Browser Rendering crawl job.
 * Applies the local rate limiter, POSTs the crawl options to the /crawl
 * endpoint, and retries on rate-limit errors via fetchWithRetry().
 * @param {string} accountId - Cloudflare account ID.
 * @param {string} apiToken - Bearer token for the API.
 * @param {object} options - Crawl options (url, limit, depth, formats, ...).
 * @returns {Promise<string>} The Cloudflare crawl job id.
 * @throws {Error} On an HTTP failure or an unsuccessful API envelope.
 */
async function initiateCrawl(accountId, apiToken, options) {
  await enforceRateLimit();
  return fetchWithRetry(async () => {
    // Defaults mirror the tool documentation: 10 pages, depth 1, markdown,
    // JavaScript rendering on, crawl all sources.
    const payload = {
      url: options.url,
      limit: options.limit ?? 10,
      depth: options.depth ?? 1,
      formats: options.formats ?? ["markdown"],
      render: options.render ?? true,
      maxAge: options.maxAge,
      source: options.source ?? "all",
      options: options.options ?? {},
    };
    const response = await fetch(`${API_BASE}/accounts/${accountId}/browser-rendering/crawl`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiToken}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(payload),
    });
    if (!response.ok) {
      const error = await response.text();
      const retryAfter = response.headers.get("Retry-After");
      // Embed Retry-After in the message so fetchWithRetry can honor it.
      const errorMsg = `Failed to initiate crawl: ${response.status} ${error}${retryAfter ? ` Retry-After: ${retryAfter}` : ""}`;
      throw new Error(errorMsg);
    }
    const data = await response.json();
    if (!data.success) {
      throw new Error(`Crawl initiation failed: ${JSON.stringify(data.errors)}`);
    }
    return data.result.id;
  });
}
97
/**
 * Poll a crawl job until it leaves the "running" state.
 * Checks the job status every `delayMs` ms (default 5s), up to `maxAttempts`
 * times (default 120, i.e. roughly 10 minutes of polling).
 * NOTE(review): polling bypasses enforceRateLimit()/fetchWithRetry(), yet
 * each status check counts against the REST rate limit — confirm intended.
 * @returns {Promise<object>} The job's result envelope (status, records, ...).
 * @throws {Error} On an HTTP error or when the polling budget is exhausted.
 */
async function waitForCrawl(accountId, apiToken, jobId, maxAttempts = 120, delayMs = 5000) {
  const statusUrl = `${API_BASE}/accounts/${accountId}/browser-rendering/crawl/${jobId}?limit=1`;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const response = await fetch(statusUrl, {
      headers: { Authorization: `Bearer ${apiToken}` },
    });
    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Failed to check crawl status: ${response.status} ${error}`);
    }
    const data = await response.json();
    if (data.result.status !== "running") {
      return data.result;
    }
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
  throw new Error("Crawl job did not complete within timeout");
}
117
/**
 * Translate raw MCP tool arguments into the request shape expected by
 * initiateCrawl(). Undefined fields are passed through untouched; the API
 * layer applies its own defaults.
 * @param {object} args - Tool arguments (url, limit, depth, render, ...).
 * @param {string[]} formats - Output formats to request from the API.
 * @returns {object} Crawl options for initiateCrawl().
 */
function buildCrawlOptions(args, formats) {
  const {
    url,
    limit,
    depth,
    render,
    includeExternalLinks,
    includeSubdomains,
    includePatterns,
    excludePatterns,
  } = args;
  return {
    url,
    limit,
    depth,
    formats,
    render,
    options: { includeExternalLinks, includeSubdomains, includePatterns, excludePatterns },
  };
}
132
/**
 * Render the completed crawl records as one Markdown document, prefixed
 * with a single-line crawl summary. Records that did not complete are
 * skipped but still counted in the total reported by the API.
 */
function formatMarkdownResult(result) {
  const completed = (result.records || []).filter((r) => r.status === "completed");
  const sections = [];
  for (const record of completed) {
    const title = record.metadata?.title || record.url;
    sections.push(`## ${title}\n\nURL: ${record.url}\n\n${record.markdown || ""}\n\n---\n`);
  }
  const content = sections.join("\n");
  return `Crawl completed: ${completed.length} of ${result.total} pages crawled successfully.\n\n${content}`;
}
143
/**
 * Render the completed crawl records as HTML <article> fragments preceded
 * by a summary line.
 * NOTE(review): titles and URLs are interpolated without HTML escaping;
 * output appears intended as plain text for the MCP client — confirm.
 */
function formatHtmlResult(result) {
  const completed = (result.records || []).filter((r) => r.status === "completed");
  const articles = [];
  for (const record of completed) {
    const title = record.metadata?.title || record.url;
    articles.push(`<article>\n <h2>${title}</h2>\n <p>Source: <a href="${record.url}">${record.url}</a></p>\n <div class="content">${record.html || ""}</div>\n</article>\n`);
  }
  return `Crawl completed: ${completed.length} of ${result.total} pages crawled successfully.\n\n${articles.join("\n")}`;
}
154
/**
 * Serialize the crawl outcome as pretty-printed JSON: a summary header
 * (total / completed / status) plus one entry per completed page.
 * Note: JSON.stringify drops fields whose value is undefined (e.g. a
 * missing title), so absent data simply disappears from the output.
 */
function formatJsonResult(result) {
  const completed = (result.records || []).filter((r) => r.status === "completed");
  const pages = completed.map(({ url, metadata, markdown, html, json }) => ({
    url,
    title: metadata?.title,
    status: metadata?.status,
    markdown,
    html,
    json,
  }));
  const summary = {
    total: result.total,
    completed: completed.length,
    status: result.status,
  };
  return JSON.stringify({ summary, pages }, null, 2);
}
174
/**
 * Build an MCP error response for a crawl job that ended in a terminal
 * failure state, mapping the known terminal statuses to friendly messages.
 *
 * Fix: the previous plain-object lookup (`errorMessages[result.status]`)
 * could resolve inherited Object.prototype keys (e.g. a status string of
 * "toString" would yield a function instead of the fallback message). A
 * switch statement only matches the intended statuses.
 *
 * @param {object} result - Job result envelope; only `status` is read.
 * @param {string} jobId - Cloudflare crawl job id, included for debugging.
 * @returns {{content: Array<{type: string, text: string}>, isError: boolean}}
 */
function handleErrorResult(result, jobId) {
  let message;
  switch (result.status) {
    case "errored":
      message = `Crawl job errored. Job ID: ${jobId}`;
      break;
    case "cancelled_due_to_timeout":
      message = `Crawl job cancelled due to timeout (7 days max). Job ID: ${jobId}`;
      break;
    case "cancelled_due_to_limits":
      message = `Crawl job cancelled due to account limits. Job ID: ${jobId}`;
      break;
    case "cancelled_by_user":
      message = `Crawl job was cancelled by user. Job ID: ${jobId}`;
      break;
    default:
      message = `Crawl job failed with status: ${result.status}. Job ID: ${jobId}`;
  }
  return {
    content: [{ type: "text", text: message }],
    isError: true,
  };
}
187
// MCP server instance exposing only tool capabilities.
// NOTE(review): version "1.0.0" does not match the published package
// version (0.3.1) — confirm which is authoritative.
const server = new Server(
  {
    name: "cloudflare-crawl-mcp",
    version: "1.0.0",
  },
  {
    capabilities: {
      tools: {},
    },
  }
);
195
// JSON Schema shared by all three crawl tools; only `url` is required and
// every other knob has a documented server-side default.
const baseToolSchema = {
  type: "object",
  properties: {
    url: {
      type: "string",
      description: "The starting URL to crawl",
    },
    limit: {
      type: "number",
      description: "Maximum number of pages to crawl (default: 10, max: 100000)",
    },
    depth: {
      type: "number",
      description: "Maximum link depth to crawl from the starting URL (default: 1)",
    },
    includeSubdomains: {
      type: "boolean",
      description: "If true, follows links to subdomains of the starting URL (default: false)",
    },
    includeExternalLinks: {
      type: "boolean",
      description: "If true, follows links to external domains (default: false)",
    },
    includePatterns: {
      type: "array",
      items: { type: "string" },
      description: "Only visits URLs that match one of these wildcard patterns",
    },
    excludePatterns: {
      type: "array",
      items: { type: "string" },
      description: "Does not visit URLs that match any of these wildcard patterns",
    },
    render: {
      type: "boolean",
      description: "If false, does a fast HTML fetch without executing JavaScript (default: true)",
    },
  },
  required: ["url"],
};
// Plan-limit reference appended to every tool description so LLM clients
// can reason about quota before issuing a crawl.
const RATE_LIMIT_INFO = `
---
**Cloudflare Browser Rendering Limits:**

| Plan | Concurrent Browsers | Browser Time | REST API Rate |
|------|---------------------|--------------|---------------|
| Free | 3 | 10 min/day | 6 req/min |
| Paid | 10 | 10 hours/month | 600 req/min |

**Environment Variables:**
- CF_RATE_LIMIT: Override REST API requests per minute (default: 6 for Free, 600 for Paid)

**Tips:**
- Use \`render: false\` for static content to avoid browser time usage
- Use \`maxAge\` to cache results and reduce API calls
- Set \`limit\` and \`depth\` appropriately to stay within limits
---`;
252
// Tool registration: three crawl tools that differ only in output format.
server.setRequestHandler(ListToolsRequestSchema, async () => ({
  tools: [
    {
      name: "crawl_url_markdown",
      description: `Crawl a website using Cloudflare Browser Rendering and return content in Markdown format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
      // Spreading baseToolSchema and then re-spreading its properties
      // produces an equivalent schema object (kept for parity with the
      // original shape; the inner spread adds nothing).
      inputSchema: {
        ...baseToolSchema,
        properties: {
          ...baseToolSchema.properties,
        },
      },
    },
    {
      name: "crawl_url_html",
      description: `Crawl a website using Cloudflare Browser Rendering and return content in HTML format. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
      inputSchema: baseToolSchema,
    },
    {
      name: "crawl_url_json",
      description: `Crawl a website using Cloudflare Browser Rendering and return content in JSON format. This uses Workers AI for data extraction. Supports following links across the site up to a configurable depth or page limit.${RATE_LIMIT_INFO}`,
      inputSchema: baseToolSchema,
    },
  ],
}));
278
// Tool execution: validate the tool name, start a crawl, wait for it to
// finish, then format the result in the requested output format.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: args } = request.params;
  const toolMatch = name.match(/^crawl_url_(markdown|html|json)$/);
  if (!toolMatch) {
    return {
      content: [{ type: "text", text: `Unknown tool: ${name}` }],
      isError: true,
    };
  }
  const format = toolMatch[1];
  const formatMap = {
    markdown: ["markdown"],
    html: ["html"],
    json: ["json"],
  };
  const formatterMap = {
    markdown: formatMarkdownResult,
    html: formatHtmlResult,
    json: formatJsonResult,
  };
  try {
    const apiToken = getEnv("CF_API_TOKEN");
    const accountId = getEnv("CF_ACCOUNT_ID");
    // Whitelist only the supported arguments before building the request.
    const {
      url,
      limit,
      depth,
      includeSubdomains,
      includeExternalLinks,
      includePatterns,
      excludePatterns,
      render,
    } = args;
    const crawlArgs = {
      url,
      limit,
      depth,
      includeSubdomains,
      includeExternalLinks,
      includePatterns,
      excludePatterns,
      render,
    };
    const options = buildCrawlOptions(crawlArgs, formatMap[format]);
    const jobId = await initiateCrawl(accountId, apiToken, options);
    const result = await waitForCrawl(accountId, apiToken, jobId);
    // Terminal failure states are surfaced as MCP errors with the job id.
    const terminalStatuses = [
      "errored",
      "cancelled_due_to_timeout",
      "cancelled_due_to_limits",
      "cancelled_by_user",
    ];
    if (terminalStatuses.includes(result.status)) {
      return handleErrorResult(result, jobId);
    }
    return {
      content: [{ type: "text", text: formatterMap[format](result) }],
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return {
      content: [{ type: "text", text: `Error: ${message}` }],
      isError: true,
    };
  }
});
332
/** Connect the MCP server over stdio and start serving requests. */
async function main() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
}

// Entry point: log fatal startup errors and exit non-zero so supervising
// processes can detect the failure.
main().catch((error) => {
  console.error("Server error:", error);
  process.exit(1);
});
@@ -0,0 +1 @@
1
+ export {};