@zenrows/mcp 1.0.0 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -22
- package/dist/http.d.ts +18 -0
- package/dist/http.js +61 -0
- package/dist/index.js +5 -198
- package/dist/server.d.ts +2 -0
- package/dist/server.js +199 -0
- package/package.json +17 -2
package/README.md
CHANGED
|
@@ -10,16 +10,23 @@
|
|
|
10
10
|
Give any MCP-compatible AI assistant the ability to scrape any webpage — including JavaScript-rendered content and anti-bot protected sites.
|
|
11
11
|
</p>
|
|
12
12
|
|
|
13
|
+
<p align="center">
|
|
14
|
+
<a href="https://www.npmjs.com/package/@zenrows/mcp"><img src="https://img.shields.io/npm/v/@zenrows/mcp" alt="npm version"></a>
|
|
15
|
+
<a href="LICENSE"><img src="https://img.shields.io/badge/license-MIT-blue" alt="MIT License"></a>
|
|
16
|
+
</p>
|
|
17
|
+
|
|
13
18
|
---
|
|
14
19
|
|
|
15
20
|
## Quick Start
|
|
16
21
|
|
|
17
22
|
**Claude Code**
|
|
23
|
+
|
|
18
24
|
```bash
|
|
19
25
|
claude mcp add zenrows -e ZENROWS_API_KEY=YOUR_API_KEY -- npx -y @zenrows/mcp
|
|
20
26
|
```
|
|
21
27
|
|
|
22
28
|
Or ask your AI assistant naturally once configured:
|
|
29
|
+
|
|
23
30
|
```
|
|
24
31
|
Scrape https://example.com and summarize the content.
|
|
25
32
|
```
|
|
@@ -32,22 +39,22 @@ Scrape https://example.com and summarize the content.
|
|
|
32
39
|
|
|
33
40
|
Fetches a webpage and returns its content as clean markdown (default), plaintext, raw HTML, PDF, structured JSON, or a screenshot. See the [ZenRows API docs](https://docs.zenrows.com/universal-scraper-api/api-reference#parameter-overview) for full parameter reference.
|
|
34
41
|
|
|
35
|
-
| Parameter
|
|
36
|
-
|
|
37
|
-
| `url`
|
|
38
|
-
| `js_render`
|
|
39
|
-
| `premium_proxy`
|
|
40
|
-
| `proxy_country`
|
|
41
|
-
| `response_type`
|
|
42
|
-
| `autoparse`
|
|
43
|
-
| `css_extractor`
|
|
44
|
-
| `outputs`
|
|
45
|
-
| `screenshot`
|
|
46
|
-
| `screenshot_fullpage` | boolean
|
|
47
|
-
| `screenshot_selector` | string
|
|
48
|
-
| `wait_for`
|
|
49
|
-
| `wait`
|
|
50
|
-
| `js_instructions`
|
|
42
|
+
| Parameter | Type | Default | Description |
|
|
43
|
+
| --------------------- | -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
44
|
+
| `url` | string | **required** | Webpage URL to scrape |
|
|
45
|
+
| `js_render` | boolean | `false` | Enable JS rendering for SPAs and dynamic content |
|
|
46
|
+
| `premium_proxy` | boolean | `false` | Use residential proxies to bypass anti-bot systems |
|
|
47
|
+
| `proxy_country` | string | — | ISO 3166-1 alpha-2 country code (e.g. `US`, `GB`). Requires `premium_proxy` |
|
|
48
|
+
| `response_type` | `markdown` \| `plaintext` \| `pdf` \| `html` | `markdown` | Output format. `html` returns raw source (ZenRows default when no param is sent). Ignored when `autoparse`, `css_extractor`, `outputs`, or screenshot params are set |
|
|
49
|
+
| `autoparse` | boolean | — | Auto-extract structured JSON from the page |
|
|
50
|
+
| `css_extractor` | string | — | JSON map of CSS selectors: `{"title":"h1","price":".price"}` |
|
|
51
|
+
| `outputs` | string | — | Comma-separated data types to extract as JSON: `emails`, `headings`, `links`, `menus`, `images`, `videos`, `audios`. Use `*` for all |
|
|
52
|
+
| `screenshot` | boolean | — | Capture an above-the-fold screenshot. Returns an image |
|
|
53
|
+
| `screenshot_fullpage` | boolean | — | Capture a full-page screenshot. Returns an image |
|
|
54
|
+
| `screenshot_selector` | string | — | Capture a screenshot of a specific element via CSS selector |
|
|
55
|
+
| `wait_for` | string | — | CSS selector to wait for before capturing. Requires `js_render` |
|
|
56
|
+
| `wait` | number | — | Milliseconds to wait after load (max 30000). Requires `js_render` |
|
|
57
|
+
| `js_instructions` | string | — | JSON array of browser actions. Requires `js_render` |
|
|
51
58
|
|
|
52
59
|
---
|
|
53
60
|
|
|
@@ -240,9 +247,3 @@ npm run dev # run with .env loaded (requires Node 20.6+)
|
|
|
240
247
|
npm run build # compile to dist/
|
|
241
248
|
npm run inspect # open MCP inspector UI
|
|
242
249
|
```
|
|
243
|
-
|
|
244
|
-
---
|
|
245
|
-
|
|
246
|
-
## License
|
|
247
|
-
|
|
248
|
-
MIT
|
package/dist/http.d.ts
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
interface LambdaEvent {
|
|
2
|
+
requestContext: {
|
|
3
|
+
http: {
|
|
4
|
+
method: string;
|
|
5
|
+
};
|
|
6
|
+
};
|
|
7
|
+
rawPath: string;
|
|
8
|
+
rawQueryString?: string;
|
|
9
|
+
headers: Record<string, string>;
|
|
10
|
+
body?: string;
|
|
11
|
+
isBase64Encoded?: boolean;
|
|
12
|
+
}
|
|
13
|
+
export declare const handler: (event: LambdaEvent) => Promise<{
|
|
14
|
+
statusCode: number;
|
|
15
|
+
headers: Record<string, string>;
|
|
16
|
+
body: string;
|
|
17
|
+
}>;
|
|
18
|
+
export {};
|
package/dist/http.js
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { Hono } from "hono";
|
|
2
|
+
import { serve } from "@hono/node-server";
|
|
3
|
+
import { WebStandardStreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js";
|
|
4
|
+
import { createServer } from "./server.js";
|
|
5
|
+
// ─── app ──────────────────────────────────────────────────────────────────────
|
|
6
|
+
const app = new Hono();
|
|
7
|
+
function extractApiKey(req) {
|
|
8
|
+
const auth = req.headers.get("authorization");
|
|
9
|
+
if (auth) {
|
|
10
|
+
const m = auth.match(/^Bearer\s+(.+)$/i);
|
|
11
|
+
if (m)
|
|
12
|
+
return m[1];
|
|
13
|
+
}
|
|
14
|
+
return new URL(req.url).searchParams.get("apikey") ?? undefined;
|
|
15
|
+
}
|
|
16
|
+
app.all("/mcp", async (c) => {
|
|
17
|
+
const apiKey = extractApiKey(c.req.raw);
|
|
18
|
+
if (!apiKey) {
|
|
19
|
+
return c.json({
|
|
20
|
+
error: "Missing API key. Use Authorization: Bearer <key> header or ?apikey=<key> query param.",
|
|
21
|
+
}, 401);
|
|
22
|
+
}
|
|
23
|
+
const transport = new WebStandardStreamableHTTPServerTransport({
|
|
24
|
+
sessionIdGenerator: undefined, // stateless — no session tracking between requests
|
|
25
|
+
enableJsonResponse: true, // return JSON instead of SSE (simpler for Lambda + most MCP clients)
|
|
26
|
+
});
|
|
27
|
+
const server = createServer(apiKey);
|
|
28
|
+
await server.connect(transport);
|
|
29
|
+
const response = await transport.handleRequest(c.req.raw);
|
|
30
|
+
await transport.close();
|
|
31
|
+
return response;
|
|
32
|
+
});
|
|
33
|
+
app.get("/health", (c) => c.json({ ok: true }));
|
|
34
|
+
export const handler = async (event) => {
|
|
35
|
+
const qs = event.rawQueryString ? `?${event.rawQueryString}` : "";
|
|
36
|
+
const url = `https://mcp.zenrows.com${event.rawPath}${qs}`;
|
|
37
|
+
const method = event.requestContext.http.method;
|
|
38
|
+
let body;
|
|
39
|
+
if (event.body && method !== "GET" && method !== "HEAD") {
|
|
40
|
+
body = event.isBase64Encoded ? Buffer.from(event.body, "base64") : event.body;
|
|
41
|
+
}
|
|
42
|
+
const request = new Request(url, {
|
|
43
|
+
method,
|
|
44
|
+
headers: new Headers(event.headers),
|
|
45
|
+
body,
|
|
46
|
+
});
|
|
47
|
+
const response = await app.fetch(request);
|
|
48
|
+
const responseBody = await response.text();
|
|
49
|
+
const headers = {};
|
|
50
|
+
response.headers.forEach((v, k) => {
|
|
51
|
+
headers[k] = v;
|
|
52
|
+
});
|
|
53
|
+
return { statusCode: response.status, headers, body: responseBody };
|
|
54
|
+
};
|
|
55
|
+
// ─── Local dev ────────────────────────────────────────────────────────────────
|
|
56
|
+
if (!process.env.AWS_LAMBDA_FUNCTION_NAME) {
|
|
57
|
+
const port = Number(process.env.PORT) || 3000;
|
|
58
|
+
serve({ fetch: app.fetch, port }, () => {
|
|
59
|
+
process.stderr.write(`ZenRows MCP HTTP server listening on http://localhost:${port}/mcp\n`);
|
|
60
|
+
});
|
|
61
|
+
}
|
package/dist/index.js
CHANGED
|
@@ -1,205 +1,12 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { createRequire } from "module";
|
|
3
|
-
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
4
2
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
5
|
-
import { z } from "zod";
|
|
6
|
-
const require = createRequire(import.meta.url);
|
|
7
|
-
const pkg = require("../package.json");
|
|
8
|
-
const ZENROWS_API_URL = "https://api.zenrows.com/v1/";
|
|
3
|
+
import { createServer } from "./server.js";
|
|
9
4
|
const apiKey = process.env.ZENROWS_API_KEY;
|
|
10
5
|
if (!apiKey) {
|
|
11
6
|
process.stderr.write("Error: ZENROWS_API_KEY environment variable is required\n");
|
|
12
7
|
process.exit(1);
|
|
13
8
|
}
|
|
14
|
-
const server = new McpServer({
|
|
15
|
-
name: "zenrows",
|
|
16
|
-
version: pkg.version,
|
|
17
|
-
});
|
|
18
|
-
// ─── scrape ──────────────────────────────────────────────────────────────────
|
|
19
|
-
server.registerTool("scrape", {
|
|
20
|
-
description: `Scrape any webpage and return its content using ZenRows.
|
|
21
|
-
|
|
22
|
-
Use this tool to fetch webpage content for analysis. By default it returns clean
|
|
23
|
-
markdown, which is ideal for LLM processing.
|
|
24
|
-
|
|
25
|
-
When to enable options:
|
|
26
|
-
- js_render: page uses React/Vue/Angular, loads content dynamically, or content
|
|
27
|
-
appears missing on the first attempt
|
|
28
|
-
- premium_proxy: site returns 403/blocked errors even with js_render enabled
|
|
29
|
-
- wait_for: specific content loads after initial render (requires js_render)
|
|
30
|
-
- css_extractor: you only need specific elements, not the whole page
|
|
31
|
-
- autoparse: structured data pages like products or articles
|
|
32
|
-
|
|
33
|
-
Examples:
|
|
34
|
-
Basic: { url: "https://example.com" }
|
|
35
|
-
Dynamic: { url: "https://spa.com", js_render: true }
|
|
36
|
-
Protected:{ url: "https://protected.com", js_render: true, premium_proxy: true }
|
|
37
|
-
Extract: { url: "https://shop.com", css_extractor: '{"title":"h1","price":".price"}' }`,
|
|
38
|
-
inputSchema: {
|
|
39
|
-
url: z
|
|
40
|
-
.string()
|
|
41
|
-
.url()
|
|
42
|
-
.describe("The webpage URL to scrape"),
|
|
43
|
-
js_render: z
|
|
44
|
-
.boolean()
|
|
45
|
-
.optional()
|
|
46
|
-
.default(false)
|
|
47
|
-
.describe("Enable JavaScript rendering via headless browser. Required for SPAs " +
|
|
48
|
-
"(React, Vue, Angular) and pages that load content dynamically."),
|
|
49
|
-
premium_proxy: z
|
|
50
|
-
.boolean()
|
|
51
|
-
.optional()
|
|
52
|
-
.default(false)
|
|
53
|
-
.describe("Use premium residential proxies to bypass anti-bot protection. " +
|
|
54
|
-
"Required for heavily protected sites. Implies higher credit cost."),
|
|
55
|
-
proxy_country: z
|
|
56
|
-
.string()
|
|
57
|
-
.optional()
|
|
58
|
-
.describe("Country for geo-targeted scraping. ISO 3166-1 alpha-2 code (e.g. 'US', 'GB', 'DE'). " +
|
|
59
|
-
"Requires premium_proxy=true."),
|
|
60
|
-
response_type: z
|
|
61
|
-
.enum(["markdown", "plaintext", "pdf", "html"])
|
|
62
|
-
.optional()
|
|
63
|
-
.default("markdown")
|
|
64
|
-
.describe("Output format. 'markdown' (default) preserves structure and is ideal for LLMs. " +
|
|
65
|
-
"'plaintext' strips all formatting for pure text extraction. " +
|
|
66
|
-
"'pdf' returns a PDF of the page. " +
|
|
67
|
-
"'html' returns the raw HTML source (omits the response_type param; ZenRows default). " +
|
|
68
|
-
"Ignored when autoparse, css_extractor, outputs, or screenshot params are set."),
|
|
69
|
-
autoparse: z
|
|
70
|
-
.boolean()
|
|
71
|
-
.optional()
|
|
72
|
-
.describe("Automatically extract structured data from the page into JSON. " +
|
|
73
|
-
"Best for product pages, articles, and listings."),
|
|
74
|
-
css_extractor: z
|
|
75
|
-
.string()
|
|
76
|
-
.optional()
|
|
77
|
-
.describe("Extract specific elements using CSS selectors. " +
|
|
78
|
-
'JSON object mapping names to selectors, e.g. \'{"title":"h1","price":".price-tag"}\'. ' +
|
|
79
|
-
"Returns JSON instead of full page content."),
|
|
80
|
-
wait_for: z
|
|
81
|
-
.string()
|
|
82
|
-
.optional()
|
|
83
|
-
.describe("CSS selector to wait for before capturing. Use when key content loads " +
|
|
84
|
-
"after the initial page render. Requires js_render=true."),
|
|
85
|
-
wait: z
|
|
86
|
-
.number()
|
|
87
|
-
.int()
|
|
88
|
-
.min(0)
|
|
89
|
-
.max(30000)
|
|
90
|
-
.optional()
|
|
91
|
-
.describe("Milliseconds to wait after page load before capturing content. " +
|
|
92
|
-
"Max 30000 (30s). Requires js_render=true."),
|
|
93
|
-
js_instructions: z
|
|
94
|
-
.string()
|
|
95
|
-
.optional()
|
|
96
|
-
.describe("JSON array of browser interactions to run before scraping. Requires js_render=true. " +
|
|
97
|
-
'Example: [{"click":"#load-more"},{"wait":1000},{"wait_for":".results"}]'),
|
|
98
|
-
outputs: z
|
|
99
|
-
.string()
|
|
100
|
-
.optional()
|
|
101
|
-
.describe("Comma-separated list of data types to extract as structured JSON. " +
|
|
102
|
-
"Available: emails, headings, links, menus, images, videos, audios. " +
|
|
103
|
-
"Use '*' for all types. Returns JSON instead of full page content."),
|
|
104
|
-
screenshot: z
|
|
105
|
-
.boolean()
|
|
106
|
-
.optional()
|
|
107
|
-
.describe("Capture an above-the-fold screenshot of the page. " +
|
|
108
|
-
"Returns an image instead of text content. Useful for visual verification or debugging."),
|
|
109
|
-
screenshot_fullpage: z
|
|
110
|
-
.boolean()
|
|
111
|
-
.optional()
|
|
112
|
-
.describe("Capture a full-page screenshot including content below the fold. " +
|
|
113
|
-
"Returns an image instead of text content."),
|
|
114
|
-
screenshot_selector: z
|
|
115
|
-
.string()
|
|
116
|
-
.optional()
|
|
117
|
-
.describe("Capture a screenshot of a specific element using a CSS selector. " +
|
|
118
|
-
'Example: ".product-card". Returns an image instead of text content.'),
|
|
119
|
-
},
|
|
120
|
-
}, async (params) => {
|
|
121
|
-
const searchParams = new URLSearchParams({
|
|
122
|
-
apikey: apiKey,
|
|
123
|
-
url: params.url,
|
|
124
|
-
});
|
|
125
|
-
if (params.js_render || params.screenshot || params.screenshot_fullpage || params.screenshot_selector)
|
|
126
|
-
searchParams.set("js_render", "true");
|
|
127
|
-
if (params.premium_proxy)
|
|
128
|
-
searchParams.set("premium_proxy", "true");
|
|
129
|
-
if (params.proxy_country)
|
|
130
|
-
searchParams.set("proxy_country", params.proxy_country.toUpperCase());
|
|
131
|
-
if (params.autoparse)
|
|
132
|
-
searchParams.set("autoparse", "true");
|
|
133
|
-
if (params.css_extractor)
|
|
134
|
-
searchParams.set("css_extractor", params.css_extractor);
|
|
135
|
-
if (params.wait_for)
|
|
136
|
-
searchParams.set("wait_for", params.wait_for);
|
|
137
|
-
if (params.wait != null)
|
|
138
|
-
searchParams.set("wait", String(params.wait));
|
|
139
|
-
if (params.js_instructions)
|
|
140
|
-
searchParams.set("js_instructions", params.js_instructions);
|
|
141
|
-
if (params.outputs)
|
|
142
|
-
searchParams.set("outputs", params.outputs);
|
|
143
|
-
if (params.screenshot || params.screenshot_fullpage || params.screenshot_selector)
|
|
144
|
-
searchParams.set("screenshot", "true");
|
|
145
|
-
if (params.screenshot_fullpage)
|
|
146
|
-
searchParams.set("screenshot_fullpage", "true");
|
|
147
|
-
if (params.screenshot_selector)
|
|
148
|
-
searchParams.set("screenshot_selector", params.screenshot_selector);
|
|
149
|
-
// response_type is mutually exclusive with autoparse, css_extractor, outputs, and screenshot params.
|
|
150
|
-
// 'html' is the ZenRows default (no param); all other values are passed through.
|
|
151
|
-
const isScreenshot = params.screenshot || params.screenshot_fullpage || params.screenshot_selector;
|
|
152
|
-
const effectiveType = params.response_type ?? "markdown";
|
|
153
|
-
if (!params.autoparse && !params.css_extractor && !params.outputs && !isScreenshot && effectiveType !== "html") {
|
|
154
|
-
searchParams.set("response_type", effectiveType);
|
|
155
|
-
}
|
|
156
|
-
let response;
|
|
157
|
-
try {
|
|
158
|
-
response = await fetch(`${ZENROWS_API_URL}?${searchParams}`, {
|
|
159
|
-
headers: { "User-Agent": `zenrows/mcp ${pkg.version}` },
|
|
160
|
-
});
|
|
161
|
-
}
|
|
162
|
-
catch (err) {
|
|
163
|
-
return {
|
|
164
|
-
content: [
|
|
165
|
-
{
|
|
166
|
-
type: "text",
|
|
167
|
-
text: `Network error contacting ZenRows: ${err instanceof Error ? err.message : String(err)}`,
|
|
168
|
-
},
|
|
169
|
-
],
|
|
170
|
-
isError: true,
|
|
171
|
-
};
|
|
172
|
-
}
|
|
173
|
-
if (!response.ok) {
|
|
174
|
-
const body = await response.text();
|
|
175
|
-
return {
|
|
176
|
-
content: [{ type: "text", text: `ZenRows error ${response.status}: ${body}` }],
|
|
177
|
-
isError: true,
|
|
178
|
-
};
|
|
179
|
-
}
|
|
180
|
-
const contentType = response.headers.get("content-type") ?? "";
|
|
181
|
-
const buffer = await response.arrayBuffer();
|
|
182
|
-
const bytes = new Uint8Array(buffer);
|
|
183
|
-
const isPng = bytes[0] === 0x89 && bytes[1] === 0x50 && bytes[2] === 0x4e && bytes[3] === 0x47;
|
|
184
|
-
const isJpeg = bytes[0] === 0xff && bytes[1] === 0xd8;
|
|
185
|
-
if (contentType.startsWith("image/") || isPng || isJpeg) {
|
|
186
|
-
const mimeType = isPng ? "image/png" : isJpeg ? "image/jpeg" : contentType.split(";")[0].trim();
|
|
187
|
-
const base64 = Buffer.from(buffer).toString("base64");
|
|
188
|
-
return {
|
|
189
|
-
content: [{ type: "image", data: base64, mimeType }],
|
|
190
|
-
};
|
|
191
|
-
}
|
|
192
|
-
return {
|
|
193
|
-
content: [{ type: "text", text: new TextDecoder().decode(buffer) }],
|
|
194
|
-
};
|
|
195
|
-
});
|
|
196
|
-
// ─── boot ─────────────────────────────────────────────────────────────────────
|
|
197
|
-
async function main() {
|
|
198
|
-
const transport = new StdioServerTransport();
|
|
199
|
-
await server.connect(transport);
|
|
200
|
-
console.error("ZenRows MCP server running on stdio");
|
|
201
|
-
}
|
|
202
|
-
main().catch((error) => {
|
|
203
|
-
console.error("Fatal error in main():", error);
|
|
204
|
-
process.exit(1);
|
|
205
|
-
});
|
|
9
|
+
const server = createServer(apiKey);
|
|
10
|
+
const transport = new StdioServerTransport();
|
|
11
|
+
await server.connect(transport);
|
|
12
|
+
process.stderr.write("ZenRows MCP server running on stdio\n");
|
package/dist/server.d.ts
ADDED
package/dist/server.js
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
import { createRequire } from "module";
|
|
2
|
+
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
3
|
+
import { z } from "zod";
|
|
4
|
+
const require = createRequire(import.meta.url);
|
|
5
|
+
const pkg = require("../package.json");
|
|
6
|
+
const ZENROWS_API_URL = "https://api.zenrows.com/v1/";
|
|
7
|
+
export function createServer(apiKey) {
|
|
8
|
+
const server = new McpServer({
|
|
9
|
+
name: "zenrows",
|
|
10
|
+
version: pkg.version,
|
|
11
|
+
});
|
|
12
|
+
// ─── scrape ────────────────────────────────────────────────────────────────
|
|
13
|
+
server.registerTool("scrape", {
|
|
14
|
+
description: `Scrape any webpage and return its content using ZenRows.
|
|
15
|
+
|
|
16
|
+
Use this tool to fetch webpage content for analysis. By default it returns clean
|
|
17
|
+
markdown, which is ideal for LLM processing.
|
|
18
|
+
|
|
19
|
+
When to enable options:
|
|
20
|
+
- js_render: page uses React/Vue/Angular, loads content dynamically, or content
|
|
21
|
+
appears missing on the first attempt
|
|
22
|
+
- premium_proxy: site returns 403/blocked errors even with js_render enabled
|
|
23
|
+
- wait_for: specific content loads after initial render (requires js_render)
|
|
24
|
+
- css_extractor: you only need specific elements, not the whole page
|
|
25
|
+
- autoparse: structured data pages like products or articles
|
|
26
|
+
|
|
27
|
+
Examples:
|
|
28
|
+
Basic: { url: "https://example.com" }
|
|
29
|
+
Dynamic: { url: "https://spa.com", js_render: true }
|
|
30
|
+
Protected:{ url: "https://protected.com", js_render: true, premium_proxy: true }
|
|
31
|
+
Extract: { url: "https://shop.com", css_extractor: '{"title":"h1","price":".price"}' }`,
|
|
32
|
+
inputSchema: {
|
|
33
|
+
url: z.string().url().describe("The webpage URL to scrape"),
|
|
34
|
+
js_render: z
|
|
35
|
+
.boolean()
|
|
36
|
+
.optional()
|
|
37
|
+
.default(false)
|
|
38
|
+
.describe("Enable JavaScript rendering via headless browser. Required for SPAs " +
|
|
39
|
+
"(React, Vue, Angular) and pages that load content dynamically."),
|
|
40
|
+
premium_proxy: z
|
|
41
|
+
.boolean()
|
|
42
|
+
.optional()
|
|
43
|
+
.default(false)
|
|
44
|
+
.describe("Use premium residential proxies to bypass anti-bot protection. " +
|
|
45
|
+
"Required for heavily protected sites. Implies higher credit cost."),
|
|
46
|
+
proxy_country: z
|
|
47
|
+
.string()
|
|
48
|
+
.optional()
|
|
49
|
+
.describe("Country for geo-targeted scraping. ISO 3166-1 alpha-2 code (e.g. 'US', 'GB', 'DE'). " +
|
|
50
|
+
"Requires premium_proxy=true."),
|
|
51
|
+
response_type: z
|
|
52
|
+
.enum(["markdown", "plaintext", "pdf", "html"])
|
|
53
|
+
.optional()
|
|
54
|
+
.default("markdown")
|
|
55
|
+
.describe("Output format. 'markdown' (default) preserves structure and is ideal for LLMs. " +
|
|
56
|
+
"'plaintext' strips all formatting for pure text extraction. " +
|
|
57
|
+
"'pdf' returns a PDF of the page. " +
|
|
58
|
+
"'html' returns the raw HTML source (omits the response_type param; ZenRows default). " +
|
|
59
|
+
"Ignored when autoparse, css_extractor, outputs, or screenshot params are set."),
|
|
60
|
+
autoparse: z
|
|
61
|
+
.boolean()
|
|
62
|
+
.optional()
|
|
63
|
+
.describe("Automatically extract structured data from the page into JSON. " +
|
|
64
|
+
"Best for product pages, articles, and listings."),
|
|
65
|
+
css_extractor: z
|
|
66
|
+
.string()
|
|
67
|
+
.optional()
|
|
68
|
+
.describe("Extract specific elements using CSS selectors. " +
|
|
69
|
+
'JSON object mapping names to selectors, e.g. \'{"title":"h1","price":".price-tag"}\'. ' +
|
|
70
|
+
"Returns JSON instead of full page content."),
|
|
71
|
+
wait_for: z
|
|
72
|
+
.string()
|
|
73
|
+
.optional()
|
|
74
|
+
.describe("CSS selector to wait for before capturing. Use when key content loads " +
|
|
75
|
+
"after the initial page render. Requires js_render=true."),
|
|
76
|
+
wait: z
|
|
77
|
+
.number()
|
|
78
|
+
.int()
|
|
79
|
+
.min(0)
|
|
80
|
+
.max(30000)
|
|
81
|
+
.optional()
|
|
82
|
+
.describe("Milliseconds to wait after page load before capturing content. " +
|
|
83
|
+
"Max 30000 (30s). Requires js_render=true."),
|
|
84
|
+
js_instructions: z
|
|
85
|
+
.string()
|
|
86
|
+
.optional()
|
|
87
|
+
.describe("JSON array of browser interactions to run before scraping. Requires js_render=true. " +
|
|
88
|
+
'Example: [{"click":"#load-more"},{"wait":1000},{"wait_for":".results"}]'),
|
|
89
|
+
outputs: z
|
|
90
|
+
.string()
|
|
91
|
+
.optional()
|
|
92
|
+
.describe("Comma-separated list of data types to extract as structured JSON. " +
|
|
93
|
+
"Available: emails, headings, links, menus, images, videos, audios. " +
|
|
94
|
+
"Use '*' for all types. Returns JSON instead of full page content."),
|
|
95
|
+
screenshot: z
|
|
96
|
+
.boolean()
|
|
97
|
+
.optional()
|
|
98
|
+
.describe("Capture an above-the-fold screenshot of the page. " +
|
|
99
|
+
"Returns an image instead of text content. Useful for visual verification or debugging."),
|
|
100
|
+
screenshot_fullpage: z
|
|
101
|
+
.boolean()
|
|
102
|
+
.optional()
|
|
103
|
+
.describe("Capture a full-page screenshot including content below the fold. " +
|
|
104
|
+
"Returns an image instead of text content."),
|
|
105
|
+
screenshot_selector: z
|
|
106
|
+
.string()
|
|
107
|
+
.optional()
|
|
108
|
+
.describe("Capture a screenshot of a specific element using a CSS selector. " +
|
|
109
|
+
'Example: ".product-card". Returns an image instead of text content.'),
|
|
110
|
+
},
|
|
111
|
+
}, async (params) => {
|
|
112
|
+
const searchParams = new URLSearchParams({
|
|
113
|
+
apikey: apiKey,
|
|
114
|
+
url: params.url,
|
|
115
|
+
});
|
|
116
|
+
if (params.js_render ||
|
|
117
|
+
params.screenshot ||
|
|
118
|
+
params.screenshot_fullpage ||
|
|
119
|
+
params.screenshot_selector)
|
|
120
|
+
searchParams.set("js_render", "true");
|
|
121
|
+
if (params.premium_proxy)
|
|
122
|
+
searchParams.set("premium_proxy", "true");
|
|
123
|
+
if (params.proxy_country)
|
|
124
|
+
searchParams.set("proxy_country", params.proxy_country.toUpperCase());
|
|
125
|
+
if (params.autoparse)
|
|
126
|
+
searchParams.set("autoparse", "true");
|
|
127
|
+
if (params.css_extractor)
|
|
128
|
+
searchParams.set("css_extractor", params.css_extractor);
|
|
129
|
+
if (params.wait_for)
|
|
130
|
+
searchParams.set("wait_for", params.wait_for);
|
|
131
|
+
if (params.wait != null)
|
|
132
|
+
searchParams.set("wait", String(params.wait));
|
|
133
|
+
if (params.js_instructions)
|
|
134
|
+
searchParams.set("js_instructions", params.js_instructions);
|
|
135
|
+
if (params.outputs)
|
|
136
|
+
searchParams.set("outputs", params.outputs);
|
|
137
|
+
if (params.screenshot || params.screenshot_fullpage || params.screenshot_selector)
|
|
138
|
+
searchParams.set("screenshot", "true");
|
|
139
|
+
if (params.screenshot_fullpage)
|
|
140
|
+
searchParams.set("screenshot_fullpage", "true");
|
|
141
|
+
if (params.screenshot_selector)
|
|
142
|
+
searchParams.set("screenshot_selector", params.screenshot_selector);
|
|
143
|
+
// response_type is mutually exclusive with autoparse, css_extractor, outputs, and screenshot params.
|
|
144
|
+
// 'html' is the ZenRows default (no param); all other values are passed through.
|
|
145
|
+
const isScreenshot = params.screenshot || params.screenshot_fullpage || params.screenshot_selector;
|
|
146
|
+
const effectiveType = params.response_type ?? "markdown";
|
|
147
|
+
if (!params.autoparse &&
|
|
148
|
+
!params.css_extractor &&
|
|
149
|
+
!params.outputs &&
|
|
150
|
+
!isScreenshot &&
|
|
151
|
+
effectiveType !== "html") {
|
|
152
|
+
searchParams.set("response_type", effectiveType);
|
|
153
|
+
}
|
|
154
|
+
let response;
|
|
155
|
+
try {
|
|
156
|
+
response = await fetch(`${ZENROWS_API_URL}?${searchParams}`, {
|
|
157
|
+
headers: { "User-Agent": `zenrows/mcp ${pkg.version}` },
|
|
158
|
+
});
|
|
159
|
+
}
|
|
160
|
+
catch (err) {
|
|
161
|
+
return {
|
|
162
|
+
content: [
|
|
163
|
+
{
|
|
164
|
+
type: "text",
|
|
165
|
+
text: `Network error contacting ZenRows: ${err instanceof Error ? err.message : String(err)}`,
|
|
166
|
+
},
|
|
167
|
+
],
|
|
168
|
+
isError: true,
|
|
169
|
+
};
|
|
170
|
+
}
|
|
171
|
+
if (!response.ok) {
|
|
172
|
+
const body = await response.text();
|
|
173
|
+
return {
|
|
174
|
+
content: [{ type: "text", text: `ZenRows error ${response.status}: ${body}` }],
|
|
175
|
+
isError: true,
|
|
176
|
+
};
|
|
177
|
+
}
|
|
178
|
+
const contentType = response.headers.get("content-type") ?? "";
|
|
179
|
+
const buffer = await response.arrayBuffer();
|
|
180
|
+
const bytes = new Uint8Array(buffer);
|
|
181
|
+
const isPng = bytes[0] === 0x89 && bytes[1] === 0x50 && bytes[2] === 0x4e && bytes[3] === 0x47;
|
|
182
|
+
const isJpeg = bytes[0] === 0xff && bytes[1] === 0xd8;
|
|
183
|
+
if (contentType.startsWith("image/") || isPng || isJpeg) {
|
|
184
|
+
const mimeType = isPng
|
|
185
|
+
? "image/png"
|
|
186
|
+
: isJpeg
|
|
187
|
+
? "image/jpeg"
|
|
188
|
+
: contentType.split(";")[0].trim();
|
|
189
|
+
const base64 = Buffer.from(buffer).toString("base64");
|
|
190
|
+
return {
|
|
191
|
+
content: [{ type: "image", data: base64, mimeType }],
|
|
192
|
+
};
|
|
193
|
+
}
|
|
194
|
+
return {
|
|
195
|
+
content: [{ type: "text", text: new TextDecoder().decode(buffer) }],
|
|
196
|
+
};
|
|
197
|
+
});
|
|
198
|
+
return server;
|
|
199
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@zenrows/mcp",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "1.0.4",
|
|
4
4
|
"description": "ZenRows MCP server — Universal Scraper API for AI coding assistants",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -11,16 +11,31 @@
|
|
|
11
11
|
],
|
|
12
12
|
"scripts": {
|
|
13
13
|
"build": "tsc && chmod +x dist/index.js",
|
|
14
|
+
"clean": "rm -rf dist",
|
|
14
15
|
"dev": "node --env-file=.env --import tsx src/index.ts",
|
|
16
|
+
"dev:http": "node --env-file=.env --import tsx src/http.ts",
|
|
17
|
+
"format": "prettier --write .",
|
|
18
|
+
"format:check": "prettier --check .",
|
|
15
19
|
"inspect": "npm run build && npx @modelcontextprotocol/inspector node dist/index.js",
|
|
16
|
-
"
|
|
20
|
+
"lint": "eslint src/**/*.ts",
|
|
21
|
+
"lint:fix": "eslint src/**/*.ts --fix",
|
|
22
|
+
"prepare": "npm run build",
|
|
23
|
+
"prepublishOnly": "npm run clean && npm run build && npm run typecheck && npm run lint",
|
|
24
|
+
"publish-beta": "npm publish --tag beta",
|
|
25
|
+
"typecheck": "tsc --noEmit"
|
|
17
26
|
},
|
|
18
27
|
"dependencies": {
|
|
28
|
+
"@hono/node-server": "^1.19.9",
|
|
19
29
|
"@modelcontextprotocol/sdk": "^1.27.1",
|
|
30
|
+
"hono": "^4.11.4",
|
|
20
31
|
"zod": "^3.23.0"
|
|
21
32
|
},
|
|
22
33
|
"devDependencies": {
|
|
23
34
|
"@types/node": "^22.0.0",
|
|
35
|
+
"@typescript-eslint/eslint-plugin": "^8.57.0",
|
|
36
|
+
"@typescript-eslint/parser": "^8.57.0",
|
|
37
|
+
"eslint": "^10.0.3",
|
|
38
|
+
"prettier": "^3.8.1",
|
|
24
39
|
"tsx": "^4.19.0",
|
|
25
40
|
"typescript": "^5.6.0"
|
|
26
41
|
},
|