npm - scrape-do-mcp - Versions diffs - 0.1.5 → 0.2.0 - Mend

scrape-do-mcp 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README-ZH.md CHANGED Viewed

@@ -6,8 +6,15 @@ Scrape.do 网页抓取和 Google 搜索 MCP 服务器 - 支持反机器人保护
 ## 功能特点
-- **scrape_url**: 抓取任意网页并返回 Markdown 格式内容。自动绕过 Cloudflare、WAF、CAPTCHA 和反爬虫保护。支持 JavaScript 渲染页面。
-- **google_search**: 搜索 Google 并返回结构化的 SERP 结果 JSON。包含自然搜索结果、知识图谱、本地商家、新闻、相关问题（People Also Ask）等。
+- **scrape_url**: 抓取任意网页并返回 Markdown 格式内容。自动绕过 Cloudflare、WAF、CAPTCHA 和反爬虫保护。支持 JavaScript 渲染、截图、地理定位（150+ 国家）、设备模拟、会话保持、自定义请求头/Cookie、超时控制等。
+- **google_search**: 搜索 Google 并返回结构化的 SERP 结果 JSON。包含自然搜索结果、知识图谱、本地商家、新闻、相关问题等。支持地理定位和设备筛选。
+## 可用工具
+| 工具 | 描述 |
+|------|------|
+| `scrape_url` | 全功能网页抓取，反机器人绕过。支持：JavaScript 渲染、截图（PNG）、地理定位（150+ 国家）、设备模拟（桌面/手机/平板）、会话保持、自定义请求头/Cookie、超时控制等。 |
+| `google_search` | Google SERP 结构化抓取，返回 JSON。支持：自然搜索结果、知识图谱、本地商家、新闻、People Also Ask、视频结果等，支持地理定位、设备筛选、时间筛选。 |
 ## 安装
@@ -55,12 +62,52 @@ claude mcp add-json scrape-do --scope user '{
 抓取任意网页并获取 Markdown 内容。
 ```typescript
-// 参数
+// 完整参数
 {
-  url: string,           // 要抓取的网址
-  render_js?: boolean,  // 渲染 JavaScript（默认 false）
-  super_proxy?: boolean, // 使用住宅代理（消耗 10 积分，默认 false）
-  output?: "markdown" | "raw"  // 输出格式（默认 markdown）
+  // 必需
+  url: string,                    // 要抓取的网址
+  // 代理和渲染
+  render_js?: boolean,            // 渲染 JavaScript（默认 false）
+  super_proxy?: boolean,           // 使用住宅/移动代理（消耗 10 积分）
+  geoCode?: string,               // 国家代码（如 'us', 'cn', 'gb'）
+  regionalGeoCode?: string,       // 区域（如 'asia', 'europe'）
+  device?: "desktop" | "mobile" | "tablet",  // 设备类型（默认 desktop）
+  sessionId?: number,             // 会话 ID（0-999999999），用于持续使用相同 IP
+  // 超时和重试
+  timeout?: number,               // 最大超时时间（毫秒，默认 60000，上限 120000）
+  retryTimeout?: number,          // 重试超时（毫秒）
+  disableRetry?: boolean,         // 禁用自动重试
+  // 输出格式
+  output?: "markdown" | "raw",  // 输出格式（默认 markdown）
+  returnJSON?: boolean,           // 以 JSON 形式返回网络请求
+  transparentResponse?: boolean,   // 返回未经 Scrape.do 处理的原始响应
+  // 截图
+  screenshot?: boolean,           // 截图（PNG）
+  fullScreenShot?: boolean,      // 全页截图
+  particularScreenShot?: string,  // 元素截图（CSS 选择器）
+  // 浏览器控制
+  waitSelector?: string,          // 等待元素（CSS 选择器）
+  customWait?: number,           // 加载后等待时间（毫秒）
+  waitUntil?: "domcontentloaded" | "load" | "networkidle" | "networkidle0" | "networkidle2",  // 页面加载完成的判定时机（默认 domcontentloaded）
+  width?: number,                // 视口宽度（默认 1920）
+  height?: number,               // 视口高度（默认 1080）
+  blockResources?: boolean,       // 阻止 CSS/图片/字体（默认 true）
+  // 请求头和 Cookie
+  customHeaders?: boolean,        // 处理所有请求头
+  extraHeaders?: boolean,       // 添加额外请求头
+  forwardHeaders?: boolean,      // 转发你的请求头
+  setCookies?: string,          // 设置 Cookie（格式：'name=value; name2=value2'）
+  pureCookies?: boolean,        // 返回原始 Cookie
+  // 其他
+  disableRedirection?: boolean, // 禁用重定向
+  callback?: string             // Webhook URL 异步接收结果
 }
 ```
@@ -69,14 +116,22 @@ claude mcp add-json scrape-do --scope user '{
 搜索 Google 并获取结构化结果。
 ```typescript
-// 参数
+// 完整参数
 {
-  query: string,                      // 搜索关键词
-  country?: string,                   // 国家代码（默认 "us"）
-  language?: string,                  // 界面语言（默认 "en"）
-  page?: number,                      // 页码（默认 1）
+  // 必需
+  query: string,                  // 搜索关键词
+  // 搜索选项
+  country?: string,                // 国家代码（默认 'us'）
+  language?: string,              // 界面语言（默认 'en'）
+  domain?: string,               // Google 域名（如 'com', 'co.uk'）
+  page?: number,                  // 页码（默认 1）
+  num?: number,                  // 每页结果数（默认 10）
   time_period?: "" | "last_hour" | "last_day" | "last_week" | "last_month" | "last_year",
-  device?: "desktop" | "mobile"       // 设备类型（默认 desktop）
+  device?: "desktop" | "mobile", // 设备类型（默认 desktop）
+  // 高级
+  includeHtml?: boolean           // 在响应中包含原始 HTML
 }
 ```
@@ -108,6 +163,31 @@ claude mcp add-json scrape-do --scope user '{
 抓取 https://example.com 并返回原始 HTML 而不是 markdown。
 ```
+### 地理定位抓取
+```
+用日本（geoCode: jp）的 IP 抓取 https://www.amazon.com/product/12345
+```
+### 移动设备模拟
+```
+用移动设备抓取 https://example.com 来查看移动版页面。
+```
+### 截图
+```
+截取 https://example.com 的屏幕截图并返回图片。
+```
+### 等待元素加载
+```
+抓取 https://example.com 但先等待 id 为 "content" 的元素加载完成。
+```
+### 会话保持
+```
+使用会话 ID 12345 抓取 https://example.com 的多个页面，以保持相同的 IP。
+```
 ## 与其他工具对比
 | 功能 | scrape-do-mcp | Firecrawl | Browserbase |
@@ -133,7 +213,7 @@ claude mcp add-json scrape-do --scope user '{
 | scrape_url（super_proxy） | 10 积分/次 |
 | google_search | 1 积分/次 |
-注册即送 **1,000 积分**：https://app.scrape.do
+**免费：每月 1,000 积分** - 无需信用卡：https://app.scrape.do
 ## 开发

package/README.md CHANGED Viewed

@@ -6,8 +6,15 @@ MCP Server for Scrape.do - Web Scraping & Google Search with anti-bot bypass
 ## Features
-- **scrape_url**: Scrape any webpage and return content as Markdown. Automatically bypasses Cloudflare, WAFs, CAPTCHAs, and anti-bot protection. Supports JavaScript-rendered pages.
-- **google_search**: Search Google and return structured SERP results as JSON. Returns organic results, knowledge graph, local businesses, news stories, and more.
+- **scrape_url**: Scrape any webpage and return content as Markdown. Automatically bypasses Cloudflare, WAFs, CAPTCHAs, and anti-bot protection. Supports JavaScript rendering, screenshots, geo-targeting (150+ countries), device emulation, session persistence, and more.
+- **google_search**: Search Google and return structured SERP results as JSON. Returns organic results, knowledge graph, local businesses, news stories, and more. Supports geo-targeting and device filtering.
+## Available Tools
+| Tool | Description |
+|------|-------------|
+| `scrape_url` | Full-featured web scraping with anti-bot bypass. Supports: JavaScript rendering, screenshots (PNG), geo-targeting (150+ countries), device emulation (desktop/mobile/tablet), session persistence, custom headers/cookies, timeout control, and more. |
+| `google_search` | Google SERP scraping returning structured JSON. Supports: organic results, knowledge graph, local businesses, news, People Also Ask, video results, geo-targeting, device filtering, and time-based filtering. |
 ## Installation
@@ -55,12 +62,52 @@ Get your free API token at: https://app.scrape.do
 Scrape any webpage and get content as Markdown.
 ```typescript
-// Parameters
+// All Parameters
 {
-  url: string,           // Target URL to scrape
-  render_js?: boolean,   // Render JavaScript (default: false)
-  super_proxy?: boolean, // Use residential proxies (costs 10 credits, default: false)
-  output?: "markdown" | "raw"  // Output format (default: markdown)
+  // Required
+  url: string,                    // Target URL to scrape
+  // Proxy & Rendering
+  render_js?: boolean,            // Render JavaScript (default: false)
+  super_proxy?: boolean,           // Use residential/mobile proxies (costs 10 credits)
+  geoCode?: string,               // Country code (e.g., 'us', 'cn', 'gb')
+  regionalGeoCode?: string,       // Region (e.g., 'asia', 'europe')
+  device?: "desktop" | "mobile" | "tablet",  // Device type to emulate (default: desktop)
+  sessionId?: number,             // Session ID (0-999999999); reuses the same IP across requests
+  // Timeout & Retry
+  timeout?: number,               // Max timeout in ms (default: 60000, max: 120000)
+  retryTimeout?: number,          // Retry timeout in ms
+  disableRetry?: boolean,         // Disable auto retry
+  // Output Format
+  output?: "markdown" | "raw",   // Output format (default: markdown)
+  returnJSON?: boolean,           // Return network requests as JSON
+  transparentResponse?: boolean,   // Return the pure response without Scrape.do processing
+  // Screenshot
+  screenshot?: boolean,           // Take screenshot (PNG)
+  fullScreenShot?: boolean,       // Full page screenshot
+  particularScreenShot?: string,   // Screenshot of element (CSS selector)
+  // Browser Control
+  waitSelector?: string,          // Wait for element (CSS selector)
+  customWait?: number,            // Wait time after load (ms)
+  waitUntil?: "domcontentloaded" | "load" | "networkidle" | "networkidle0" | "networkidle2",  // When to consider the page loaded (default: domcontentloaded)
+  width?: number,                // Viewport width (default: 1920)
+  height?: number,               // Viewport height (default: 1080)
+  blockResources?: boolean,      // Block CSS/images/fonts (default: true)
+  // Headers & Cookies
+  customHeaders?: boolean,        // Handle all headers
+  extraHeaders?: boolean,        // Add extra headers
+  forwardHeaders?: boolean,      // Forward your headers
+  setCookies?: string,           // Set cookies ('name=value; name2=value2')
+  pureCookies?: boolean,         // Return original cookies
+  // Other
+  disableRedirection?: boolean,  // Disable redirect
+  callback?: string              // Webhook URL for async results
 }
 ```
@@ -69,14 +116,22 @@ Scrape any webpage and get content as Markdown.
 Search Google and get structured results.
 ```typescript
-// Parameters
+// All Parameters
 {
-  query: string,                      // Search query
-  country?: string,                    // Country code (default: "us")
-  language?: string,                   // Interface language (default: "en")
-  page?: number,                       // Page number (default: 1)
+  // Required
+  query: string,                  // Search query
+  // Search Options
+  country?: string,                // Country code (default: 'us')
+  language?: string,               // Interface language (default: 'en')
+  domain?: string,                // Google domain (e.g., 'com', 'co.uk')
+  page?: number,                  // Page number (default: 1)
+  num?: number,                   // Results per page (default: 10)
   time_period?: "" | "last_hour" | "last_day" | "last_week" | "last_month" | "last_year",
-  device?: "desktop" | "mobile"        // Device type (default: desktop)
+  device?: "desktop" | "mobile", // Device type (default: desktop)
+  // Advanced
+  includeHtml?: boolean           // Include raw HTML in response
 }
 ```
@@ -110,6 +165,31 @@ Use render_js=true to get the fully rendered content.
 Scrape https://example.com and return raw HTML instead of markdown.
 ```
+### Geo-targeting
+```
+Scrape https://www.amazon.com/product/12345 as if I'm in Japan (geoCode: jp)
+```
+### Mobile Device
+```
+Scrape https://example.com using a mobile device to see the mobile version.
+```
+### Take Screenshot
+```
+Take a screenshot of https://example.com and return the image.
+```
+### Wait for Element
+```
+Scrape https://example.com but wait for the element with id "content" to load first.
+```
+### Session Persistence
+```
+Scrape multiple pages of https://example.com using sessionId 12345 to maintain the same IP.
+```
 ## Comparison with Alternatives
 | Feature | scrape-do-mcp | Firecrawl | Browserbase |
@@ -135,7 +215,7 @@ Scrape https://example.com and return raw HTML instead of markdown.
 | scrape_url (super_proxy) | 10 credits/request |
 | google_search | 1 credit/request |
-Free registration includes **1,000 credits**: https://app.scrape.do
+**Free: 1,000 credits/month** - No credit card required: https://app.scrape.do
 ## Development

package/dist/index.js CHANGED Viewed

@@ -12,32 +12,126 @@ const SCRAPE_DO_TOKEN = process.env.SCRAPE_DO_TOKEN || "";
 const SCRAPE_API_BASE = "https://api.scrape.do";
 const server = new mcp_js_1.McpServer({
     name: "scrape-do-mcp",
-    version: "0.1.3",
+    version: "0.2.0",
 });
 // ─── Tool 1: scrape_url ──────────────────────────────────────────────────────
-server.tool("scrape_url", "Scrape any webpage and return its content as Markdown. Automatically bypasses Cloudflare, WAFs, CAPTCHAs, and anti-bot protection. Supports JavaScript-rendered pages.", {
+server.tool("scrape_url", "Scrape any webpage and return its content as Markdown. Automatically bypasses Cloudflare, WAFs, CAPTCHAs, and anti-bot protection. Supports JavaScript-rendered pages, screenshots, geo-targeting, and more.", {
+    // Required
     url: zod_1.z.string().url().describe("The target URL to scrape"),
+    // Proxy & Rendering
     render_js: zod_1.z.boolean().optional().default(false).describe("Render JavaScript (use for React/Vue/SPA pages)"),
-    super_proxy: zod_1.z.boolean().optional().default(false).describe("Use residential/mobile proxies for harder-to-detect requests (costs 10 credits instead of 1)"),
+    super_proxy: zod_1.z.boolean().optional().default(false).describe("Use residential & mobile proxy networks (costs 10 credits instead of 1)"),
+    geoCode: zod_1.z.string().optional().describe("Country code for geo-targeting (e.g., 'us', 'cn', 'gb', 'jp'). See full list at https://scrape.do/features/geo-targeting/"),
+    regionalGeoCode: zod_1.z.string().optional().describe("Regional geo targeting (e.g., 'asia', 'europe', 'africa')"),
+    device: zod_1.z.enum(["desktop", "mobile", "tablet"]).optional().default("desktop").describe("Device type to emulate"),
+    sessionId: zod_1.z.number().optional().describe("Use the same IP address continuously with a session (0-999999999)"),
+    // Timeout & Retry
+    timeout: zod_1.z.number().optional().default(60000).describe("Maximum timeout for request in milliseconds (max 120000)"),
+    retryTimeout: zod_1.z.number().optional().describe("Maximum timeout for retry mechanism in milliseconds"),
+    disableRetry: zod_1.z.boolean().optional().default(false).describe("Disable automatic retry on failure"),
+    // Output Format
     output: zod_1.z.enum(["markdown", "raw"]).optional().default("markdown").describe("Output format: markdown (default) or raw HTML"),
-}, async ({ url, render_js, super_proxy, output }) => {
+    returnJSON: zod_1.z.boolean().optional().default(false).describe("Returns network requests with content as JSON"),
+    transparentResponse: zod_1.z.boolean().optional().default(false).describe("Return pure response without Scrape.do processing"),
+    // Screenshot
+    screenshot: zod_1.z.boolean().optional().default(false).describe("Return a screenshot from the webpage (PNG)"),
+    fullScreenShot: zod_1.z.boolean().optional().default(false).describe("Return a full page screenshot"),
+    particularScreenShot: zod_1.z.string().optional().describe("Return screenshot of a specific area (CSS selector)"),
+    // Browser Control
+    waitSelector: zod_1.z.string().optional().describe("CSS selector to wait for before returning"),
+    customWait: zod_1.z.number().optional().describe("Wait time in milliseconds after content loaded"),
+    waitUntil: zod_1.z.enum(["domcontentloaded", "load", "networkidle", "networkidle0", "networkidle2"]).optional().default("domcontentloaded").describe("When to consider page loaded"),
+    width: zod_1.z.number().optional().default(1920).describe("Browser viewport width in pixels"),
+    height: zod_1.z.number().optional().default(1080).describe("Browser viewport height in pixels"),
+    blockResources: zod_1.z.boolean().optional().default(true).describe("Block CSS, images, and fonts to speed up loading"),
+    // Headers & Cookies
+    customHeaders: zod_1.z.boolean().optional().default(false).describe("Handle all request headers for the target webpage"),
+    extraHeaders: zod_1.z.boolean().optional().default(false).describe("Add extra headers or change header values"),
+    forwardHeaders: zod_1.z.boolean().optional().default(false).describe("Forward your own headers to the target website"),
+    setCookies: zod_1.z.string().optional().describe("Set cookies for the target webpage (format: 'name=value; name2=value2')"),
+    pureCookies: zod_1.z.boolean().optional().default(false).describe("Return original Set-Cookie headers from target website"),
+    // Other
+    disableRedirection: zod_1.z.boolean().optional().default(false).describe("Disable request redirection"),
+    callback: zod_1.z.string().optional().describe("Get results via webhook URL without waiting"),
+}, async (params) => {
     if (!SCRAPE_DO_TOKEN) {
         return {
             content: [{ type: "text", text: "Error: SCRAPE_DO_TOKEN is not set. Get your free token at https://app.scrape.do" }],
             isError: true,
         };
     }
+    const { url, render_js, super_proxy, geoCode, regionalGeoCode, device, sessionId, timeout, retryTimeout, disableRetry, output, returnJSON, transparentResponse, screenshot, fullScreenShot, particularScreenShot, waitSelector, customWait, waitUntil, width, height, blockResources, customHeaders, extraHeaders, forwardHeaders, setCookies, pureCookies, disableRedirection, callback, } = params;
     try {
+        const requestParams = {
+            token: SCRAPE_DO_TOKEN,
+            url,
+            render: render_js,
+            super: super_proxy,
+            output,
+        };
+        // Add optional parameters if provided
+        if (geoCode)
+            requestParams.geoCode = geoCode;
+        if (regionalGeoCode)
+            requestParams.regionalGeoCode = regionalGeoCode;
+        if (device && device !== "desktop")
+            requestParams.device = device;
+        if (sessionId)
+            requestParams.sessionId = sessionId;
+        if (timeout && timeout !== 60000)
+            requestParams.timeout = timeout;
+        if (retryTimeout)
+            requestParams.retryTimeout = retryTimeout;
+        if (disableRetry)
+            requestParams.disableRetry = disableRetry;
+        if (returnJSON)
+            requestParams.returnJSON = returnJSON;
+        if (transparentResponse)
+            requestParams.transparentResponse = transparentResponse;
+        if (screenshot)
+            requestParams.screenShot = screenshot;
+        if (fullScreenShot)
+            requestParams.fullScreenShot = fullScreenShot;
+        if (particularScreenShot)
+            requestParams.particularScreenShot = particularScreenShot;
+        if (waitSelector)
+            requestParams.waitSelector = waitSelector;
+        if (customWait)
+            requestParams.customWait = customWait;
+        if (waitUntil && waitUntil !== "domcontentloaded")
+            requestParams.waitUntil = waitUntil;
+        if (width && width !== 1920)
+            requestParams.width = width;
+        if (height && height !== 1080)
+            requestParams.height = height;
+        if (blockResources === false)
+            requestParams.blockResources = false;
+        if (customHeaders)
+            requestParams.customHeaders = customHeaders;
+        if (extraHeaders)
+            requestParams.extraHeaders = extraHeaders;
+        if (forwardHeaders)
+            requestParams.forwardHeaders = forwardHeaders;
+        if (setCookies)
+            requestParams.setCookies = setCookies;
+        if (pureCookies)
+            requestParams.pureCookies = pureCookies;
+        if (disableRedirection)
+            requestParams.disableRedirection = disableRedirection;
+        if (callback)
+            requestParams.callback = callback;
         const response = await axios_1.default.get(SCRAPE_API_BASE, {
-            params: {
-                token: SCRAPE_DO_TOKEN,
-                url,
-                render: render_js,
-                super: super_proxy,
-                output,
-            },
-            timeout: 60000,
+            params: requestParams,
+            timeout: Math.min(timeout || 60000, 120000),
+            responseType: screenshot || fullScreenShot || particularScreenShot ? 'arraybuffer' : 'text',
         });
+        // If screenshot, return as base64
+        if (screenshot || fullScreenShot || particularScreenShot) {
+            const base64 = Buffer.from(response.data, 'binary').toString('base64');
+            return {
+                content: [{ type: "text", text: `Screenshot (base64): ${base64}` }],
+            };
+        }
         return {
             content: [{ type: "text", text: response.data }],
         };
@@ -52,13 +146,19 @@ server.tool("scrape_url", "Scrape any webpage and return its content as Markdown
 });
 // ─── Tool 2: google_search ───────────────────────────────────────────────────
 server.tool("google_search", "Search Google and return structured SERP results as JSON. Returns organic results, knowledge graph, local businesses, news stories, related questions (People Also Ask), video results, and more.", {
+    // Required
     query: zod_1.z.string().describe("Search query, e.g. 'best python frameworks 2026'"),
-    country: zod_1.z.string().optional().default("us").describe("Country code for results, e.g. 'us', 'cn', 'gb', 'jp'"),
-    language: zod_1.z.string().optional().default("en").describe("Interface language, e.g. 'en', 'zh', 'ja', 'de'"),
+    // Search Options
+    country: zod_1.z.string().optional().default("us").describe("Country code for results (e.g., 'us', 'cn', 'gb', 'jp'). See: https://scrape.do/features/geo-targeting/"),
+    language: zod_1.z.string().optional().default("en").describe("Interface language (e.g., 'en', 'zh', 'ja', 'de')"),
+    domain: zod_1.z.string().optional().describe("Google domain (e.g., 'com', 'co.uk', 'de', 'fr')"),
     page: zod_1.z.number().optional().default(1).describe("Page number (1 = first page, 2 = second page)"),
     time_period: zod_1.z.enum(["", "last_hour", "last_day", "last_week", "last_month", "last_year"]).optional().default("").describe("Filter results by time period"),
     device: zod_1.z.enum(["desktop", "mobile"]).optional().default("desktop").describe("Device type affecting SERP layout"),
-}, async ({ query, country, language, page, time_period, device }) => {
+    // Advanced
+    num: zod_1.z.number().optional().describe("Number of results per page (default: 10)"),
+    includeHtml: zod_1.z.boolean().optional().default(false).describe("Include raw HTML alongside parsed JSON"),
+}, async ({ query, country, language, domain, page, time_period, device, num, includeHtml }) => {
     if (!SCRAPE_DO_TOKEN) {
         return {
             content: [{ type: "text", text: "Error: SCRAPE_DO_TOKEN is not set. Get your free token at https://app.scrape.do" }],
@@ -71,9 +171,15 @@ server.tool("google_search", "Search Google and return structured SERP results a
             q: query,
             gl: country,
             hl: language,
-            start: (page - 1) * 10,
+            start: (page - 1) * (num || 10),
             device,
         };
+        if (domain)
+            params.domain = domain;
+        if (num)
+            params.num = num;
+        if (includeHtml)
+            params.include_html = includeHtml;
         if (time_period)
             params.time_period = time_period;
         const response = await axios_1.default.get(`${SCRAPE_API_BASE}/plugin/google/search`, {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "scrape-do-mcp",
-  "version": "0.1.5",
+  "version": "0.2.0",
   "description": "MCP Server for Scrape.do - Web Scraping & Google Search with anti-bot bypass",
   "main": "dist/index.js",
   "bin": {