@deventerprisesoftware/scrapi-mcp 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +54 -3
- package/dist/index.js +96 -32
- package/package.json +3 -3
package/README.md
CHANGED
|
@@ -11,22 +11,73 @@ MCP server for using [ScrAPI](https://scrapi.tech) to scrape web pages.
|
|
|
11
11
|
|
|
12
12
|
ScrAPI is your ultimate web scraping solution, offering powerful, reliable, and easy-to-use features to extract data from any website effortlessly.
|
|
13
13
|
|
|
14
|
+
<a href="https://glama.ai/mcp/servers/@DevEnterpriseSoftware/scrapi-mcp">
|
|
15
|
+
<img width="380" height="200" src="https://glama.ai/mcp/servers/@DevEnterpriseSoftware/scrapi-mcp/badge" alt="ScrAPI Server MCP server" />
|
|
16
|
+
</a>
|
|
17
|
+
|
|
14
18
|
## Tools
|
|
15
19
|
|
|
16
20
|
1. `scrape_url_html`
|
|
17
21
|
- Use a URL to scrape a website using the ScrAPI service and retrieve the result as HTML.
|
|
18
22
|
Use this for scraping website content that is difficult to access because of bot detection, captchas or even geolocation restrictions.
|
|
19
23
|
The result will be in HTML which is preferable if advanced parsing is required.
|
|
20
|
-
-
|
|
24
|
+
- Inputs:
|
|
25
|
+
- `url` (string, required): The URL to scrape
|
|
26
|
+
- `browserCommands` (string, optional): JSON array of browser commands to execute before scraping
|
|
21
27
|
- Returns: HTML content of the URL
|
|
22
28
|
|
|
23
29
|
2. `scrape_url_markdown`
|
|
24
30
|
- Use a URL to scrape a website using the ScrAPI service and retrieve the result as Markdown.
|
|
25
31
|
Use this for scraping website content that is difficult to access because of bot detection, captchas or even geolocation restrictions.
|
|
26
32
|
The result will be in Markdown which is preferable if the text content of the webpage is important and not the structural information of the page.
|
|
27
|
-
-
|
|
33
|
+
- Inputs:
|
|
34
|
+
- `url` (string, required): The URL to scrape
|
|
35
|
+
- `browserCommands` (string, optional): JSON array of browser commands to execute before scraping
|
|
28
36
|
- Returns: Markdown content of the URL
|
|
29
37
|
|
|
38
|
+
## Browser Commands
|
|
39
|
+
|
|
40
|
+
Both tools support optional browser commands that allow you to interact with the page before scraping. This is useful for:
|
|
41
|
+
- Clicking buttons (e.g., "Accept Cookies", "Load More")
|
|
42
|
+
- Filling out forms
|
|
43
|
+
- Selecting dropdown options
|
|
44
|
+
- Scrolling to load dynamic content
|
|
45
|
+
- Waiting for elements to appear
|
|
46
|
+
- Executing custom JavaScript
|
|
47
|
+
|
|
48
|
+
### Available Commands
|
|
49
|
+
|
|
50
|
+
Commands are provided as a JSON array string. All commands are executed with human-like behavior (random mouse movements, variable typing speed, etc.):
|
|
51
|
+
|
|
52
|
+
| Command | Format | Description |
|
|
53
|
+
|---------|--------|-------------|
|
|
54
|
+
| **Click** | `{"click": "#buttonId"}` | Click an element using CSS selector |
|
|
55
|
+
| **Input** | `{"input": {"input[name='email']": "value"}}` | Fill an input field |
|
|
56
|
+
| **Select** | `{"select": {"select[name='country']": "USA"}}` | Select from dropdown (by value or text) |
|
|
57
|
+
| **Scroll** | `{"scroll": 1000}` | Scroll down by pixels (negative values scroll up) |
|
|
58
|
+
| **Wait** | `{"wait": 5000}` | Wait for milliseconds (max 15000) |
|
|
59
|
+
| **WaitFor** | `{"waitfor": "#elementId"}` | Wait for element to appear in DOM |
|
|
60
|
+
| **JavaScript** | `{"javascript": "console.log('test')"}` | Execute custom JavaScript code |
|
|
61
|
+
|
|
62
|
+
### Example Usage
|
|
63
|
+
|
|
64
|
+
```json
|
|
65
|
+
[
|
|
66
|
+
{"click": "#accept-cookies"},
|
|
67
|
+
{"wait": 2000},
|
|
68
|
+
{"input": {"input[name='search']": "web scraping"}},
|
|
69
|
+
{"click": "button[type='submit']"},
|
|
70
|
+
{"waitfor": "#results"},
|
|
71
|
+
{"scroll": 500}
|
|
72
|
+
]
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Finding CSS Selectors
|
|
76
|
+
|
|
77
|
+
Need help finding CSS selectors? Try the [Rayrun browser extension](https://chromewebstore.google.com/detail/rayrun/olljocejdgeipcaompahmnfebhkfmnma) to easily select elements and generate selectors.
|
|
78
|
+
|
|
79
|
+
For more details, see the [Browser Commands documentation](https://scrapi.tech/docs/api_details/v1_scrape/browser_commands).
|
|
80
|
+
|
|
30
81
|
## Setup
|
|
31
82
|
|
|
32
83
|
### API Key (optional)
|
|
@@ -101,4 +152,4 @@ docker build -t deventerprisesoftware/scrapi-mcp -f Dockerfile .
|
|
|
101
152
|
|
|
102
153
|
## License
|
|
103
154
|
|
|
104
|
-
This MCP server is licensed under the MIT License. This means you are free to use, modify, and distribute the software, subject to the terms and conditions of the MIT License. For more details, please see the LICENSE file in the project repository.
|
|
155
|
+
This MCP server is licensed under the MIT License. This means you are free to use, modify, and distribute the software, subject to the terms and conditions of the MIT License. For more details, please see the LICENSE file in the project repository.
|
package/dist/index.js
CHANGED
|
@@ -8,12 +8,19 @@ import { z } from "zod";
|
|
|
8
8
|
const PORT = process.env.PORT || 5000;
|
|
9
9
|
const SCRAPI_API_KEY = process.env.SCRAPI_API_KEY || "00000000-0000-0000-0000-000000000000";
|
|
10
10
|
const SCRAPI_SERVER_NAME = "ScrAPI MCP Server";
|
|
11
|
-
const SCRAPI_SERVER_VERSION = "0.
|
|
11
|
+
const SCRAPI_SERVER_VERSION = "0.2.0";
|
|
12
12
|
const app = express();
|
|
13
13
|
app.use(cors({
|
|
14
14
|
origin: "*",
|
|
15
|
-
exposedHeaders: ["
|
|
16
|
-
allowedHeaders: [
|
|
15
|
+
exposedHeaders: ["mcp-session-id", "mcp-protocol-version"],
|
|
16
|
+
allowedHeaders: [
|
|
17
|
+
"Content-Type",
|
|
18
|
+
"mcp-session-id",
|
|
19
|
+
"mcp-protocol-version",
|
|
20
|
+
"Authorization",
|
|
21
|
+
],
|
|
22
|
+
methods: ["GET", "POST", "OPTIONS"],
|
|
23
|
+
preflightContinue: false
|
|
17
24
|
}));
|
|
18
25
|
app.use(express.json());
|
|
19
26
|
// Define session configuration schema
|
|
@@ -38,28 +45,56 @@ export default function createServer({ config, }) {
|
|
|
38
45
|
title: "Scrape URL and respond with HTML",
|
|
39
46
|
description: "Use a URL to scrape a website using the ScrAPI service and retrieve the result as HTML. " +
|
|
40
47
|
"Use this for scraping website content that is difficult to access because of bot detection, captchas or even geolocation restrictions. " +
|
|
41
|
-
"The result will be in HTML which is preferable if advanced parsing is required.",
|
|
48
|
+
"The result will be in HTML which is preferable if advanced parsing is required.\n\n" +
|
|
49
|
+
"BROWSER COMMANDS: You can optionally provide browser commands to interact with the page before scraping (e.g., clicking buttons, filling forms, scrolling). " +
|
|
50
|
+
"Provide commands as a JSON array string. Available commands:\n" +
|
|
51
|
+
"- Click: {\"click\": \"#buttonId\"} - Click an element using CSS selector\n" +
|
|
52
|
+
"- Input: {\"input\": {\"input[name='email']\": \"value\"}} - Fill an input field\n" +
|
|
53
|
+
"- Select: {\"select\": {\"select[name='country']\": \"USA\"}} - Select from dropdown\n" +
|
|
54
|
+
"- Scroll: {\"scroll\": 1000} - Scroll down (negative values scroll up)\n" +
|
|
55
|
+
"- Wait: {\"wait\": 5000} - Wait milliseconds (max 15000)\n" +
|
|
56
|
+
"- WaitFor: {\"waitfor\": \"#elementId\"} - Wait for element to appear\n" +
|
|
57
|
+
"- JavaScript: {\"javascript\": \"console.log('test')\"} - Execute custom JS\n" +
|
|
58
|
+
"Example: [{\"click\": \"#accept-cookies\"}, {\"wait\": 2000}, {\"input\": {\"input[name='search']\": \"query\"}}]",
|
|
42
59
|
inputSchema: {
|
|
43
60
|
url: z
|
|
44
61
|
.string()
|
|
45
62
|
.url({ message: "Invalid URL" })
|
|
46
63
|
.describe("The URL to scrape"),
|
|
64
|
+
browserCommands: z
|
|
65
|
+
.string()
|
|
66
|
+
.optional()
|
|
67
|
+
.describe("Optional JSON array of browser commands to execute before scraping. See tool description for available commands and format."),
|
|
47
68
|
},
|
|
48
|
-
}, async ({ url }) => await scrapeUrl(url, "HTML"));
|
|
69
|
+
}, async ({ url, browserCommands }) => await scrapeUrl(url, "HTML", browserCommands));
|
|
49
70
|
server.registerTool("scrape_url_markdown", {
|
|
50
71
|
title: "Scrape URL and respond with Markdown",
|
|
51
72
|
description: "Use a URL to scrape a website using the ScrAPI service and retrieve the result as Markdown. " +
|
|
52
73
|
"Use this for scraping website content that is difficult to access because of bot detection, captchas or even geolocation restrictions. " +
|
|
53
|
-
"The result will be in Markdown which is preferable if the text content of the webpage is important and not the structural information of the page.",
|
|
74
|
+
"The result will be in Markdown which is preferable if the text content of the webpage is important and not the structural information of the page.\n\n" +
|
|
75
|
+
"BROWSER COMMANDS: You can optionally provide browser commands to interact with the page before scraping (e.g., clicking buttons, filling forms, scrolling). " +
|
|
76
|
+
"Provide commands as a JSON array string. Available commands:\n" +
|
|
77
|
+
"- Click: {\"click\": \"#buttonId\"} - Click an element using CSS selector\n" +
|
|
78
|
+
"- Input: {\"input\": {\"input[name='email']\": \"value\"}} - Fill an input field\n" +
|
|
79
|
+
"- Select: {\"select\": {\"select[name='country']\": \"USA\"}} - Select from dropdown\n" +
|
|
80
|
+
"- Scroll: {\"scroll\": 1000} - Scroll down (negative values scroll up)\n" +
|
|
81
|
+
"- Wait: {\"wait\": 5000} - Wait milliseconds (max 15000)\n" +
|
|
82
|
+
"- WaitFor: {\"waitfor\": \"#elementId\"} - Wait for element to appear\n" +
|
|
83
|
+
"- JavaScript: {\"javascript\": \"console.log('test')\"} - Execute custom JS\n" +
|
|
84
|
+
"Example: [{\"click\": \"#accept-cookies\"}, {\"wait\": 2000}, {\"input\": {\"input[name='search']\": \"query\"}}]",
|
|
54
85
|
inputSchema: {
|
|
55
86
|
url: z
|
|
56
87
|
.string()
|
|
57
88
|
.url({ message: "Invalid URL" })
|
|
58
89
|
.describe("The URL to scrape"),
|
|
90
|
+
browserCommands: z
|
|
91
|
+
.string()
|
|
92
|
+
.optional()
|
|
93
|
+
.describe("Optional JSON array of browser commands to execute before scraping. See tool description for available commands and format."),
|
|
59
94
|
},
|
|
60
|
-
}, async ({ url }) => await scrapeUrl(url, "Markdown"));
|
|
61
|
-
async function scrapeUrl(url, format) {
|
|
62
|
-
|
|
95
|
+
}, async ({ url, browserCommands }) => await scrapeUrl(url, "Markdown", browserCommands));
|
|
96
|
+
async function scrapeUrl(url, format, browserCommands) {
|
|
97
|
+
const body = {
|
|
63
98
|
url: url,
|
|
64
99
|
useBrowser: true,
|
|
65
100
|
solveCaptchas: true,
|
|
@@ -67,6 +102,38 @@ export default function createServer({ config, }) {
|
|
|
67
102
|
proxyType: "Residential",
|
|
68
103
|
responseFormat: format,
|
|
69
104
|
};
|
|
105
|
+
// Parse and add browser commands if provided
|
|
106
|
+
if (browserCommands && browserCommands.trim() !== "") {
|
|
107
|
+
try {
|
|
108
|
+
const commands = JSON.parse(browserCommands);
|
|
109
|
+
if (Array.isArray(commands)) {
|
|
110
|
+
body.browserCommands = commands;
|
|
111
|
+
}
|
|
112
|
+
else {
|
|
113
|
+
return {
|
|
114
|
+
content: [
|
|
115
|
+
{
|
|
116
|
+
type: "text",
|
|
117
|
+
text: "Error: Browser commands must be a JSON array.",
|
|
118
|
+
},
|
|
119
|
+
],
|
|
120
|
+
isError: true,
|
|
121
|
+
};
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
catch (error) {
|
|
125
|
+
const errorMessage = error instanceof Error ? error.message : "Unknown error";
|
|
126
|
+
return {
|
|
127
|
+
content: [
|
|
128
|
+
{
|
|
129
|
+
type: "text",
|
|
130
|
+
text: `Error: Invalid browser commands format. ${errorMessage}`,
|
|
131
|
+
},
|
|
132
|
+
],
|
|
133
|
+
isError: true,
|
|
134
|
+
};
|
|
135
|
+
}
|
|
136
|
+
}
|
|
70
137
|
try {
|
|
71
138
|
const response = await fetch("https://api.scrapi.tech/v1/scrape", {
|
|
72
139
|
method: "POST",
|
|
@@ -76,7 +143,7 @@ export default function createServer({ config, }) {
|
|
|
76
143
|
"X-API-KEY": config.scrapiApiKey || SCRAPI_API_KEY,
|
|
77
144
|
},
|
|
78
145
|
body: JSON.stringify(body),
|
|
79
|
-
signal: AbortSignal.timeout(
|
|
146
|
+
signal: AbortSignal.timeout(300000),
|
|
80
147
|
});
|
|
81
148
|
const data = await response.text();
|
|
82
149
|
if (response.ok) {
|
|
@@ -104,29 +171,17 @@ export default function createServer({ config, }) {
|
|
|
104
171
|
}
|
|
105
172
|
catch (error) {
|
|
106
173
|
console.error("Error calling API:", error);
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
"X-API-KEY": SCRAPI_API_KEY,
|
|
114
|
-
},
|
|
115
|
-
body: JSON.stringify(body),
|
|
116
|
-
signal: AbortSignal.timeout(30000),
|
|
117
|
-
});
|
|
118
|
-
const data = await response.text();
|
|
119
|
-
return {
|
|
120
|
-
content: [
|
|
121
|
-
{
|
|
122
|
-
type: "text",
|
|
123
|
-
text: data,
|
|
124
|
-
_meta: {
|
|
125
|
-
mimeType: `text/${format.toLowerCase()}`,
|
|
174
|
+
const errorMessage = error instanceof Error ? error.message : "Unknown error occurred";
|
|
175
|
+
return {
|
|
176
|
+
content: [
|
|
177
|
+
{
|
|
178
|
+
type: "text",
|
|
179
|
+
text: `Error: Failed to scrape URL. ${errorMessage}`,
|
|
126
180
|
},
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
181
|
+
],
|
|
182
|
+
isError: true,
|
|
183
|
+
};
|
|
184
|
+
}
|
|
130
185
|
}
|
|
131
186
|
return server.server;
|
|
132
187
|
}
|
|
@@ -161,6 +216,15 @@ app.all("/mcp", async (req, res) => {
|
|
|
161
216
|
}
|
|
162
217
|
}
|
|
163
218
|
});
|
|
219
|
+
app.options("/*", (req, res) => {
|
|
220
|
+
const reqHeaders = req.header("access-control-request-headers");
|
|
221
|
+
if (reqHeaders) {
|
|
222
|
+
res.setHeader("Access-Control-Allow-Headers", reqHeaders);
|
|
223
|
+
}
|
|
224
|
+
res.setHeader("Access-Control-Allow-Methods", "GET,POST,OPTIONS");
|
|
225
|
+
res.setHeader("Access-Control-Allow-Origin", "*");
|
|
226
|
+
res.sendStatus(204);
|
|
227
|
+
});
|
|
164
228
|
// Main function to start the server in the appropriate mode
|
|
165
229
|
async function main() {
|
|
166
230
|
const transport = process.env.TRANSPORT || "stdio";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@deventerprisesoftware/scrapi-mcp",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "MCP server for using ScrAPI to scrape web pages.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"mcp",
|
|
@@ -37,7 +37,7 @@
|
|
|
37
37
|
"docker:release": "npm run build && npm run docker:build && npm run docker:push"
|
|
38
38
|
},
|
|
39
39
|
"dependencies": {
|
|
40
|
-
"@modelcontextprotocol/sdk": "^1.
|
|
40
|
+
"@modelcontextprotocol/sdk": "^1.25.2",
|
|
41
41
|
"cors": "^2.8.5",
|
|
42
42
|
"express": "^5.2.1",
|
|
43
43
|
"zod": "^3.25.76"
|
|
@@ -45,7 +45,7 @@
|
|
|
45
45
|
"devDependencies": {
|
|
46
46
|
"@types/cors": "^2.8.19",
|
|
47
47
|
"@types/express": "^5.0.6",
|
|
48
|
-
"@types/node": "^
|
|
48
|
+
"@types/node": "^25.0.3",
|
|
49
49
|
"shx": "^0.4.0",
|
|
50
50
|
"typescript": "^5.9.3"
|
|
51
51
|
}
|