mcp-server-scraper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +114 -0
- package/dist/index.js +214 -0
- package/dist/index.js.map +1 -0
- package/package.json +70 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Ofer Shapira

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

package/README.md
ADDED
@@ -0,0 +1,114 @@
# mcp-server-scraper

[](https://www.npmjs.com/package/mcp-server-scraper)
[](https://www.npmjs.com/package/mcp-server-scraper)
[](https://github.com/ofershap/mcp-server-scraper/actions/workflows/ci.yml)
[](https://www.typescriptlang.org/)
[](https://opensource.org/licenses/MIT)

Extract clean, readable content from any URL. Returns markdown text, links, and metadata. No API keys, no config. A free alternative to Firecrawl for scraping docs, blogs, and articles.

```bash
npx mcp-server-scraper
```

> Works with Claude Desktop, Cursor, VS Code Copilot, and any MCP client. No accounts or API keys needed.

<sub>Demo built with <a href="https://github.com/ofershap/remotion-readme-kit">remotion-readme-kit</a></sub>

## Why

When you're working with an AI assistant and need to reference a docs page, a blog post, or an API reference, you usually end up copy-pasting content manually. Tools like Firecrawl solve this but require a paid API key. This server does the same thing for free. It fetches a URL, runs it through Mozilla Readability (the same engine behind Firefox Reader View), and returns clean markdown. It works well for server-rendered content like documentation sites, blog posts, and articles. It won't handle JavaScript-heavy SPAs, but for the most common use case of "read this docs page and summarize it," it does the job.

## Tools

| Tool               | What it does                                                      |
| ------------------ | ----------------------------------------------------------------- |
| `scrape_url`       | Extract clean text content from a URL (Readability-powered)       |
| `extract_links`    | Get all links with href and anchor text                           |
| `extract_metadata` | Get title, description, OG tags, canonical, favicon               |
| `search_page`      | Search for a query string within the page, return matching lines  |
| `scrape_multiple`  | Batch scrape multiple URLs, get title + excerpt per URL           |

## Quick Start

### Cursor

Add to `.cursor/mcp.json`:

```json
{
  "mcpServers": {
    "scraper": {
      "command": "npx",
      "args": ["-y", "mcp-server-scraper"]
    }
  }
}
```

### Claude Desktop

Add to `claude_desktop_config.json`:

```json
{
  "mcpServers": {
    "scraper": {
      "command": "npx",
      "args": ["-y", "mcp-server-scraper"]
    }
  }
}
```

### VS Code

Add to your MCP settings (e.g. `.vscode/mcp.json`):

```json
{
  "mcp": {
    "servers": {
      "scraper": {
        "command": "npx",
        "args": ["-y", "mcp-server-scraper"]
      }
    }
  }
}
```

## Examples

- "Scrape the API docs from https://docs.example.com and summarize them"
- "Extract all links from this page"
- "What's the OG image and description for this URL?"
- "Search this page for mentions of 'authentication'"
- "Scrape these 5 URLs and give me a summary of each"

## How it works

Uses [Mozilla Readability](https://github.com/mozilla/readability) (the engine behind Firefox Reader View) plus [linkedom](https://github.com/WebReflection/linkedom) for fast HTML parsing in Node. No headless browser needed. Works best with server-rendered pages: docs, blogs, articles, news sites.

## Development

```bash
npm install
npm run typecheck
npm run build
npm test
```

## Author

**Ofer Shapira**

[](https://linkedin.com/in/ofershap)
[](https://github.com/ofershap)

## License

[MIT](LICENSE) © [Ofer Shapira](https://github.com/ofershap)

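Editor's note: the Quick Start entries above all do the same thing, namely have an MCP client spawn `npx -y mcp-server-scraper` and talk to it over stdio. For readers who want to exercise the tools outside an editor, here is a minimal sketch using the `@modelcontextprotocol/sdk` client classes. It is an illustration only, not part of the package; the client name, version, and target URL are placeholders.

```typescript
// Sketch: drive mcp-server-scraper from a standalone MCP client over stdio.
// Assumes the TypeScript MCP SDK client API; names and the URL are placeholders.
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";

async function main() {
  // Spawn the published binary, exactly as the editor configs above do.
  const transport = new StdioClientTransport({
    command: "npx",
    args: ["-y", "mcp-server-scraper"],
  });
  const client = new Client({ name: "scraper-demo", version: "0.0.0" });
  await client.connect(transport);

  // List the five registered tools, then call scrape_url on an example page.
  const { tools } = await client.listTools();
  console.log(tools.map((t) => t.name)); // scrape_url, extract_links, ...

  const result = await client.callTool({
    name: "scrape_url",
    arguments: { url: "https://example.com" },
  });
  // Expected shape: { content: [{ type: "text", text: "# ..." }] }
  console.log(result);

  await client.close();
}

main().catch(console.error);
```
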
package/dist/index.js
ADDED
@@ -0,0 +1,214 @@
#!/usr/bin/env node

// src/index.ts
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { z } from "zod";

// src/scraper.ts
import { Readability } from "@mozilla/readability";
import { parseHTML } from "linkedom";
async function fetchPage(url) {
  const response = await fetch(url, {
    headers: {
      "User-Agent": "Mozilla/5.0 (compatible; mcp-server-scraper/1.0)",
      Accept: "text/html,application/xhtml+xml"
    },
    redirect: "follow"
  });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
  }
  return response.text();
}
async function scrapeUrl(url) {
  const html = await fetchPage(url);
  const { document } = parseHTML(html);
  const reader = new Readability(document);
  const article = reader.parse();
  if (!article) {
    throw new Error("Could not extract readable content from the page");
  }
  const textContent = article.textContent?.trim() ?? "";
  return {
    title: article.title ?? "",
    content: textContent,
    excerpt: article.excerpt ?? "",
    byline: article.byline ?? "",
    siteName: article.siteName ?? "",
    length: article.length ?? textContent.length
  };
}
async function extractLinks(url) {
  const html = await fetchPage(url);
  const { document } = parseHTML(html);
  const anchors = Array.from(document.querySelectorAll("a[href]"));
  const links = [];
  const baseUrl = new URL(url);
  for (const a of anchors) {
    const href = a.getAttribute("href");
    if (!href || href.startsWith("#") || href.startsWith("javascript:"))
      continue;
    try {
      const resolved = new URL(href, baseUrl).href;
      links.push({ href: resolved, text: (a.textContent ?? "").trim() });
    } catch {
    }
  }
  return links;
}
async function extractMetadata(url) {
  const html = await fetchPage(url);
  const { document } = parseHTML(html);
  const getMeta = (name) => {
    const el = document.querySelector(
      `meta[property="${name}"], meta[name="${name}"]`
    );
    return el?.getAttribute("content") ?? "";
  };
  const title = document.querySelector("title")?.textContent ?? "";
  const canonical = document.querySelector("link[rel='canonical']")?.getAttribute("href") ?? "";
  const favicon = document.querySelector("link[rel='icon'], link[rel='shortcut icon']")?.getAttribute("href") ?? "";
  return {
    title,
    description: getMeta("description"),
    ogTitle: getMeta("og:title"),
    ogDescription: getMeta("og:description"),
    ogImage: getMeta("og:image"),
    canonical,
    favicon
  };
}
async function searchPage(url, query) {
  const html = await fetchPage(url);
  const { document } = parseHTML(html);
  const text = document.body?.textContent ?? "";
  const lines = text.split("\n").map((l) => l.trim()).filter(Boolean);
  const queryLower = query.toLowerCase();
  return lines.filter(
    (line) => line.toLowerCase().includes(queryLower)
  );
}
async function scrapeMultiple(urls) {
  const results = await Promise.allSettled(
    urls.map(async (url) => {
      const content = await scrapeUrl(url);
      return {
        url,
        title: content.title,
        excerpt: content.excerpt
      };
    })
  );
  return results.map((r, i) => {
    if (r.status === "fulfilled") return r.value;
    return {
      url: urls[i] ?? "",
      title: "",
      excerpt: "",
      error: String(r.reason.message)
    };
  });
}

// src/index.ts
var server = new McpServer({
  name: "mcp-server-scraper",
  version: "1.0.0"
});
server.tool(
  "scrape_url",
  "Extract clean, readable text content from a URL using Mozilla Readability. Returns title, excerpt, and main content. Best for articles, docs, and blog posts.",
  {
    url: z.string().url().describe("The URL to scrape")
  },
  async ({ url }) => {
    const content = await scrapeUrl(url);
    const text = [
      `# ${content.title}`,
      content.byline ? `*${content.byline}*` : null,
      content.siteName ? `*${content.siteName}*` : null,
      "",
      content.excerpt ? `> ${content.excerpt}` : null,
      "",
      "---",
      "",
      content.content,
      "",
      `_(${content.length} characters)_`
    ].filter(Boolean).join("\n");
    return { content: [{ type: "text", text }] };
  }
);
server.tool(
  "extract_links",
  "Extract all links from a page with their href and anchor text. Resolves relative URLs. Skips anchors and javascript: links.",
  {
    url: z.string().url().describe("The URL to extract links from")
  },
  async ({ url }) => {
    const links = await extractLinks(url);
    const text = links.length === 0 ? "No links found." : links.map((l, i) => `${i + 1}. [${l.text || l.href}](${l.href})`).join("\n");
    return { content: [{ type: "text", text }] };
  }
);
server.tool(
  "extract_metadata",
  "Extract page metadata: title, description, Open Graph tags (og:title, og:description, og:image), canonical URL, and favicon.",
  {
    url: z.string().url().describe("The URL to extract metadata from")
  },
  async ({ url }) => {
    const meta = await extractMetadata(url);
    const lines = [
      `**Title:** ${meta.title || "(none)"}`,
      `**Description:** ${meta.description || "(none)"}`,
      `**og:title:** ${meta.ogTitle || "(none)"}`,
      `**og:description:** ${meta.ogDescription || "(none)"}`,
      `**og:image:** ${meta.ogImage || "(none)"}`,
      `**Canonical:** ${meta.canonical || "(none)"}`,
      `**Favicon:** ${meta.favicon || "(none)"}`
    ];
    return { content: [{ type: "text", text: lines.join("\n") }] };
  }
);
server.tool(
  "search_page",
  "Search for a query string within the page text. Returns matching lines (one per line). Use for finding mentions of a term.",
  {
    url: z.string().url().describe("The URL to search"),
    query: z.string().describe("The search query")
  },
  async ({ url, query }) => {
    const lines = await searchPage(url, query);
    const text = lines.length === 0 ? `No lines containing "${query}" found.` : lines.map((l, i) => `${i + 1}. ${l}`).join("\n");
    return { content: [{ type: "text", text }] };
  }
);
server.tool(
  "scrape_multiple",
  "Batch scrape multiple URLs. Returns title and excerpt for each. Failures are reported per URL without failing the whole batch.",
  {
    urls: z.array(z.string().url()).describe("Array of URLs to scrape")
  },
  async ({ urls }) => {
    const results = await scrapeMultiple(urls);
    const lines = results.map((r, i) => {
      if (r.error) {
        return `${i + 1}. **${r.url}** \u2014 Error: ${r.error}`;
      }
      return `${i + 1}. **${r.title || "(no title)"}** \u2014 ${r.excerpt || "(no excerpt)"}
   ${r.url}`;
    });
    return { content: [{ type: "text", text: lines.join("\n\n") }] };
  }
);
async function main() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
}
main().catch((err) => {
  console.error("Fatal error:", err);
  process.exit(1);
});
//# sourceMappingURL=index.js.map

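Editor's note: the `StdioServerTransport` above exchanges JSON-RPC messages on stdin/stdout, and every `server.tool` handler returns the `{ content: [{ type: "text", text }] }` shape seen in the code. As a rough illustration of the traffic behind one `scrape_url` call (the id, URL, and text are made-up placeholders, not captured output):

```typescript
// Illustrative only: approximate JSON-RPC frames for a single scrape_url call,
// following the MCP tools/call convention. All values below are placeholders.
const request = {
  jsonrpc: "2.0",
  id: 1,
  method: "tools/call",
  params: {
    name: "scrape_url",
    arguments: { url: "https://example.com/docs" },
  },
};

const response = {
  jsonrpc: "2.0",
  id: 1,
  result: {
    // Matches the { content: [{ type: "text", text }] } shape returned
    // by the tool handlers listed above.
    content: [{ type: "text", text: "# Example Docs\n\n> excerpt...\n\n---\n..." }],
  },
};
```
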
package/dist/index.js.map
ADDED
@@ -0,0 +1 @@
{"version":3,"sources":["../src/index.ts","../src/scraper.ts"],"sourcesContent":["import { McpServer } from \"@modelcontextprotocol/sdk/server/mcp.js\";\nimport { StdioServerTransport } from \"@modelcontextprotocol/sdk/server/stdio.js\";\nimport { z } from \"zod\";\nimport {\n scrapeUrl,\n extractLinks,\n extractMetadata,\n searchPage,\n scrapeMultiple,\n} from \"./scraper.js\";\n\nconst server = new McpServer({\n name: \"mcp-server-scraper\",\n version: \"1.0.0\",\n});\n\nserver.tool(\n \"scrape_url\",\n \"Extract clean, readable text content from a URL using Mozilla Readability. Returns title, excerpt, and main content. Best for articles, docs, and blog posts.\",\n {\n url: z.string().url().describe(\"The URL to scrape\"),\n },\n async ({ url }) => {\n const content = await scrapeUrl(url);\n const text = [\n `# ${content.title}`,\n content.byline ? `*${content.byline}*` : null,\n content.siteName ? `*${content.siteName}*` : null,\n \"\",\n content.excerpt ? `> ${content.excerpt}` : null,\n \"\",\n \"---\",\n \"\",\n content.content,\n \"\",\n `_(${content.length} characters)_`,\n ]\n .filter(Boolean)\n .join(\"\\n\");\n\n return { content: [{ type: \"text\", text }] };\n },\n);\n\nserver.tool(\n \"extract_links\",\n \"Extract all links from a page with their href and anchor text. Resolves relative URLs. Skips anchors and javascript: links.\",\n {\n url: z.string().url().describe(\"The URL to extract links from\"),\n },\n async ({ url }) => {\n const links = await extractLinks(url);\n const text =\n links.length === 0\n ? \"No links found.\"\n : links\n .map((l, i) => `${i + 1}. [${l.text || l.href}](${l.href})`)\n .join(\"\\n\");\n\n return { content: [{ type: \"text\", text }] };\n },\n);\n\nserver.tool(\n \"extract_metadata\",\n \"Extract page metadata: title, description, Open Graph tags (og:title, og:description, og:image), canonical URL, and favicon.\",\n {\n url: z.string().url().describe(\"The URL to extract metadata from\"),\n },\n async ({ url }) => {\n const meta = await extractMetadata(url);\n const lines = [\n `**Title:** ${meta.title || \"(none)\"}`,\n `**Description:** ${meta.description || \"(none)\"}`,\n `**og:title:** ${meta.ogTitle || \"(none)\"}`,\n `**og:description:** ${meta.ogDescription || \"(none)\"}`,\n `**og:image:** ${meta.ogImage || \"(none)\"}`,\n `**Canonical:** ${meta.canonical || \"(none)\"}`,\n `**Favicon:** ${meta.favicon || \"(none)\"}`,\n ];\n return { content: [{ type: \"text\", text: lines.join(\"\\n\") }] };\n },\n);\n\nserver.tool(\n \"search_page\",\n \"Search for a query string within the page text. Returns matching lines (one per line). Use for finding mentions of a term.\",\n {\n url: z.string().url().describe(\"The URL to search\"),\n query: z.string().describe(\"The search query\"),\n },\n async ({ url, query }) => {\n const lines = await searchPage(url, query);\n const text =\n lines.length === 0\n ? `No lines containing \"${query}\" found.`\n : lines.map((l, i) => `${i + 1}. ${l}`).join(\"\\n\");\n\n return { content: [{ type: \"text\", text }] };\n },\n);\n\nserver.tool(\n \"scrape_multiple\",\n \"Batch scrape multiple URLs. Returns title and excerpt for each. Failures are reported per URL without failing the whole batch.\",\n {\n urls: z.array(z.string().url()).describe(\"Array of URLs to scrape\"),\n },\n async ({ urls }) => {\n const results = await scrapeMultiple(urls);\n const lines = results.map((r, i) => {\n if (r.error) {\n return `${i + 1}. **${r.url}** — Error: ${r.error}`;\n }\n return `${i + 1}. 
**${r.title || \"(no title)\"}** — ${r.excerpt || \"(no excerpt)\"}\\n ${r.url}`;\n });\n return { content: [{ type: \"text\", text: lines.join(\"\\n\\n\") }] };\n },\n);\n\nasync function main() {\n const transport = new StdioServerTransport();\n await server.connect(transport);\n}\n\nmain().catch((err) => {\n console.error(\"Fatal error:\", err);\n process.exit(1);\n});\n","import { Readability } from \"@mozilla/readability\";\nimport { parseHTML } from \"linkedom\";\n\nexport interface ScrapedContent {\n title: string;\n content: string;\n excerpt: string;\n byline: string;\n siteName: string;\n length: number;\n}\n\nexport interface PageMetadata {\n title: string;\n description: string;\n ogTitle: string;\n ogDescription: string;\n ogImage: string;\n canonical: string;\n favicon: string;\n}\n\nexport interface PageLink {\n href: string;\n text: string;\n}\n\nasync function fetchPage(url: string): Promise<string> {\n const response = await fetch(url, {\n headers: {\n \"User-Agent\": \"Mozilla/5.0 (compatible; mcp-server-scraper/1.0)\",\n Accept: \"text/html,application/xhtml+xml\",\n },\n redirect: \"follow\",\n });\n if (!response.ok) {\n throw new Error(`HTTP ${response.status}: ${response.statusText}`);\n }\n return response.text();\n}\n\nexport async function scrapeUrl(url: string): Promise<ScrapedContent> {\n const html = await fetchPage(url);\n const { document } = parseHTML(html);\n const reader = new Readability(document as unknown as Document);\n const article = reader.parse();\n if (!article) {\n throw new Error(\"Could not extract readable content from the page\");\n }\n const textContent = article.textContent?.trim() ?? \"\";\n return {\n title: article.title ?? \"\",\n content: textContent,\n excerpt: article.excerpt ?? \"\",\n byline: article.byline ?? \"\",\n siteName: article.siteName ?? \"\",\n length: article.length ?? textContent.length,\n };\n}\n\nexport async function extractLinks(url: string): Promise<PageLink[]> {\n const html = await fetchPage(url);\n const { document } = parseHTML(html);\n const anchors = Array.from(document.querySelectorAll(\"a[href]\"));\n const links: PageLink[] = [];\n const baseUrl = new URL(url);\n for (const a of anchors) {\n const href = a.getAttribute(\"href\");\n if (!href || href.startsWith(\"#\") || href.startsWith(\"javascript:\"))\n continue;\n try {\n const resolved = new URL(href, baseUrl).href;\n links.push({ href: resolved, text: (a.textContent ?? \"\").trim() });\n } catch {\n // skip invalid URLs\n }\n }\n return links;\n}\n\nexport async function extractMetadata(url: string): Promise<PageMetadata> {\n const html = await fetchPage(url);\n const { document } = parseHTML(html);\n\n const getMeta = (name: string): string => {\n const el = document.querySelector(\n `meta[property=\"${name}\"], meta[name=\"${name}\"]`,\n );\n return el?.getAttribute(\"content\") ?? \"\";\n };\n\n const title = document.querySelector(\"title\")?.textContent ?? \"\";\n const canonical =\n document.querySelector(\"link[rel='canonical']\")?.getAttribute(\"href\") ?? \"\";\n const favicon =\n document\n .querySelector(\"link[rel='icon'], link[rel='shortcut icon']\")\n ?.getAttribute(\"href\") ?? 
\"\";\n\n return {\n title,\n description: getMeta(\"description\"),\n ogTitle: getMeta(\"og:title\"),\n ogDescription: getMeta(\"og:description\"),\n ogImage: getMeta(\"og:image\"),\n canonical,\n favicon,\n };\n}\n\nexport async function searchPage(\n url: string,\n query: string,\n): Promise<string[]> {\n const html = await fetchPage(url);\n const { document } = parseHTML(html);\n const text = document.body?.textContent ?? \"\";\n const lines = text\n .split(\"\\n\")\n .map((l: string) => l.trim())\n .filter(Boolean);\n const queryLower = query.toLowerCase();\n return lines.filter((line: string) =>\n line.toLowerCase().includes(queryLower),\n );\n}\n\nexport async function scrapeMultiple(\n urls: string[],\n): Promise<{ url: string; title: string; excerpt: string; error?: string }[]> {\n const results = await Promise.allSettled(\n urls.map(async (url) => {\n const content = await scrapeUrl(url);\n return {\n url,\n title: content.title,\n excerpt: content.excerpt,\n };\n }),\n );\n return results.map((r, i) => {\n if (r.status === \"fulfilled\") return r.value;\n return {\n url: urls[i] ?? \"\",\n title: \"\",\n excerpt: \"\",\n error: String((r.reason as Error).message),\n };\n });\n}\n"],"mappings":";;;AAAA,SAAS,iBAAiB;AAC1B,SAAS,4BAA4B;AACrC,SAAS,SAAS;;;ACFlB,SAAS,mBAAmB;AAC5B,SAAS,iBAAiB;AA0B1B,eAAe,UAAU,KAA8B;AACrD,QAAM,WAAW,MAAM,MAAM,KAAK;AAAA,IAChC,SAAS;AAAA,MACP,cAAc;AAAA,MACd,QAAQ;AAAA,IACV;AAAA,IACA,UAAU;AAAA,EACZ,CAAC;AACD,MAAI,CAAC,SAAS,IAAI;AAChB,UAAM,IAAI,MAAM,QAAQ,SAAS,MAAM,KAAK,SAAS,UAAU,EAAE;AAAA,EACnE;AACA,SAAO,SAAS,KAAK;AACvB;AAEA,eAAsB,UAAU,KAAsC;AACpE,QAAM,OAAO,MAAM,UAAU,GAAG;AAChC,QAAM,EAAE,SAAS,IAAI,UAAU,IAAI;AACnC,QAAM,SAAS,IAAI,YAAY,QAA+B;AAC9D,QAAM,UAAU,OAAO,MAAM;AAC7B,MAAI,CAAC,SAAS;AACZ,UAAM,IAAI,MAAM,kDAAkD;AAAA,EACpE;AACA,QAAM,cAAc,QAAQ,aAAa,KAAK,KAAK;AACnD,SAAO;AAAA,IACL,OAAO,QAAQ,SAAS;AAAA,IACxB,SAAS;AAAA,IACT,SAAS,QAAQ,WAAW;AAAA,IAC5B,QAAQ,QAAQ,UAAU;AAAA,IAC1B,UAAU,QAAQ,YAAY;AAAA,IAC9B,QAAQ,QAAQ,UAAU,YAAY;AAAA,EACxC;AACF;AAEA,eAAsB,aAAa,KAAkC;AACnE,QAAM,OAAO,MAAM,UAAU,GAAG;AAChC,QAAM,EAAE,SAAS,IAAI,UAAU,IAAI;AACnC,QAAM,UAAU,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC;AAC/D,QAAM,QAAoB,CAAC;AAC3B,QAAM,UAAU,IAAI,IAAI,GAAG;AAC3B,aAAW,KAAK,SAAS;AACvB,UAAM,OAAO,EAAE,aAAa,MAAM;AAClC,QAAI,CAAC,QAAQ,KAAK,WAAW,GAAG,KAAK,KAAK,WAAW,aAAa;AAChE;AACF,QAAI;AACF,YAAM,WAAW,IAAI,IAAI,MAAM,OAAO,EAAE;AACxC,YAAM,KAAK,EAAE,MAAM,UAAU,OAAO,EAAE,eAAe,IAAI,KAAK,EAAE,CAAC;AAAA,IACnE,QAAQ;AAAA,IAER;AAAA,EACF;AACA,SAAO;AACT;AAEA,eAAsB,gBAAgB,KAAoC;AACxE,QAAM,OAAO,MAAM,UAAU,GAAG;AAChC,QAAM,EAAE,SAAS,IAAI,UAAU,IAAI;AAEnC,QAAM,UAAU,CAAC,SAAyB;AACxC,UAAM,KAAK,SAAS;AAAA,MAClB,kBAAkB,IAAI,kBAAkB,IAAI;AAAA,IAC9C;AACA,WAAO,IAAI,aAAa,SAAS,KAAK;AAAA,EACxC;AAEA,QAAM,QAAQ,SAAS,cAAc,OAAO,GAAG,eAAe;AAC9D,QAAM,YACJ,SAAS,cAAc,uBAAuB,GAAG,aAAa,MAAM,KAAK;AAC3E,QAAM,UACJ,SACG,cAAc,6CAA6C,GAC1D,aAAa,MAAM,KAAK;AAE9B,SAAO;AAAA,IACL;AAAA,IACA,aAAa,QAAQ,aAAa;AAAA,IAClC,SAAS,QAAQ,UAAU;AAAA,IAC3B,eAAe,QAAQ,gBAAgB;AAAA,IACvC,SAAS,QAAQ,UAAU;AAAA,IAC3B;AAAA,IACA;AAAA,EACF;AACF;AAEA,eAAsB,WACpB,KACA,OACmB;AACnB,QAAM,OAAO,MAAM,UAAU,GAAG;AAChC,QAAM,EAAE,SAAS,IAAI,UAAU,IAAI;AACnC,QAAM,OAAO,SAAS,MAAM,eAAe;AAC3C,QAAM,QAAQ,KACX,MAAM,IAAI,EACV,IAAI,CAAC,MAAc,EAAE,KAAK,CAAC,EAC3B,OAAO,OAAO;AACjB,QAAM,aAAa,MAAM,YAAY;AACrC,SAAO,MAAM;AAAA,IAAO,CAAC,SACnB,KAAK,YAAY,EAAE,SAAS,UAAU;AAAA,EACxC;AACF;AAEA,eAAsB,eACpB,MAC4E;AAC5E,QAAM,UAAU,MAAM,QAAQ;AAAA,IAC5B,KAAK,IAAI,OAAO,QAAQ;AACtB,YAAM,UAAU,MAAM,UAAU,GAAG;AACnC,aAAO;AAAA,QACL;AAAA,QACA,OAAO,QAAQ;AAAA,QACf,SAAS,QAAQ;AAAA,MACnB;AAAA,IACF,CAAC;AAAA,EACH;AACA,SAAO,QAAQ,IAAI,CAAC,GAAG,MAAM;AAC
3B,QAAI,EAAE,WAAW,YAAa,QAAO,EAAE;AACvC,WAAO;AAAA,MACL,KAAK,KAAK,CAAC,KAAK;AAAA,MAChB,OAAO;AAAA,MACP,SAAS;AAAA,MACT,OAAO,OAAQ,EAAE,OAAiB,OAAO;AAAA,IAC3C;AAAA,EACF,CAAC;AACH;;;AD1IA,IAAM,SAAS,IAAI,UAAU;AAAA,EAC3B,MAAM;AAAA,EACN,SAAS;AACX,CAAC;AAED,OAAO;AAAA,EACL;AAAA,EACA;AAAA,EACA;AAAA,IACE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,mBAAmB;AAAA,EACpD;AAAA,EACA,OAAO,EAAE,IAAI,MAAM;AACjB,UAAM,UAAU,MAAM,UAAU,GAAG;AACnC,UAAM,OAAO;AAAA,MACX,KAAK,QAAQ,KAAK;AAAA,MAClB,QAAQ,SAAS,IAAI,QAAQ,MAAM,MAAM;AAAA,MACzC,QAAQ,WAAW,IAAI,QAAQ,QAAQ,MAAM;AAAA,MAC7C;AAAA,MACA,QAAQ,UAAU,KAAK,QAAQ,OAAO,KAAK;AAAA,MAC3C;AAAA,MACA;AAAA,MACA;AAAA,MACA,QAAQ;AAAA,MACR;AAAA,MACA,KAAK,QAAQ,MAAM;AAAA,IACrB,EACG,OAAO,OAAO,EACd,KAAK,IAAI;AAEZ,WAAO,EAAE,SAAS,CAAC,EAAE,MAAM,QAAQ,KAAK,CAAC,EAAE;AAAA,EAC7C;AACF;AAEA,OAAO;AAAA,EACL;AAAA,EACA;AAAA,EACA;AAAA,IACE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,+BAA+B;AAAA,EAChE;AAAA,EACA,OAAO,EAAE,IAAI,MAAM;AACjB,UAAM,QAAQ,MAAM,aAAa,GAAG;AACpC,UAAM,OACJ,MAAM,WAAW,IACb,oBACA,MACG,IAAI,CAAC,GAAG,MAAM,GAAG,IAAI,CAAC,MAAM,EAAE,QAAQ,EAAE,IAAI,KAAK,EAAE,IAAI,GAAG,EAC1D,KAAK,IAAI;AAElB,WAAO,EAAE,SAAS,CAAC,EAAE,MAAM,QAAQ,KAAK,CAAC,EAAE;AAAA,EAC7C;AACF;AAEA,OAAO;AAAA,EACL;AAAA,EACA;AAAA,EACA;AAAA,IACE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,kCAAkC;AAAA,EACnE;AAAA,EACA,OAAO,EAAE,IAAI,MAAM;AACjB,UAAM,OAAO,MAAM,gBAAgB,GAAG;AACtC,UAAM,QAAQ;AAAA,MACZ,cAAc,KAAK,SAAS,QAAQ;AAAA,MACpC,oBAAoB,KAAK,eAAe,QAAQ;AAAA,MAChD,iBAAiB,KAAK,WAAW,QAAQ;AAAA,MACzC,uBAAuB,KAAK,iBAAiB,QAAQ;AAAA,MACrD,iBAAiB,KAAK,WAAW,QAAQ;AAAA,MACzC,kBAAkB,KAAK,aAAa,QAAQ;AAAA,MAC5C,gBAAgB,KAAK,WAAW,QAAQ;AAAA,IAC1C;AACA,WAAO,EAAE,SAAS,CAAC,EAAE,MAAM,QAAQ,MAAM,MAAM,KAAK,IAAI,EAAE,CAAC,EAAE;AAAA,EAC/D;AACF;AAEA,OAAO;AAAA,EACL;AAAA,EACA;AAAA,EACA;AAAA,IACE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,mBAAmB;AAAA,IAClD,OAAO,EAAE,OAAO,EAAE,SAAS,kBAAkB;AAAA,EAC/C;AAAA,EACA,OAAO,EAAE,KAAK,MAAM,MAAM;AACxB,UAAM,QAAQ,MAAM,WAAW,KAAK,KAAK;AACzC,UAAM,OACJ,MAAM,WAAW,IACb,wBAAwB,KAAK,aAC7B,MAAM,IAAI,CAAC,GAAG,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI;AAErD,WAAO,EAAE,SAAS,CAAC,EAAE,MAAM,QAAQ,KAAK,CAAC,EAAE;AAAA,EAC7C;AACF;AAEA,OAAO;AAAA,EACL;AAAA,EACA;AAAA,EACA;AAAA,IACE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,SAAS,yBAAyB;AAAA,EACpE;AAAA,EACA,OAAO,EAAE,KAAK,MAAM;AAClB,UAAM,UAAU,MAAM,eAAe,IAAI;AACzC,UAAM,QAAQ,QAAQ,IAAI,CAAC,GAAG,MAAM;AAClC,UAAI,EAAE,OAAO;AACX,eAAO,GAAG,IAAI,CAAC,OAAO,EAAE,GAAG,oBAAe,EAAE,KAAK;AAAA,MACnD;AACA,aAAO,GAAG,IAAI,CAAC,OAAO,EAAE,SAAS,YAAY,aAAQ,EAAE,WAAW,cAAc;AAAA,KAAQ,EAAE,GAAG;AAAA,IAC/F,CAAC;AACD,WAAO,EAAE,SAAS,CAAC,EAAE,MAAM,QAAQ,MAAM,MAAM,KAAK,MAAM,EAAE,CAAC,EAAE;AAAA,EACjE;AACF;AAEA,eAAe,OAAO;AACpB,QAAM,YAAY,IAAI,qBAAqB;AAC3C,QAAM,OAAO,QAAQ,SAAS;AAChC;AAEA,KAAK,EAAE,MAAM,CAAC,QAAQ;AACpB,UAAQ,MAAM,gBAAgB,GAAG;AACjC,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
package/package.json
ADDED
@@ -0,0 +1,70 @@
{
  "name": "mcp-server-scraper",
  "version": "1.0.0",
  "description": "MCP server for web scraping — extract clean markdown, links, and metadata from any URL. Zero auth. Free alternative to Firecrawl.",
  "type": "module",
  "bin": {
    "mcp-server-scraper": "./dist/index.js"
  },
  "files": [
    "dist"
  ],
  "scripts": {
    "build": "tsup",
    "typecheck": "tsc --noEmit",
    "test": "vitest run",
    "test:watch": "vitest",
    "test:coverage": "vitest run --coverage",
    "lint": "eslint . && prettier --check .",
    "format": "prettier --write .",
    "prepare": "husky"
  },
  "keywords": [
    "mcp",
    "mcp-server",
    "model-context-protocol",
    "web-scraping",
    "scraper",
    "markdown",
    "readability",
    "fetch",
    "content-extraction",
    "ai",
    "llm",
    "claude",
    "cursor"
  ],
  "author": "Ofer Shapira",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "https://github.com/ofershap/mcp-server-scraper.git"
  },
  "bugs": {
    "url": "https://github.com/ofershap/mcp-server-scraper/issues"
  },
  "homepage": "https://github.com/ofershap/mcp-server-scraper#readme",
  "dependencies": {
    "@modelcontextprotocol/sdk": "^1.26.0",
    "@mozilla/readability": "^0.5.0",
    "linkedom": "^0.18.0",
    "zod": "^3.25.0"
  },
  "devDependencies": {
    "@eslint/js": "^9.0.0",
    "@types/node": "^22.0.0",
    "eslint": "^9.0.0",
    "eslint-config-prettier": "^10.0.0",
    "husky": "^9.0.0",
    "lint-staged": "^15.0.0",
    "prettier": "^3.0.0",
    "tsup": "^8.0.0",
    "typescript": "^5.7.0",
    "typescript-eslint": "^8.0.0",
    "vitest": "^3.2.0"
  },
  "lint-staged": {
    "*.{ts,tsx,js}": "eslint --fix",
    "*.{json,md,yml,yaml}": "prettier --write"
  }
}

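Editor's note: the published files stop at `dist`, so the build configuration itself is not part of this diff. Given the `"build": "tsup"` script, the ESM output, the shebang banner, and the single bundled `dist/index.js` with a source map, a tsup config along these lines would be one plausible way to produce that output. It is a hypothetical reconstruction, not a file shipped in the package.

```typescript
// Hypothetical tsup.config.ts — not included in the published package;
// sketched from the dist output (ESM bundle, shebang, source map).
import { defineConfig } from "tsup";

export default defineConfig({
  entry: ["src/index.ts"],   // src/scraper.ts is pulled in via its import
  format: ["esm"],           // package.json declares "type": "module"
  sourcemap: true,           // dist/index.js.map is published alongside
  clean: true,
  banner: { js: "#!/usr/bin/env node" }, // makes the bin entry executable
});
```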