crawlforge-mcp-server 4.6.0 → 4.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -49
- package/package.json +16 -3
- package/src/core/AgentOrchestrator.js +3 -1
package/README.md
CHANGED
|
@@ -1,19 +1,63 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
16
|
-
-
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="assets/banner.svg" alt="CrawlForge MCP Server" width="640">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<b>26 web scraping, crawling, deep-research & autonomous-extraction tools for Claude, Cursor & any MCP client.</b><br>
|
|
7
|
+
Clean Markdown & structured JSON from any site. Get started with <b>1,000 free credits</b> — no credit card required.
|
|
8
|
+
</p>
|
|
9
|
+
|
|
10
|
+
<p align="center">
|
|
11
|
+
<a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License: MIT"></a>
|
|
12
|
+
<a href="https://nodejs.org/"><img src="https://img.shields.io/badge/node-%3E%3D18.0.0-brightgreen" alt="Node.js Version"></a>
|
|
13
|
+
<a href="https://modelcontextprotocol.io/"><img src="https://img.shields.io/badge/MCP-Compatible-blue" alt="MCP Protocol"></a>
|
|
14
|
+
<a href="https://www.npmjs.com/package/crawlforge-mcp-server"><img src="https://img.shields.io/npm/v/crawlforge-mcp-server.svg" alt="npm version"></a>
|
|
15
|
+
<a href="https://www.npmjs.com/package/crawlforge-mcp-server"><img src="https://img.shields.io/npm/dm/crawlforge-mcp-server.svg" alt="npm downloads"></a>
|
|
16
|
+
<a href="https://github.com/mysleekdesigns/crawlforge-mcp/stargazers"><img src="https://img.shields.io/github/stars/mysleekdesigns/crawlforge-mcp?style=social" alt="GitHub stars"></a>
|
|
17
|
+
</p>
|
|
18
|
+
|
|
19
|
+
<p align="center">
|
|
20
|
+
⭐ <b><a href="https://github.com/mysleekdesigns/crawlforge-mcp">Star us on GitHub</a></b> to follow along — it genuinely helps others discover the project.
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
## Table of Contents
|
|
24
|
+
|
|
25
|
+
- [Why CrawlForge?](#-why-crawlforge)
|
|
26
|
+
- [CrawlForge vs. alternatives](#-crawlforge-vs-alternatives)
|
|
27
|
+
- [Quick Start (2 Minutes)](#-quick-start-2-minutes)
|
|
28
|
+
- [Available Tools](#-available-tools)
|
|
29
|
+
- [Pricing](#-pricing)
|
|
30
|
+
- [Advanced Configuration](#-advanced-configuration)
|
|
31
|
+
- [Usage Examples](#-usage-examples)
|
|
32
|
+
- [Security & Privacy](#-security--privacy)
|
|
33
|
+
- [Support](#-support)
|
|
34
|
+
- [Contributing](#-contributing)
|
|
35
|
+
|
|
36
|
+
## 🎯 Why CrawlForge?
|
|
37
|
+
|
|
38
|
+
- **26 MCP-native tools** — scraping, crawling, search, deep research, an autonomous `agent`, a unified multi-format `scrape`, document processing, stealth browsing, and more, callable directly from your AI assistant.
|
|
39
|
+
- **Generous free tier** — 1,000 credits to start instantly, no credit card. Credits never expire and roll over month-to-month.
|
|
40
|
+
- **Local-LLM by default** — `extract_with_llm` runs against a local **Ollama** model out of the box: no LLM API key, no per-token cost, and your data never leaves your machine. Cloud (OpenAI/Anthropic) is opt-in.
|
|
41
|
+
- **LLM-ready output** — clean Markdown, structured JSON (schema-driven), screenshots, links, and metadata from a single fetch.
|
|
42
|
+
- **Autonomous `agent`** — describe what you need in natural language; it plans, gathers, and shapes an answer under orchestrator-enforced hard stops (max steps/URLs/wall-clock) — no URLs required.
|
|
43
|
+
- **Security-hardened** — SSRF protection on every request, a fail-closed backend allow-list, a vetted action allowlist for browser automation, and per-tool credit gating.
|
|
44
|
+
- **Works everywhere MCP does** — Claude Desktop, Claude Code, Cursor, and any other MCP-enabled client, configured in one command.
|
|
45
|
+
|
|
46
|
+
## 📊 CrawlForge vs. alternatives
|
|
47
|
+
|
|
48
|
+
| | **CrawlForge MCP** | Firecrawl | Raw scraping API |
|
|
49
|
+
|---|:---:|:---:|:---:|
|
|
50
|
+
| Native MCP server | ✅ 26 tools | ✅ | ❌ |
|
|
51
|
+
| Free tier | ✅ 1,000 credits, rollover | Limited | Varies |
|
|
52
|
+
| Self-hosted / local LLM extraction (Ollama) | ✅ default, $0/token | ❌ | ❌ |
|
|
53
|
+
| Autonomous agent (no URLs needed) | ✅ `agent` | ✅ | ❌ |
|
|
54
|
+
| Deep research with source verification | ✅ `deep_research` | Partial | ❌ |
|
|
55
|
+
| Browser automation / actions | ✅ `scrape_with_actions` | ✅ | Varies |
|
|
56
|
+
| Stealth / anti-detection engines | ✅ Chromium + Camoufox | ✅ | Add-on |
|
|
57
|
+
| Pre-built site templates | ✅ 10 sites | ❌ | ❌ |
|
|
58
|
+
| License | MIT | AGPL-3.0 | Proprietary |
|
|
59
|
+
|
|
60
|
+
> Comparison reflects publicly documented capabilities at time of writing. CrawlForge is MIT-licensed and MCP-first — built to plug straight into AI coding assistants.
|
|
17
61
|
|
|
18
62
|
## 🚀 Quick Start (2 Minutes)
|
|
19
63
|
|
|
@@ -104,42 +148,56 @@ Restart Cursor to activate.
|
|
|
104
148
|
|
|
105
149
|
## 📊 Available Tools
|
|
106
150
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
151
|
+
**Basic Tools** (1 credit each)
|
|
152
|
+
|
|
153
|
+
| Tool | What it does |
|
|
154
|
+
|------|--------------|
|
|
155
|
+
| `fetch_url` | Fetch content from any URL |
|
|
156
|
+
| `extract_text` | Extract clean text from web pages |
|
|
157
|
+
| `extract_links` | Get all links from a page |
|
|
158
|
+
| `extract_metadata` | Extract page metadata (title, OG tags, schema.org) |
|
|
159
|
+
| `scrape_template` | Structured data from well-known sites (Amazon, GitHub, LinkedIn, YouTube, Reddit, Hacker News, npm, and more) without writing selectors |
|
|
160
|
+
|
|
161
|
+
**Advanced Tools** (2–3 credits)
|
|
162
|
+
|
|
163
|
+
| Tool | What it does |
|
|
164
|
+
|------|--------------|
|
|
165
|
+
| `scrape` | **Unified single-fetch, multi-format extraction.** Pass a `formats` array (markdown/html/rawHtml/text/links/metadata/screenshot/json-schema) plus `onlyMainContent`; one fetch serves every requested format with per-format partial-success warnings |
|
|
166
|
+
| `scrape_structured` | Extract structured data with CSS selectors |
|
|
167
|
+
| `search_web` | Search the web using the Google Search API |
|
|
168
|
+
| `summarize_content` | Generate intelligent summaries |
|
|
169
|
+
| `analyze_content` | Comprehensive content analysis |
|
|
170
|
+
| `extract_structured` | LLM-powered schema-driven extraction |
|
|
171
|
+
| `extract_with_llm` | Natural-language extraction. **Defaults to a local Ollama model — no API key, no API costs.** Pass `provider: "openai" \| "anthropic"` with the matching key for cloud models |
|
|
172
|
+
| `list_ollama_models` | List the Ollama models installed locally (free; helps you pick a `model` for `extract_with_llm`) |
|
|
173
|
+
| `track_changes` | Monitor content changes over time |
|
|
174
|
+
| `get_batch_results` | Retrieve paginated results for a `batch_scrape` job by `batchId` |
|
|
175
|
+
|
|
176
|
+
**Premium Tools** (5–10 credits)
|
|
177
|
+
|
|
178
|
+
| Tool | What it does |
|
|
179
|
+
|------|--------------|
|
|
180
|
+
| `agent` | **Autonomous research/extraction from a natural-language prompt — no URLs required.** Plans, gathers, and shapes an answer under hard safety stops (max steps/URLs/wall-clock enforced by the orchestrator, never the LLM) |
|
|
181
|
+
| `crawl_deep` | Deep crawl entire websites |
|
|
182
|
+
| `map_site` | Discover and map website structure (optional `search=` ranks the discovered URLs) |
|
|
183
|
+
| `batch_scrape` | Process multiple URLs simultaneously |
|
|
184
|
+
| `deep_research` | Multi-stage research with source verification |
|
|
185
|
+
| `stealth_mode` | Anti-detection browser management |
|
|
186
|
+
|
|
187
|
+
**Heavy Processing** (3–10 credits)
|
|
188
|
+
|
|
189
|
+
| Tool | What it does |
|
|
190
|
+
|------|--------------|
|
|
191
|
+
| `process_document` | Multi-format document processing |
|
|
192
|
+
| `extract_content` | Enhanced content extraction |
|
|
193
|
+
| `scrape_with_actions` | Browser automation chains |
|
|
194
|
+
| `generate_llms_txt` | Generate AI interaction guidelines |
|
|
195
|
+
| `localization` | Multi-language and geo-location management |
|
|
140
196
|
|
|
141
197
|
For the full canonical capabilities reference (all tools, CLI commands, stealth engines, research workflow), see [SKILL.md](SKILL.md).
|
|
142
198
|
|
|
199
|
+
<p align="right"><a href="#table-of-contents">↑ Back to top</a></p>
|
|
200
|
+
|
|
143
201
|
## 💳 Pricing
|
|
144
202
|
|
|
145
203
|
| Plan | Credits/Month | Best For |
|
|
@@ -241,6 +299,10 @@ See [docs/sandboxing-and-approvals.md](docs/sandboxing-and-approvals.md) for the
|
|
|
241
299
|
|
|
242
300
|
**v3.0.3 (2025-10-01)**: Removed authentication bypass vulnerability. All users must authenticate with valid API keys.
|
|
243
301
|
|
|
302
|
+
For the full security policy and how to report a vulnerability, see [SECURITY.md](SECURITY.md).
|
|
303
|
+
|
|
304
|
+
<p align="right"><a href="#table-of-contents">↑ Back to top</a></p>
|
|
305
|
+
|
|
244
306
|
## 🆘 Support
|
|
245
307
|
|
|
246
308
|
- **Documentation**: [https://www.crawlforge.dev/docs](https://www.crawlforge.dev/docs)
|
package/package.json
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "4.6.0",
|
|
4
|
-
"
|
|
3
|
+
"version": "4.6.2",
|
|
4
|
+
"mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
|
|
5
|
+
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
|
|
5
6
|
"main": "server.js",
|
|
6
7
|
"bin": {
|
|
7
8
|
"crawlforge": "src/cli/index.js",
|
|
@@ -50,7 +51,19 @@
|
|
|
50
51
|
"llms-txt",
|
|
51
52
|
"llms-txt-generator",
|
|
52
53
|
"ai-compliance",
|
|
53
|
-
"website-analysis"
|
|
54
|
+
"website-analysis",
|
|
55
|
+
"mcp-server",
|
|
56
|
+
"claude",
|
|
57
|
+
"cursor",
|
|
58
|
+
"ollama",
|
|
59
|
+
"ai-agents",
|
|
60
|
+
"deep-research",
|
|
61
|
+
"stealth-browser",
|
|
62
|
+
"html-to-markdown",
|
|
63
|
+
"llm",
|
|
64
|
+
"crawl",
|
|
65
|
+
"batch-scrape",
|
|
66
|
+
"screenshot"
|
|
54
67
|
],
|
|
55
68
|
"author": {
|
|
56
69
|
"name": "Simon Lacey",
|
|
package/src/core/AgentOrchestrator.js
CHANGED
|
@@ -182,7 +182,9 @@ export class AgentOrchestrator {
|
|
|
182
182
|
if (deadline()) break;
|
|
183
183
|
try {
|
|
184
184
|
const sr = await searchTool.execute({ query: q, limit: Math.ceil(capUrls / searchQueries.length) });
|
|
185
|
-
|
|
185
|
+
// SearchWebTool.execute() returns the raw results object; the MCP content-wrapped
|
|
186
|
+
// shape only appears if a caller (e.g. server.js) wraps it. Handle both.
|
|
187
|
+
const parsed = sr?.content?.[0]?.text ? JSON.parse(sr.content[0].text) : sr;
|
|
186
188
|
if (parsed?.results) {
|
|
187
189
|
for (const r of parsed.results) {
|
|
188
190
|
if (r.link && !urlQueue.includes(r.link)) urlQueue.push(r.link);
|