crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
package/CLAUDE.md
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
CrawlForge MCP Server - A professional MCP (Model Context Protocol) server implementation providing 16 comprehensive web scraping, crawling, and content processing tools. Version 3.0 includes advanced content extraction, document processing, summarization, and analysis capabilities. Wave 2 adds asynchronous batch processing and browser automation features. Wave 3 introduces deep research orchestration, stealth scraping, localization, and change tracking.
|
|
8
|
+
|
|
9
|
+
## Development Commands
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Install dependencies
|
|
13
|
+
npm install
|
|
14
|
+
|
|
15
|
+
# Copy and configure environment
|
|
16
|
+
cp .env.example .env
|
|
17
|
+
# Edit .env to add Google API credentials if using Google search
|
|
18
|
+
|
|
19
|
+
# Run the server
|
|
20
|
+
npm start
|
|
21
|
+
|
|
22
|
+
# Test MCP protocol compliance
|
|
23
|
+
# NOTE: test-server.js doesn't exist, use integration tests instead
|
|
24
|
+
npm run test:integration # Integration tests including MCP compliance
|
|
25
|
+
|
|
26
|
+
# Lint checks (no linter configured yet, placeholder)
|
|
27
|
+
npm run lint
|
|
28
|
+
|
|
29
|
+
# Performance tests
|
|
30
|
+
npm run test:performance # Full performance test suite
|
|
31
|
+
npm run test:performance:quick # Quick performance tests
|
|
32
|
+
npm run test:load # Load testing
|
|
33
|
+
npm run test:memory # Memory usage tests
|
|
34
|
+
npm run test:benchmark # Component benchmarks
|
|
35
|
+
npm run test:integration # Integration tests
|
|
36
|
+
npm run test:security # Security test suite
|
|
37
|
+
npm run test:all # Run all tests
|
|
38
|
+
|
|
39
|
+
# Wave 2 Validation Tests
|
|
40
|
+
node tests/validation/test-wave2-runner.js # Test Wave 2 features
|
|
41
|
+
node tests/validation/test-batch-scrape.js # Test batch scraping
|
|
42
|
+
node tests/validation/test-scrape-with-actions.js # Test action scraping
|
|
43
|
+
node tests/integration/master-test-runner.js # Run master test suite
|
|
44
|
+
|
|
45
|
+
# Wave 3 Tests
|
|
46
|
+
npm run test:wave3 # Full Wave 3 validation
|
|
47
|
+
npm run test:wave3:quick # Quick Wave 3 tests
|
|
48
|
+
npm run test:wave3:verbose # Verbose Wave 3 output
|
|
49
|
+
npm run test:unit:wave3 # Wave 3 unit tests (Jest)
|
|
50
|
+
npm run test:integration:wave3 # Wave 3 integration tests
|
|
51
|
+
|
|
52
|
+
# Docker commands
|
|
53
|
+
npm run docker:build # Build Docker image
|
|
54
|
+
npm run docker:dev # Run development container
|
|
55
|
+
npm run docker:prod # Run production container
|
|
56
|
+
npm run docker:test # Run test container
|
|
57
|
+
npm run docker:perf # Run performance test container
|
|
58
|
+
|
|
59
|
+
# Security Testing (CI/CD Integration)
|
|
60
|
+
npm run test:security # Run comprehensive security test suite
|
|
61
|
+
npm audit # Check for dependency vulnerabilities
|
|
62
|
+
npm audit fix # Automatically fix vulnerabilities
|
|
63
|
+
npm outdated # Check for outdated packages
|
|
64
|
+
|
|
65
|
+
# Release management
|
|
66
|
+
npm run release:patch # Patch version bump
|
|
67
|
+
npm run release:minor # Minor version bump
|
|
68
|
+
npm run release:major # Major version bump
|
|
69
|
+
|
|
70
|
+
# Cleanup
|
|
71
|
+
npm run clean # Remove cache, logs, test results
|
|
72
|
+
|
|
73
|
+
# Running specific test files
|
|
74
|
+
node tests/unit/linkAnalyzer.test.js # Unit test for link analyzer
|
|
75
|
+
node tests/validation/wave3-validation.js # Wave 3 validation suite
|
|
76
|
+
node tests/security/security-test-suite.js # Security test suite
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## High-Level Architecture
|
|
80
|
+
|
|
81
|
+
### Core Infrastructure (`src/core/`)
|
|
82
|
+
- **PerformanceManager**: Centralized performance monitoring and optimization
|
|
83
|
+
- **JobManager**: Asynchronous job tracking and management for batch operations
|
|
84
|
+
- **WebhookDispatcher**: Event notification system for job completion callbacks
|
|
85
|
+
- **ActionExecutor**: Browser automation engine for complex interactions
|
|
86
|
+
- **ResearchOrchestrator**: Coordinates multi-stage research with query expansion and synthesis
|
|
87
|
+
- **StealthBrowserManager**: Manages stealth mode scraping with anti-detection features
|
|
88
|
+
- **LocalizationManager**: Handles multi-language content and localization
|
|
89
|
+
- **ChangeTracker**: Tracks and compares content changes over time
|
|
90
|
+
- **SnapshotManager**: Manages website snapshots and version history
|
|
91
|
+
|
|
92
|
+
### Tool Layer (`src/tools/`)
|
|
93
|
+
Tools are organized in subdirectories by category:
|
|
94
|
+
- `advanced/` - BatchScrapeTool, ScrapeWithActionsTool
|
|
95
|
+
- `crawl/` - crawlDeep, mapSite
|
|
96
|
+
- `extract/` - analyzeContent, extractContent, processDocument, summarizeContent
|
|
97
|
+
- `research/` - deepResearch
|
|
98
|
+
- `search/` - searchWeb and provider adapters (Google, DuckDuckGo)
|
|
99
|
+
- `tracking/` - trackChanges
|
|
100
|
+
|
|
101
|
+
### MCP Server Entry Point
|
|
102
|
+
The main server implementation is in `server.js` which:
|
|
103
|
+
1. Uses stdio transport for MCP protocol communication
|
|
104
|
+
2. Registers all 16 tools using `server.registerTool()` pattern
|
|
105
|
+
3. Each tool has inline Zod schema for parameter validation
|
|
106
|
+
4. Parameter extraction from `request.params?.arguments` structure
|
|
107
|
+
5. Response format uses `content` array with text objects
|
|
108
|
+
|
|
109
|
+
### Key Configuration
|
|
110
|
+
|
|
111
|
+
Critical environment variables:
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
# Search Provider (auto, google, duckduckgo)
|
|
115
|
+
SEARCH_PROVIDER=auto
|
|
116
|
+
|
|
117
|
+
# Google API (optional, only if using Google)
|
|
118
|
+
GOOGLE_API_KEY=your_key
|
|
119
|
+
GOOGLE_SEARCH_ENGINE_ID=your_id
|
|
120
|
+
|
|
121
|
+
# Performance Settings
|
|
122
|
+
MAX_WORKERS=10
|
|
123
|
+
QUEUE_CONCURRENCY=10
|
|
124
|
+
CACHE_TTL=3600000
|
|
125
|
+
RATE_LIMIT_REQUESTS_PER_SECOND=10
|
|
126
|
+
|
|
127
|
+
# Crawling Limits
|
|
128
|
+
MAX_CRAWL_DEPTH=5
|
|
129
|
+
MAX_PAGES_PER_CRAWL=100
|
|
130
|
+
RESPECT_ROBOTS_TXT=true
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Common Development Tasks
|
|
134
|
+
|
|
135
|
+
### Running a Single Test
|
|
136
|
+
```bash
|
|
137
|
+
# Run a specific test file
|
|
138
|
+
node tests/unit/linkAnalyzer.test.js
|
|
139
|
+
|
|
140
|
+
# Run a specific Wave test
|
|
141
|
+
node tests/validation/test-batch-scrape.js
|
|
142
|
+
|
|
143
|
+
# Run Wave 3 tests with verbose output
|
|
144
|
+
npm run test:wave3:verbose
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Testing Tool Integration
|
|
148
|
+
```bash
|
|
149
|
+
# Test MCP protocol compliance
|
|
150
|
+
npm test
|
|
151
|
+
|
|
152
|
+
# Test specific tool functionality
|
|
153
|
+
node tests/validation/test-batch-scrape.js
|
|
154
|
+
node tests/validation/test-scrape-with-actions.js
|
|
155
|
+
|
|
156
|
+
# Test research features
|
|
157
|
+
node tests/validation/wave3-validation.js
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Debugging Tips
|
|
161
|
+
- Server logs are written to console via Winston logger
|
|
162
|
+
- Set `NODE_ENV=development` for verbose logging
|
|
163
|
+
- Use `--expose-gc` flag for memory profiling tests
|
|
164
|
+
- Check `cache/` directory for cached responses
|
|
165
|
+
- Review `logs/` directory for application logs
|
|
166
|
+
|
|
167
|
+
## CI/CD Security Integration
|
|
168
|
+
|
|
169
|
+
### Automated Security Testing Pipeline
|
|
170
|
+
|
|
171
|
+
The project includes comprehensive security testing integrated into the CI/CD pipeline:
|
|
172
|
+
|
|
173
|
+
#### Main CI Pipeline (`.github/workflows/ci.yml`)
|
|
174
|
+
The CI pipeline runs on every PR and push to main/develop branches and includes:
|
|
175
|
+
|
|
176
|
+
**Security Test Suite:**
|
|
177
|
+
- SSRF Protection validation
|
|
178
|
+
- Input validation (XSS, SQL injection, command injection)
|
|
179
|
+
- Rate limiting functionality
|
|
180
|
+
- DoS protection measures
|
|
181
|
+
- Regex DoS vulnerability detection
|
|
182
|
+
|
|
183
|
+
**Dependency Security:**
|
|
184
|
+
- npm audit with JSON output and summary generation
|
|
185
|
+
- Vulnerability severity analysis (critical/high/moderate/low)
|
|
186
|
+
- License compliance checking
|
|
187
|
+
- Outdated package detection
|
|
188
|
+
|
|
189
|
+
**Static Code Analysis:**
|
|
190
|
+
- CodeQL security analysis with extended queries
|
|
191
|
+
- ESLint security rules for dangerous patterns
|
|
192
|
+
- Hardcoded secret detection
|
|
193
|
+
- Security file scanning
|
|
194
|
+
|
|
195
|
+
**Reporting & Artifacts:**
|
|
196
|
+
- Comprehensive security reports generated
|
|
197
|
+
- PR comments with security summaries
|
|
198
|
+
- Artifact upload for detailed analysis
|
|
199
|
+
- Build failure on critical vulnerabilities
|
|
200
|
+
|
|
201
|
+
#### Dedicated Security Workflow (`.github/workflows/security.yml`)
|
|
202
|
+
Daily scheduled comprehensive security scanning:
|
|
203
|
+
|
|
204
|
+
**Dependency Security Scan:**
|
|
205
|
+
- Full vulnerability audit with configurable severity levels
|
|
206
|
+
- License compliance verification
|
|
207
|
+
- Detailed vulnerability reporting
|
|
208
|
+
|
|
209
|
+
**Static Code Analysis:**
|
|
210
|
+
- Extended CodeQL analysis with security-focused queries
|
|
211
|
+
- ESLint security plugin integration
|
|
212
|
+
- Pattern-based secret detection
|
|
213
|
+
|
|
214
|
+
**Container Security:**
|
|
215
|
+
- Trivy vulnerability scanning
|
|
216
|
+
- SARIF report generation
|
|
217
|
+
- Container base image analysis
|
|
218
|
+
|
|
219
|
+
**Automated Issue Creation:**
|
|
220
|
+
- GitHub issues created for critical vulnerabilities
|
|
221
|
+
- Detailed security reports with remediation steps
|
|
222
|
+
- Configurable severity thresholds
|
|
223
|
+
|
|
224
|
+
### Security Thresholds and Policies
|
|
225
|
+
|
|
226
|
+
**Build Failure Conditions:**
|
|
227
|
+
- Any critical severity vulnerabilities
|
|
228
|
+
- More than 3 high severity vulnerabilities
|
|
229
|
+
- Security test suite failures
|
|
230
|
+
|
|
231
|
+
**Automated Actions:**
|
|
232
|
+
- Daily security scans at 2 AM UTC
|
|
233
|
+
- PR blocking for security failures
|
|
234
|
+
- Automatic security issue creation
|
|
235
|
+
- Comprehensive artifact collection
|
|
236
|
+
|
|
237
|
+
### Running Security Tests Locally
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
# Run the complete security test suite
|
|
241
|
+
npm run test:security
|
|
242
|
+
|
|
243
|
+
# Check for dependency vulnerabilities
|
|
244
|
+
npm audit --audit-level moderate
|
|
245
|
+
|
|
246
|
+
# Fix automatically resolvable vulnerabilities
|
|
247
|
+
npm audit fix
|
|
248
|
+
|
|
249
|
+
# Generate security report manually
|
|
250
|
+
mkdir security-results
|
|
251
|
+
npm audit --json > security-results/audit.json
|
|
252
|
+
|
|
253
|
+
# Run specific security validation
|
|
254
|
+
node tests/security/security-test-suite.js
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### Security Artifacts and Reports
|
|
258
|
+
|
|
259
|
+
**Generated Reports:**
|
|
260
|
+
- `SECURITY-REPORT.md`: Comprehensive security assessment
|
|
261
|
+
- `npm-audit.json`: Detailed vulnerability data
|
|
262
|
+
- `security-tests.log`: Test execution logs
|
|
263
|
+
- `dependency-analysis.md`: Package security analysis
|
|
264
|
+
- `license-check.md`: License compliance report
|
|
265
|
+
|
|
266
|
+
**Artifact Retention:**
|
|
267
|
+
- CI security results: 30 days
|
|
268
|
+
- Comprehensive security reports: 90 days
|
|
269
|
+
- Critical vulnerability reports: Indefinite
|
|
270
|
+
|
|
271
|
+
### Manual Security Scan Triggers
|
|
272
|
+
|
|
273
|
+
The security workflow can be manually triggered with custom parameters:
|
|
274
|
+
|
|
275
|
+
```bash
|
|
276
|
+
# Via GitHub CLI
|
|
277
|
+
gh workflow run security.yml \
|
|
278
|
+
--field scan_type=all \
|
|
279
|
+
--field severity_threshold=moderate
|
|
280
|
+
|
|
281
|
+
# Via GitHub UI
|
|
282
|
+
# Go to Actions > Security Scanning > Run workflow
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
**Available Options:**
|
|
286
|
+
- `scan_type`: all, dependencies, code-analysis, container-scan
|
|
287
|
+
- `severity_threshold`: low, moderate, high, critical
|
|
288
|
+
|
|
289
|
+
### Security Integration Best Practices
|
|
290
|
+
|
|
291
|
+
**For Contributors:**
|
|
292
|
+
1. Always run `npm run test:security` before submitting PRs
|
|
293
|
+
2. Address any security warnings in your code
|
|
294
|
+
3. Keep dependencies updated with `npm audit fix`
|
|
295
|
+
4. Review security artifacts when CI fails
|
|
296
|
+
|
|
297
|
+
**For Maintainers:**
|
|
298
|
+
1. Review security reports weekly
|
|
299
|
+
2. Respond to automated security issues promptly
|
|
300
|
+
3. Keep security thresholds updated
|
|
301
|
+
4. Monitor trending vulnerabilities in dependencies
|
|
302
|
+
|
|
303
|
+
### Security Documentation
|
|
304
|
+
|
|
305
|
+
Comprehensive security documentation is available in:
|
|
306
|
+
- `.github/SECURITY.md` - Complete security policy and procedures
|
|
307
|
+
- Security workflow logs and artifacts
|
|
308
|
+
- Generated security reports in CI runs
|
|
309
|
+
|
|
310
|
+
The security integration ensures that:
|
|
311
|
+
- No critical vulnerabilities reach production
|
|
312
|
+
- Security issues are detected early in development
|
|
313
|
+
- Comprehensive audit trails are maintained
|
|
314
|
+
- Automated remediation guidance is provided
|
|
315
|
+
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 CrawlForge
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# CrawlForge MCP Server
|
|
2
|
+
|
|
3
|
+
Professional web scraping and content extraction server implementing the Model Context Protocol (MCP). Get started with **1,000 free credits** - no credit card required!
|
|
4
|
+
|
|
5
|
+
[](https://opensource.org/licenses/MIT)
|
|
6
|
+
[](https://nodejs.org/)
|
|
7
|
+
[](https://modelcontextprotocol.io/)
|
|
8
|
+
[](https://www.npmjs.com/package/crawlforge-mcp-server)
|
|
9
|
+
|
|
10
|
+
## 🎯 Features
|
|
11
|
+
|
|
12
|
+
- **18+ Advanced Tools**: Web scraping, deep research, stealth browsing, content analysis
|
|
13
|
+
- **Free Tier**: 1,000 credits to get started instantly
|
|
14
|
+
- **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
|
|
15
|
+
- **Enterprise Ready**: Scale up with paid plans for production use
|
|
16
|
+
- **Credit-Based**: Pay only for what you use
|
|
17
|
+
|
|
18
|
+
## 🚀 Quick Start (2 Minutes)
|
|
19
|
+
|
|
20
|
+
### 1. Install from NPM
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
npm install -g crawlforge-mcp-server
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### 2. Setup Your API Key
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
npx crawlforge-setup
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
This will:
|
|
33
|
+
- Guide you through getting your free API key
|
|
34
|
+
- Configure your credentials securely
|
|
35
|
+
- Verify your setup is working
|
|
36
|
+
|
|
37
|
+
**Don't have an API key?** Get one free at [https://crawlforge.com/signup](https://crawlforge.com/signup)
|
|
38
|
+
|
|
39
|
+
### 3. Configure Your IDE
|
|
40
|
+
|
|
41
|
+
<details>
|
|
42
|
+
<summary>🤖 For Claude Desktop</summary>
|
|
43
|
+
|
|
44
|
+
Add to `claude_desktop_config.json`:
|
|
45
|
+
```json
|
|
46
|
+
{
|
|
47
|
+
"mcpServers": {
|
|
48
|
+
"crawlforge": {
|
|
49
|
+
"command": "npx",
|
|
50
|
+
"args": ["crawlforge-mcp-server"]
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
**Location:**
|
|
57
|
+
- macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`
|
|
58
|
+
- Windows: `%APPDATA%/Claude/claude_desktop_config.json`
|
|
59
|
+
- Linux: `~/.config/Claude/claude_desktop_config.json`
|
|
60
|
+
|
|
61
|
+
Restart Claude Desktop to activate.
|
|
62
|
+
</details>
|
|
63
|
+
|
|
64
|
+
<details>
|
|
65
|
+
<summary>💻 For Cursor IDE</summary>
|
|
66
|
+
|
|
67
|
+
Add to `.cursorrules` in your project:
|
|
68
|
+
```yaml
|
|
69
|
+
mcp_servers:
|
|
70
|
+
crawlforge:
|
|
71
|
+
command: npx
|
|
72
|
+
args: ["crawlforge-mcp-server"]
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Or use the MCP plugin in Cursor settings.
|
|
76
|
+
</details>
|
|
77
|
+
|
|
78
|
+
## 📋 Available Tools
|
|
79
|
+
|
|
80
|
+
### Basic Tools (1 credit each)
|
|
81
|
+
- `fetch_url` - Fetch content from any URL
|
|
82
|
+
- `extract_text` - Extract clean text from web pages
|
|
83
|
+
- `extract_links` - Get all links from a page
|
|
84
|
+
- `extract_metadata` - Extract page metadata
|
|
85
|
+
|
|
86
|
+
### Advanced Tools (2-3 credits)
|
|
87
|
+
- `scrape_structured` - Extract structured data with CSS selectors
|
|
88
|
+
- `search_web` - Search the web with Google/DuckDuckGo
|
|
89
|
+
- `summarize_content` - Generate intelligent summaries
|
|
90
|
+
- `analyze_content` - Comprehensive content analysis
|
|
91
|
+
|
|
92
|
+
### Premium Tools (5-10 credits)
|
|
93
|
+
- `crawl_deep` - Deep crawl entire websites
|
|
94
|
+
- `map_site` - Discover and map website structure
|
|
95
|
+
- `batch_scrape` - Process multiple URLs simultaneously
|
|
96
|
+
- `deep_research` - Multi-stage research with source verification
|
|
97
|
+
- `stealth_mode` - Anti-detection browser management
|
|
98
|
+
|
|
99
|
+
### Heavy Processing (3-10 credits)
|
|
100
|
+
- `process_document` - Multi-format document processing
|
|
101
|
+
- `extract_content` - Enhanced content extraction
|
|
102
|
+
- `scrape_with_actions` - Browser automation chains
|
|
103
|
+
- `generate_llms_txt` - Generate AI interaction guidelines
|
|
104
|
+
- `localization` - Multi-language and geo-location management
|
|
105
|
+
|
|
106
|
+
## 💳 Pricing
|
|
107
|
+
|
|
108
|
+
| Plan | Credits/Month | Price | Best For |
|
|
109
|
+
|------|---------------|-------|----------|
|
|
110
|
+
| **Free** | 1,000 | $0 | Testing & personal projects |
|
|
111
|
+
| **Hobby** | 10,000 | $19 | Small projects |
|
|
112
|
+
| **Pro** | 50,000 | $49 | Professional use |
|
|
113
|
+
| **Business** | 200,000 | $149 | Teams & automation |
|
|
114
|
+
| **Enterprise** | Unlimited | Custom | Large scale operations |
|
|
115
|
+
|
|
116
|
+
[View full pricing](https://crawlforge.com/pricing)
|
|
117
|
+
|
|
118
|
+
## 🔧 Advanced Configuration
|
|
119
|
+
|
|
120
|
+
### Environment Variables
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
# Optional: Set API key via environment
|
|
124
|
+
export CRAWLFORGE_API_KEY="sk_live_your_api_key_here"
|
|
125
|
+
|
|
126
|
+
# Optional: Custom API endpoint (for enterprise)
|
|
127
|
+
export CRAWLFORGE_API_URL="https://api.crawlforge.com"
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Manual Configuration
|
|
131
|
+
|
|
132
|
+
Your configuration is stored at `~/.crawlforge/config.json`:
|
|
133
|
+
|
|
134
|
+
```json
|
|
135
|
+
{
|
|
136
|
+
"apiKey": "sk_live_...",
|
|
137
|
+
"userId": "user_...",
|
|
138
|
+
"email": "you@example.com"
|
|
139
|
+
}
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## 📚 Usage Examples
|
|
143
|
+
|
|
144
|
+
Once configured, use these tools in your AI assistant:
|
|
145
|
+
|
|
146
|
+
```
|
|
147
|
+
"Search for the latest AI news"
|
|
148
|
+
"Extract all links from example.com"
|
|
149
|
+
"Crawl the documentation site and summarize it"
|
|
150
|
+
"Monitor this page for changes"
|
|
151
|
+
"Extract product prices from this e-commerce site"
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## 🔒 Security & Privacy
|
|
155
|
+
|
|
156
|
+
- API keys are stored locally and encrypted
|
|
157
|
+
- All connections use HTTPS
|
|
158
|
+
- No data is stored on our servers beyond usage logs
|
|
159
|
+
- Compliant with robots.txt and rate limits
|
|
160
|
+
- GDPR compliant
|
|
161
|
+
|
|
162
|
+
## 📞 Support
|
|
163
|
+
|
|
164
|
+
- **Documentation**: [https://crawlforge.com/docs](https://crawlforge.com/docs)
|
|
165
|
+
- **Issues**: [GitHub Issues](https://github.com/crawlforge/mcp-server/issues)
|
|
166
|
+
- **Email**: support@crawlforge.com
|
|
167
|
+
- **Discord**: [Join our community](https://discord.gg/crawlforge)
|
|
168
|
+
|
|
169
|
+
## 📄 License
|
|
170
|
+
|
|
171
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
172
|
+
|
|
173
|
+
## 🤝 Contributing
|
|
174
|
+
|
|
175
|
+
Contributions are welcome! Please read our [Contributing Guide](CONTRIBUTING.md) first.
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
**Built with ❤️ by the CrawlForge team**
|
|
180
|
+
|
|
181
|
+
[Website](https://crawlforge.com) | [Documentation](https://crawlforge.com/docs) | [API Reference](https://crawlforge.com/api)
|
package/package.json
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "crawlforge-mcp-server",
|
|
3
|
+
"version": "3.0.0",
|
|
4
|
+
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 16+ comprehensive web scraping, crawling, and content processing tools.",
|
|
5
|
+
"main": "server.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"crawlforge": "server.js",
|
|
8
|
+
"crawlforge-setup": "setup.js"
|
|
9
|
+
},
|
|
10
|
+
"scripts": {
|
|
11
|
+
"start": "node server.js",
|
|
12
|
+
"setup": "node setup.js",
|
|
13
|
+
"dev": "cross-env NODE_ENV=development node server.js",
|
|
14
|
+
"test": "node tests/integration/mcp-protocol-compliance.test.js",
|
|
15
|
+
"postinstall": "echo '\n🎉 CrawlForge MCP Server installed!\n\nRun \"npx crawlforge-setup\" to configure your API key and get started.\n'",
|
|
16
|
+
"docker:build": "docker build -t crawlforge .",
|
|
17
|
+
"docker:dev": "docker-compose up crawlforge-dev",
|
|
18
|
+
"docker:prod": "docker-compose up crawlforge-prod"
|
|
19
|
+
},
|
|
20
|
+
"keywords": [
|
|
21
|
+
"mcp",
|
|
22
|
+
"model-context-protocol",
|
|
23
|
+
"web-scraping",
|
|
24
|
+
"scraper",
|
|
25
|
+
"html-parser",
|
|
26
|
+
"metadata-extraction",
|
|
27
|
+
"link-extraction",
|
|
28
|
+
"content-processing",
|
|
29
|
+
"document-analysis",
|
|
30
|
+
"web-crawler",
|
|
31
|
+
"search-engine",
|
|
32
|
+
"text-summarization",
|
|
33
|
+
"content-analysis",
|
|
34
|
+
"nlp",
|
|
35
|
+
"ai-tools",
|
|
36
|
+
"automation",
|
|
37
|
+
"data-extraction",
|
|
38
|
+
"pdf-processing",
|
|
39
|
+
"sitemap-parser",
|
|
40
|
+
"performance-optimized",
|
|
41
|
+
"llms-txt",
|
|
42
|
+
"llms-txt-generator",
|
|
43
|
+
"ai-compliance",
|
|
44
|
+
"website-analysis"
|
|
45
|
+
],
|
|
46
|
+
"author": {
|
|
47
|
+
"name": "Simon Lacey",
|
|
48
|
+
"email": "your-email@example.com"
|
|
49
|
+
},
|
|
50
|
+
"license": "MIT",
|
|
51
|
+
"repository": {
|
|
52
|
+
"type": "git",
|
|
53
|
+
"url": "git+https://github.com/crawlforge/mcp-server.git"
|
|
54
|
+
},
|
|
55
|
+
"bugs": {
|
|
56
|
+
"url": "https://github.com/crawlforge/mcp-server/issues"
|
|
57
|
+
},
|
|
58
|
+
"homepage": "https://crawlforge.com",
|
|
59
|
+
"type": "module",
|
|
60
|
+
"engines": {
|
|
61
|
+
"node": ">=18.0.0",
|
|
62
|
+
"npm": ">=8.0.0"
|
|
63
|
+
},
|
|
64
|
+
"os": [
|
|
65
|
+
"linux",
|
|
66
|
+
"darwin",
|
|
67
|
+
"win32"
|
|
68
|
+
],
|
|
69
|
+
"cpu": [
|
|
70
|
+
"x64",
|
|
71
|
+
"arm64"
|
|
72
|
+
],
|
|
73
|
+
"publishConfig": {
|
|
74
|
+
"access": "public",
|
|
75
|
+
"registry": "https://registry.npmjs.org/",
|
|
76
|
+
"tag": "latest"
|
|
77
|
+
},
|
|
78
|
+
"files": [
|
|
79
|
+
"server.js",
|
|
80
|
+
"setup.js",
|
|
81
|
+
"src/",
|
|
82
|
+
"README.md",
|
|
83
|
+
"LICENSE",
|
|
84
|
+
"CLAUDE.md",
|
|
85
|
+
"package.json"
|
|
86
|
+
],
|
|
87
|
+
"dependencies": {
|
|
88
|
+
"@googleapis/customsearch": "^5.0.1",
|
|
89
|
+
"@modelcontextprotocol/sdk": "^1.17.3",
|
|
90
|
+
"@mozilla/readability": "^0.6.0",
|
|
91
|
+
"cheerio": "^1.1.2",
|
|
92
|
+
"compromise": "^14.14.4",
|
|
93
|
+
"diff": "^8.0.2",
|
|
94
|
+
"dotenv": "^17.2.1",
|
|
95
|
+
"franc": "^6.2.0",
|
|
96
|
+
"isomorphic-dompurify": "^2.26.0",
|
|
97
|
+
"jsdom": "^26.1.0",
|
|
98
|
+
"lru-cache": "^11.1.0",
|
|
99
|
+
"node-cron": "^3.0.3",
|
|
100
|
+
"node-fetch": "^3.3.2",
|
|
101
|
+
"node-summarizer": "^1.0.7",
|
|
102
|
+
"p-queue": "^8.1.0",
|
|
103
|
+
"pdf-parse": "^1.1.1",
|
|
104
|
+
"playwright": "^1.54.2",
|
|
105
|
+
"robots-parser": "^3.0.1",
|
|
106
|
+
"winston": "^3.11.0",
|
|
107
|
+
"zod": "^3.23.8"
|
|
108
|
+
},
|
|
109
|
+
"devDependencies": {
|
|
110
|
+
"@jest/globals": "^30.0.5",
|
|
111
|
+
"cross-env": "^10.0.0",
|
|
112
|
+
"jest": "^30.0.5",
|
|
113
|
+
"shx": "^0.4.0"
|
|
114
|
+
}
|
|
115
|
+
}
|