crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
package/CLAUDE.md ADDED
@@ -0,0 +1,315 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ CrawlForge MCP Server - A professional MCP (Model Context Protocol) server implementation providing 16 comprehensive web scraping, crawling, and content processing tools. Version 3.0 includes advanced content extraction, document processing, summarization, and analysis capabilities. Wave 2 adds asynchronous batch processing and browser automation features. Wave 3 introduces deep research orchestration, stealth scraping, localization, and change tracking.
8
+
9
+ ## Development Commands
10
+
11
+ ```bash
12
+ # Install dependencies
13
+ npm install
14
+
15
+ # Copy and configure environment
16
+ cp .env.example .env
17
+ # Edit .env to add Google API credentials if using Google search
18
+
19
+ # Run the server
20
+ npm start
21
+
22
+ # Test MCP protocol compliance
23
+ # NOTE: test-server.js doesn't exist, use integration tests instead
24
+ npm run test:integration # Integration tests including MCP compliance
25
+
26
+ # Lint checks (no linter configured yet, placeholder)
27
+ npm run lint
28
+
29
+ # Performance tests
30
+ npm run test:performance # Full performance test suite
31
+ npm run test:performance:quick # Quick performance tests
32
+ npm run test:load # Load testing
33
+ npm run test:memory # Memory usage tests
34
+ npm run test:benchmark # Component benchmarks
35
+ npm run test:integration # Integration tests
36
+ npm run test:security # Security test suite
37
+ npm run test:all # Run all tests
38
+
39
+ # Wave 2 Validation Tests
40
+ node tests/validation/test-wave2-runner.js # Test Wave 2 features
41
+ node tests/validation/test-batch-scrape.js # Test batch scraping
42
+ node tests/validation/test-scrape-with-actions.js # Test action scraping
43
+ node tests/integration/master-test-runner.js # Run master test suite
44
+
45
+ # Wave 3 Tests
46
+ npm run test:wave3 # Full Wave 3 validation
47
+ npm run test:wave3:quick # Quick Wave 3 tests
48
+ npm run test:wave3:verbose # Verbose Wave 3 output
49
+ npm run test:unit:wave3 # Wave 3 unit tests (Jest)
50
+ npm run test:integration:wave3 # Wave 3 integration tests
51
+
52
+ # Docker commands
53
+ npm run docker:build # Build Docker image
54
+ npm run docker:dev # Run development container
55
+ npm run docker:prod # Run production container
56
+ npm run docker:test # Run test container
57
+ npm run docker:perf # Run performance test container
58
+
59
+ # Security Testing (CI/CD Integration)
60
+ npm run test:security # Run comprehensive security test suite
61
+ npm audit # Check for dependency vulnerabilities
62
+ npm audit fix # Automatically fix vulnerabilities
63
+ npm outdated # Check for outdated packages
64
+
65
+ # Release management
66
+ npm run release:patch # Patch version bump
67
+ npm run release:minor # Minor version bump
68
+ npm run release:major # Major version bump
69
+
70
+ # Cleanup
71
+ npm run clean # Remove cache, logs, test results
72
+
73
+ # Running specific test files
74
+ node tests/unit/linkAnalyzer.test.js # Unit test for link analyzer
75
+ node tests/validation/wave3-validation.js # Wave 3 validation suite
76
+ node tests/security/security-test-suite.js # Security test suite
77
+ ```
78
+
79
+ ## High-Level Architecture
80
+
81
+ ### Core Infrastructure (`src/core/`)
82
+ - **PerformanceManager**: Centralized performance monitoring and optimization
83
+ - **JobManager**: Asynchronous job tracking and management for batch operations
84
+ - **WebhookDispatcher**: Event notification system for job completion callbacks
85
+ - **ActionExecutor**: Browser automation engine for complex interactions
86
+ - **ResearchOrchestrator**: Coordinates multi-stage research with query expansion and synthesis
87
+ - **StealthBrowserManager**: Manages stealth mode scraping with anti-detection features
88
+ - **LocalizationManager**: Handles multi-language content and localization
89
+ - **ChangeTracker**: Tracks and compares content changes over time
90
+ - **SnapshotManager**: Manages website snapshots and version history
91
+
92
+ ### Tool Layer (`src/tools/`)
93
+ Tools are organized in subdirectories by category:
94
+ - `advanced/` - BatchScrapeTool, ScrapeWithActionsTool
95
+ - `crawl/` - crawlDeep, mapSite
96
+ - `extract/` - analyzeContent, extractContent, processDocument, summarizeContent
97
+ - `research/` - deepResearch
98
+ - `search/` - searchWeb and provider adapters (Google, DuckDuckGo)
99
+ - `tracking/` - trackChanges
100
+
101
+ ### MCP Server Entry Point
102
+ The main server implementation is in `server.js` which:
103
+ 1. Uses stdio transport for MCP protocol communication
104
+ 2. Registers all 16 tools using `server.registerTool()` pattern
105
+ 3. Each tool has inline Zod schema for parameter validation
106
+ 4. Parameter extraction from `request.params?.arguments` structure
107
+ 5. Response format uses `content` array with text objects
108
+
109
+ ### Key Configuration
110
+
111
+ Critical environment variables:
112
+
113
+ ```bash
114
+ # Search Provider (auto, google, duckduckgo)
115
+ SEARCH_PROVIDER=auto
116
+
117
+ # Google API (optional, only if using Google)
118
+ GOOGLE_API_KEY=your_key
119
+ GOOGLE_SEARCH_ENGINE_ID=your_id
120
+
121
+ # Performance Settings
122
+ MAX_WORKERS=10
123
+ QUEUE_CONCURRENCY=10
124
+ CACHE_TTL=3600000
125
+ RATE_LIMIT_REQUESTS_PER_SECOND=10
126
+
127
+ # Crawling Limits
128
+ MAX_CRAWL_DEPTH=5
129
+ MAX_PAGES_PER_CRAWL=100
130
+ RESPECT_ROBOTS_TXT=true
131
+ ```
132
+
133
+ ## Common Development Tasks
134
+
135
+ ### Running a Single Test
136
+ ```bash
137
+ # Run a specific test file
138
+ node tests/unit/linkAnalyzer.test.js
139
+
140
+ # Run a specific Wave test
141
+ node tests/validation/test-batch-scrape.js
142
+
143
+ # Run Wave 3 tests with verbose output
144
+ npm run test:wave3:verbose
145
+ ```
146
+
147
+ ### Testing Tool Integration
148
+ ```bash
149
+ # Test MCP protocol compliance
150
+ npm test
151
+
152
+ # Test specific tool functionality
153
+ node tests/validation/test-batch-scrape.js
154
+ node tests/validation/test-scrape-with-actions.js
155
+
156
+ # Test research features
157
+ node tests/validation/wave3-validation.js
158
+ ```
159
+
160
+ ### Debugging Tips
161
+ - Server logs are written to console via Winston logger
162
+ - Set `NODE_ENV=development` for verbose logging
163
+ - Use `--expose-gc` flag for memory profiling tests
164
+ - Check `cache/` directory for cached responses
165
+ - Review `logs/` directory for application logs
166
+
167
+ ## CI/CD Security Integration
168
+
169
+ ### Automated Security Testing Pipeline
170
+
171
+ The project includes comprehensive security testing integrated into the CI/CD pipeline:
172
+
173
+ #### Main CI Pipeline (`.github/workflows/ci.yml`)
174
+ The CI pipeline runs on every PR and push to main/develop branches and includes:
175
+
176
+ **Security Test Suite:**
177
+ - SSRF Protection validation
178
+ - Input validation (XSS, SQL injection, command injection)
179
+ - Rate limiting functionality
180
+ - DoS protection measures
181
+ - Regex DoS vulnerability detection
182
+
183
+ **Dependency Security:**
184
+ - npm audit with JSON output and summary generation
185
+ - Vulnerability severity analysis (critical/high/moderate/low)
186
+ - License compliance checking
187
+ - Outdated package detection
188
+
189
+ **Static Code Analysis:**
190
+ - CodeQL security analysis with extended queries
191
+ - ESLint security rules for dangerous patterns
192
+ - Hardcoded secret detection
193
+ - Security file scanning
194
+
195
+ **Reporting & Artifacts:**
196
+ - Comprehensive security reports generated
197
+ - PR comments with security summaries
198
+ - Artifact upload for detailed analysis
199
+ - Build failure on critical vulnerabilities
200
+
201
+ #### Dedicated Security Workflow (`.github/workflows/security.yml`)
202
+ Daily scheduled comprehensive security scanning:
203
+
204
+ **Dependency Security Scan:**
205
+ - Full vulnerability audit with configurable severity levels
206
+ - License compliance verification
207
+ - Detailed vulnerability reporting
208
+
209
+ **Static Code Analysis:**
210
+ - Extended CodeQL analysis with security-focused queries
211
+ - ESLint security plugin integration
212
+ - Pattern-based secret detection
213
+
214
+ **Container Security:**
215
+ - Trivy vulnerability scanning
216
+ - SARIF report generation
217
+ - Container base image analysis
218
+
219
+ **Automated Issue Creation:**
220
+ - GitHub issues created for critical vulnerabilities
221
+ - Detailed security reports with remediation steps
222
+ - Configurable severity thresholds
223
+
224
+ ### Security Thresholds and Policies
225
+
226
+ **Build Failure Conditions:**
227
+ - Any critical severity vulnerabilities
228
+ - More than 3 high severity vulnerabilities
229
+ - Security test suite failures
230
+
231
+ **Automated Actions:**
232
+ - Daily security scans at 2 AM UTC
233
+ - PR blocking for security failures
234
+ - Automatic security issue creation
235
+ - Comprehensive artifact collection
236
+
237
+ ### Running Security Tests Locally
238
+
239
+ ```bash
240
+ # Run the complete security test suite
241
+ npm run test:security
242
+
243
+ # Check for dependency vulnerabilities
244
+ npm audit --audit-level moderate
245
+
246
+ # Fix automatically resolvable vulnerabilities
247
+ npm audit fix
248
+
249
+ # Generate security report manually
250
+ mkdir security-results
251
+ npm audit --json > security-results/audit.json
252
+
253
+ # Run specific security validation
254
+ node tests/security/security-test-suite.js
255
+ ```
256
+
257
+ ### Security Artifacts and Reports
258
+
259
+ **Generated Reports:**
260
+ - `SECURITY-REPORT.md`: Comprehensive security assessment
261
+ - `npm-audit.json`: Detailed vulnerability data
262
+ - `security-tests.log`: Test execution logs
263
+ - `dependency-analysis.md`: Package security analysis
264
+ - `license-check.md`: License compliance report
265
+
266
+ **Artifact Retention:**
267
+ - CI security results: 30 days
268
+ - Comprehensive security reports: 90 days
269
+ - Critical vulnerability reports: Indefinite
270
+
271
+ ### Manual Security Scan Triggers
272
+
273
+ The security workflow can be manually triggered with custom parameters:
274
+
275
+ ```bash
276
+ # Via GitHub CLI
277
+ gh workflow run security.yml \
278
+ --field scan_type=all \
279
+ --field severity_threshold=moderate
280
+
281
+ # Via GitHub UI
282
+ # Go to Actions > Security Scanning > Run workflow
283
+ ```
284
+
285
+ **Available Options:**
286
+ - `scan_type`: all, dependencies, code-analysis, container-scan
287
+ - `severity_threshold`: low, moderate, high, critical
288
+
289
+ ### Security Integration Best Practices
290
+
291
+ **For Contributors:**
292
+ 1. Always run `npm run test:security` before submitting PRs
293
+ 2. Address any security warnings in your code
294
+ 3. Keep dependencies updated with `npm audit fix`
295
+ 4. Review security artifacts when CI fails
296
+
297
+ **For Maintainers:**
298
+ 1. Review security reports weekly
299
+ 2. Respond to automated security issues promptly
300
+ 3. Keep security thresholds updated
301
+ 4. Monitor trending vulnerabilities in dependencies
302
+
303
+ ### Security Documentation
304
+
305
+ Comprehensive security documentation is available in:
306
+ - `.github/SECURITY.md` - Complete security policy and procedures
307
+ - Security workflow logs and artifacts
308
+ - Generated security reports in CI runs
309
+
310
+ The security integration ensures that:
311
+ - No critical vulnerabilities reach production
312
+ - Security issues are detected early in development
313
+ - Comprehensive audit trails are maintained
314
+ - Automated remediation guidance is provided
315
+
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 CrawlForge
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,181 @@
1
+ # CrawlForge MCP Server
2
+
3
+ Professional web scraping and content extraction server implementing the Model Context Protocol (MCP). Get started with **1,000 free credits** - no credit card required!
4
+
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+ [![Node.js Version](https://img.shields.io/badge/node-%3E%3D18.0.0-brightgreen)](https://nodejs.org/)
7
+ [![MCP Protocol](https://img.shields.io/badge/MCP-Compatible-blue)](https://modelcontextprotocol.io/)
8
+ [![npm version](https://img.shields.io/npm/v/crawlforge-mcp-server.svg)](https://www.npmjs.com/package/crawlforge-mcp-server)
9
+
10
+ ## 🎯 Features
11
+
12
+ - **18+ Advanced Tools**: Web scraping, deep research, stealth browsing, content analysis
13
+ - **Free Tier**: 1,000 credits to get started instantly
14
+ - **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
15
+ - **Enterprise Ready**: Scale up with paid plans for production use
16
+ - **Credit-Based**: Pay only for what you use
17
+
18
+ ## 🚀 Quick Start (2 Minutes)
19
+
20
+ ### 1. Install from NPM
21
+
22
+ ```bash
23
+ npm install -g crawlforge-mcp-server
24
+ ```
25
+
26
+ ### 2. Setup Your API Key
27
+
28
+ ```bash
29
+ npx crawlforge-setup
30
+ ```
31
+
32
+ This will:
33
+ - Guide you through getting your free API key
34
+ - Configure your credentials securely
35
+ - Verify your setup is working
36
+
37
+ **Don't have an API key?** Get one free at [https://crawlforge.com/signup](https://crawlforge.com/signup)
38
+
39
+ ### 3. Configure Your IDE
40
+
41
+ <details>
42
+ <summary>🤖 For Claude Desktop</summary>
43
+
44
+ Add to `claude_desktop_config.json`:
45
+ ```json
46
+ {
47
+ "mcpServers": {
48
+ "crawlforge": {
49
+ "command": "npx",
50
+ "args": ["crawlforge-mcp-server"]
51
+ }
52
+ }
53
+ }
54
+ ```
55
+
56
+ **Location:**
57
+ - macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`
58
+ - Windows: `%APPDATA%/Claude/claude_desktop_config.json`
59
+ - Linux: `~/.config/Claude/claude_desktop_config.json`
60
+
61
+ Restart Claude Desktop to activate.
62
+ </details>
63
+
64
+ <details>
65
+ <summary>💻 For Cursor IDE</summary>
66
+
67
+ Add to `.cursorrules` in your project:
68
+ ```yaml
69
+ mcp_servers:
70
+ crawlforge:
71
+ command: npx
72
+ args: ["crawlforge-mcp-server"]
73
+ ```
74
+
75
+ Or use the MCP plugin in Cursor settings.
76
+ </details>
77
+
78
+ ## 📊 Available Tools
79
+
80
+ ### Basic Tools (1 credit each)
81
+ - `fetch_url` - Fetch content from any URL
82
+ - `extract_text` - Extract clean text from web pages
83
+ - `extract_links` - Get all links from a page
84
+ - `extract_metadata` - Extract page metadata
85
+
86
+ ### Advanced Tools (2-3 credits)
87
+ - `scrape_structured` - Extract structured data with CSS selectors
88
+ - `search_web` - Search the web with Google/DuckDuckGo
89
+ - `summarize_content` - Generate intelligent summaries
90
+ - `analyze_content` - Comprehensive content analysis
91
+
92
+ ### Premium Tools (5-10 credits)
93
+ - `crawl_deep` - Deep crawl entire websites
94
+ - `map_site` - Discover and map website structure
95
+ - `batch_scrape` - Process multiple URLs simultaneously
96
+ - `deep_research` - Multi-stage research with source verification
97
+ - `stealth_mode` - Anti-detection browser management
98
+
99
+ ### Heavy Processing (3-10 credits)
100
+ - `process_document` - Multi-format document processing
101
+ - `extract_content` - Enhanced content extraction
102
+ - `scrape_with_actions` - Browser automation chains
103
+ - `generate_llms_txt` - Generate AI interaction guidelines
104
+ - `localization` - Multi-language and geo-location management
105
+
106
+ ## 💳 Pricing
107
+
108
+ | Plan | Credits/Month | Price | Best For |
109
+ |------|---------------|-------|----------|
110
+ | **Free** | 1,000 | $0 | Testing & personal projects |
111
+ | **Hobby** | 10,000 | $19 | Small projects |
112
+ | **Pro** | 50,000 | $49 | Professional use |
113
+ | **Business** | 200,000 | $149 | Teams & automation |
114
+ | **Enterprise** | Unlimited | Custom | Large scale operations |
115
+
116
+ [View full pricing](https://crawlforge.com/pricing)
117
+
118
+ ## 🔧 Advanced Configuration
119
+
120
+ ### Environment Variables
121
+
122
+ ```bash
123
+ # Optional: Set API key via environment
124
+ export CRAWLFORGE_API_KEY="sk_live_your_api_key_here"
125
+
126
+ # Optional: Custom API endpoint (for enterprise)
127
+ export CRAWLFORGE_API_URL="https://api.crawlforge.com"
128
+ ```
129
+
130
+ ### Manual Configuration
131
+
132
+ Your configuration is stored at `~/.crawlforge/config.json`:
133
+
134
+ ```json
135
+ {
136
+ "apiKey": "sk_live_...",
137
+ "userId": "user_...",
138
+ "email": "you@example.com"
139
+ }
140
+ ```
141
+
142
+ ## 📖 Usage Examples
143
+
144
+ Once configured, use these tools in your AI assistant:
145
+
146
+ ```
147
+ "Search for the latest AI news"
148
+ "Extract all links from example.com"
149
+ "Crawl the documentation site and summarize it"
150
+ "Monitor this page for changes"
151
+ "Extract product prices from this e-commerce site"
152
+ ```
153
+
154
+ ## 🔒 Security & Privacy
155
+
156
+ - API keys are stored locally and encrypted
157
+ - All connections use HTTPS
158
+ - No data is stored on our servers beyond usage logs
159
+ - Compliant with robots.txt and rate limits
160
+ - GDPR compliant
161
+
162
+ ## 🆘 Support
163
+
164
+ - **Documentation**: [https://crawlforge.com/docs](https://crawlforge.com/docs)
165
+ - **Issues**: [GitHub Issues](https://github.com/crawlforge/mcp-server/issues)
166
+ - **Email**: support@crawlforge.com
167
+ - **Discord**: [Join our community](https://discord.gg/crawlforge)
168
+
169
+ ## 📄 License
170
+
171
+ MIT License - see [LICENSE](LICENSE) file for details.
172
+
173
+ ## šŸ¤ Contributing
174
+
175
+ Contributions are welcome! Please read our [Contributing Guide](CONTRIBUTING.md) first.
176
+
177
+ ---
178
+
179
+ **Built with ā¤ļø by the CrawlForge team**
180
+
181
+ [Website](https://crawlforge.com) | [Documentation](https://crawlforge.com/docs) | [API Reference](https://crawlforge.com/api)
package/package.json ADDED
@@ -0,0 +1,115 @@
1
+ {
2
+ "name": "crawlforge-mcp-server",
3
+ "version": "3.0.0",
4
+ "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 16+ comprehensive web scraping, crawling, and content processing tools.",
5
+ "main": "server.js",
6
+ "bin": {
7
+ "crawlforge": "server.js",
8
+ "crawlforge-setup": "setup.js"
9
+ },
10
+ "scripts": {
11
+ "start": "node server.js",
12
+ "setup": "node setup.js",
13
+ "dev": "cross-env NODE_ENV=development node server.js",
14
+ "test": "node tests/integration/mcp-protocol-compliance.test.js",
15
+ "postinstall": "echo '\n🎉 CrawlForge MCP Server installed!\n\nRun \"npx crawlforge-setup\" to configure your API key and get started.\n'",
16
+ "docker:build": "docker build -t crawlforge .",
17
+ "docker:dev": "docker-compose up crawlforge-dev",
18
+ "docker:prod": "docker-compose up crawlforge-prod"
19
+ },
20
+ "keywords": [
21
+ "mcp",
22
+ "model-context-protocol",
23
+ "web-scraping",
24
+ "scraper",
25
+ "html-parser",
26
+ "metadata-extraction",
27
+ "link-extraction",
28
+ "content-processing",
29
+ "document-analysis",
30
+ "web-crawler",
31
+ "search-engine",
32
+ "text-summarization",
33
+ "content-analysis",
34
+ "nlp",
35
+ "ai-tools",
36
+ "automation",
37
+ "data-extraction",
38
+ "pdf-processing",
39
+ "sitemap-parser",
40
+ "performance-optimized",
41
+ "llms-txt",
42
+ "llms-txt-generator",
43
+ "ai-compliance",
44
+ "website-analysis"
45
+ ],
46
+ "author": {
47
+ "name": "Simon Lacey",
48
+ "email": "your-email@example.com"
49
+ },
50
+ "license": "MIT",
51
+ "repository": {
52
+ "type": "git",
53
+ "url": "git+https://github.com/crawlforge/mcp-server.git"
54
+ },
55
+ "bugs": {
56
+ "url": "https://github.com/crawlforge/mcp-server/issues"
57
+ },
58
+ "homepage": "https://crawlforge.com",
59
+ "type": "module",
60
+ "engines": {
61
+ "node": ">=18.0.0",
62
+ "npm": ">=8.0.0"
63
+ },
64
+ "os": [
65
+ "linux",
66
+ "darwin",
67
+ "win32"
68
+ ],
69
+ "cpu": [
70
+ "x64",
71
+ "arm64"
72
+ ],
73
+ "publishConfig": {
74
+ "access": "public",
75
+ "registry": "https://registry.npmjs.org/",
76
+ "tag": "latest"
77
+ },
78
+ "files": [
79
+ "server.js",
80
+ "setup.js",
81
+ "src/",
82
+ "README.md",
83
+ "LICENSE",
84
+ "CLAUDE.md",
85
+ "package.json"
86
+ ],
87
+ "dependencies": {
88
+ "@googleapis/customsearch": "^5.0.1",
89
+ "@modelcontextprotocol/sdk": "^1.17.3",
90
+ "@mozilla/readability": "^0.6.0",
91
+ "cheerio": "^1.1.2",
92
+ "compromise": "^14.14.4",
93
+ "diff": "^8.0.2",
94
+ "dotenv": "^17.2.1",
95
+ "franc": "^6.2.0",
96
+ "isomorphic-dompurify": "^2.26.0",
97
+ "jsdom": "^26.1.0",
98
+ "lru-cache": "^11.1.0",
99
+ "node-cron": "^3.0.3",
100
+ "node-fetch": "^3.3.2",
101
+ "node-summarizer": "^1.0.7",
102
+ "p-queue": "^8.1.0",
103
+ "pdf-parse": "^1.1.1",
104
+ "playwright": "^1.54.2",
105
+ "robots-parser": "^3.0.1",
106
+ "winston": "^3.11.0",
107
+ "zod": "^3.23.8"
108
+ },
109
+ "devDependencies": {
110
+ "@jest/globals": "^30.0.5",
111
+ "cross-env": "^10.0.0",
112
+ "jest": "^30.0.5",
113
+ "shx": "^0.4.0"
114
+ }
115
+ }