crawlforge-mcp-server 3.0.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +143 -46
- package/package.json +4 -1
package/CLAUDE.md
CHANGED
|
@@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
|
|
4
4
|
|
|
5
5
|
## Project Overview
|
|
6
6
|
|
|
7
|
-
CrawlForge MCP Server - A professional MCP (Model Context Protocol) server implementation providing
|
|
7
|
+
CrawlForge MCP Server - A professional MCP (Model Context Protocol) server implementation providing 18+ comprehensive web scraping, crawling, and content processing tools. Version 3.0 includes advanced content extraction, document processing, summarization, and analysis capabilities. Wave 2 adds asynchronous batch processing and browser automation features. Wave 3 introduces deep research orchestration, stealth scraping, localization, and change tracking.
|
|
8
8
|
|
|
9
9
|
## Development Commands
|
|
10
10
|
|
|
@@ -12,42 +12,30 @@ CrawlForge MCP Server - A professional MCP (Model Context Protocol) server imple
|
|
|
12
12
|
# Install dependencies
|
|
13
13
|
npm install
|
|
14
14
|
|
|
15
|
-
#
|
|
16
|
-
|
|
17
|
-
#
|
|
15
|
+
# Setup (required for first run unless in creator mode)
|
|
16
|
+
npm run setup
|
|
17
|
+
# Or provide API key via environment:
|
|
18
|
+
export CRAWLFORGE_API_KEY="your_api_key_here"
|
|
18
19
|
|
|
19
|
-
#
|
|
20
|
+
# Creator Mode (bypass API key requirement for development)
|
|
21
|
+
export BYPASS_API_KEY=true
|
|
20
22
|
npm start
|
|
21
23
|
|
|
24
|
+
# Run the server (production)
|
|
25
|
+
npm start
|
|
26
|
+
|
|
27
|
+
# Development mode with verbose logging
|
|
28
|
+
npm run dev
|
|
29
|
+
|
|
22
30
|
# Test MCP protocol compliance
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
#
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
#
|
|
30
|
-
|
|
31
|
-
npm run test:performance:quick # Quick performance tests
|
|
32
|
-
npm run test:load # Load testing
|
|
33
|
-
npm run test:memory # Memory usage tests
|
|
34
|
-
npm run test:benchmark # Component benchmarks
|
|
35
|
-
npm run test:integration # Integration tests
|
|
36
|
-
npm run test:security # Security test suite
|
|
37
|
-
npm run test:all # Run all tests
|
|
38
|
-
|
|
39
|
-
# Wave 2 Validation Tests
|
|
40
|
-
node tests/validation/test-wave2-runner.js # Test Wave 2 features
|
|
41
|
-
node tests/validation/test-batch-scrape.js # Test batch scraping
|
|
42
|
-
node tests/validation/test-scrape-with-actions.js # Test action scraping
|
|
43
|
-
node tests/integration/master-test-runner.js # Run master test suite
|
|
44
|
-
|
|
45
|
-
# Wave 3 Tests
|
|
46
|
-
npm run test:wave3 # Full Wave 3 validation
|
|
47
|
-
npm run test:wave3:quick # Quick Wave 3 tests
|
|
48
|
-
npm run test:wave3:verbose # Verbose Wave 3 output
|
|
49
|
-
npm run test:unit:wave3 # Wave 3 unit tests (Jest)
|
|
50
|
-
npm run test:integration:wave3 # Wave 3 integration tests
|
|
31
|
+
npm test
|
|
32
|
+
|
|
33
|
+
# Functional tests
|
|
34
|
+
node test-tools.js # Test all tools (basic, Wave 2, Wave 3)
|
|
35
|
+
node test-real-world.js # Test real-world usage scenarios
|
|
36
|
+
|
|
37
|
+
# MCP Protocol tests
|
|
38
|
+
node tests/integration/mcp-protocol-compliance.test.js # MCP protocol compliance
|
|
51
39
|
|
|
52
40
|
# Docker commands
|
|
53
41
|
npm run docker:build # Build Docker image
|
|
@@ -71,18 +59,19 @@ npm run release:major # Major version bump
|
|
|
71
59
|
npm run clean # Remove cache, logs, test results
|
|
72
60
|
|
|
73
61
|
# Running specific test files
|
|
74
|
-
node tests/
|
|
75
|
-
node
|
|
76
|
-
node
|
|
62
|
+
node tests/integration/mcp-protocol-compliance.test.js # MCP protocol compliance
|
|
63
|
+
node test-tools.js # All tools functional test
|
|
64
|
+
node test-real-world.js # Real-world scenarios test
|
|
77
65
|
```
|
|
78
66
|
|
|
79
67
|
## High-Level Architecture
|
|
80
68
|
|
|
81
69
|
### Core Infrastructure (`src/core/`)
|
|
70
|
+
- **AuthManager**: Authentication, credit tracking, and usage reporting (supports creator mode)
|
|
82
71
|
- **PerformanceManager**: Centralized performance monitoring and optimization
|
|
83
72
|
- **JobManager**: Asynchronous job tracking and management for batch operations
|
|
84
73
|
- **WebhookDispatcher**: Event notification system for job completion callbacks
|
|
85
|
-
- **ActionExecutor**: Browser automation engine for complex interactions
|
|
74
|
+
- **ActionExecutor**: Browser automation engine for complex interactions (Playwright-based)
|
|
86
75
|
- **ResearchOrchestrator**: Coordinates multi-stage research with query expansion and synthesis
|
|
87
76
|
- **StealthBrowserManager**: Manages stealth mode scraping with anti-detection features
|
|
88
77
|
- **LocalizationManager**: Handles multi-language content and localization
|
|
@@ -96,21 +85,50 @@ Tools are organized in subdirectories by category:
|
|
|
96
85
|
- `extract/` - analyzeContent, extractContent, processDocument, summarizeContent
|
|
97
86
|
- `research/` - deepResearch
|
|
98
87
|
- `search/` - searchWeb and provider adapters (Google, DuckDuckGo)
|
|
99
|
-
- `tracking/` - trackChanges
|
|
88
|
+
- `tracking/` - trackChanges (currently disabled in server.js)
|
|
89
|
+
- `llmstxt/` - generateLLMsTxt
|
|
90
|
+
|
|
91
|
+
### Available MCP Tools (18 total)
|
|
92
|
+
**Basic Tools (server.js inline):**
|
|
93
|
+
- fetch_url, extract_text, extract_links, extract_metadata, scrape_structured
|
|
94
|
+
|
|
95
|
+
**Advanced Tools:**
|
|
96
|
+
- search_web (conditional - requires search provider), crawl_deep, map_site
|
|
97
|
+
- extract_content, process_document, summarize_content, analyze_content
|
|
98
|
+
- batch_scrape, scrape_with_actions, deep_research
|
|
99
|
+
- generate_llms_txt, stealth_mode, localization
|
|
100
|
+
|
|
101
|
+
**Note:** The track_changes tool is implemented but currently commented out in server.js (lines 1409-1535)
|
|
100
102
|
|
|
101
103
|
### MCP Server Entry Point
|
|
102
104
|
The main server implementation is in `server.js` which:
|
|
103
|
-
1. Uses
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
105
|
+
1. **Authentication Flow**: Uses AuthManager for API key validation and credit tracking
|
|
106
|
+
- Checks for authentication on startup (skipped in creator mode)
|
|
107
|
+
- Auto-setup if CRAWLFORGE_API_KEY environment variable is present
|
|
108
|
+
- Creator mode enabled via BYPASS_API_KEY=true
|
|
109
|
+
2. **Tool Registration**: All tools registered via `server.registerTool()` pattern
|
|
110
|
+
- Wrapped with `withAuth()` function for credit tracking and authentication
|
|
111
|
+
- Each tool has inline Zod schema for parameter validation
|
|
112
|
+
- Response format uses `content` array with text objects
|
|
113
|
+
3. **Transport**: Uses stdio transport for MCP protocol communication
|
|
114
|
+
4. **Graceful Shutdown**: Cleans up browser instances, job managers, and other resources
|
|
115
|
+
|
|
116
|
+
### Tool Credit System
|
|
117
|
+
Each tool wrapped with `withAuth(toolName, handler)`:
|
|
118
|
+
- Checks credits before execution (skipped in creator mode)
|
|
119
|
+
- Reports usage with credit deduction on success
|
|
120
|
+
- Charges half credits on error
|
|
121
|
+
- Returns credit error if insufficient balance
|
|
108
122
|
|
|
109
123
|
### Key Configuration
|
|
110
124
|
|
|
111
|
-
Critical environment variables
|
|
125
|
+
Critical environment variables defined in `src/constants/config.js`:
|
|
112
126
|
|
|
113
127
|
```bash
|
|
128
|
+
# Authentication (required unless in creator mode)
|
|
129
|
+
CRAWLFORGE_API_KEY=your_api_key_here
|
|
130
|
+
BYPASS_API_KEY=true # Enable creator mode for development
|
|
131
|
+
|
|
114
132
|
# Search Provider (auto, google, duckduckgo)
|
|
115
133
|
SEARCH_PROVIDER=auto
|
|
116
134
|
|
|
@@ -130,6 +148,11 @@ MAX_PAGES_PER_CRAWL=100
|
|
|
130
148
|
RESPECT_ROBOTS_TXT=true
|
|
131
149
|
```
|
|
132
150
|
|
|
151
|
+
### Configuration Files
|
|
152
|
+
- `~/.crawlforge/config.json` - User authentication and API key storage
|
|
153
|
+
- `.env` - Environment variables for development
|
|
154
|
+
- `src/constants/config.js` - Central configuration with defaults and validation
|
|
155
|
+
|
|
133
156
|
## Common Development Tasks
|
|
134
157
|
|
|
135
158
|
### Running a Single Test
|
|
@@ -158,11 +181,22 @@ node tests/validation/wave3-validation.js
|
|
|
158
181
|
```
|
|
159
182
|
|
|
160
183
|
### Debugging Tips
|
|
161
|
-
- Server logs are written to console via Winston logger
|
|
184
|
+
- Server logs are written to console via Winston logger (stderr for status, stdout for MCP protocol)
|
|
162
185
|
- Set `NODE_ENV=development` for verbose logging
|
|
163
|
-
- Use `--expose-gc` flag for memory profiling
|
|
186
|
+
- Use `--expose-gc` flag for memory profiling: `node --expose-gc server.js`
|
|
164
187
|
- Check `cache/` directory for cached responses
|
|
165
188
|
- Review `logs/` directory for application logs
|
|
189
|
+
- Use creator mode during development to bypass authentication: `BYPASS_API_KEY=true npm start`
|
|
190
|
+
- Memory monitoring automatically enabled in development mode (logs every 60s if >200MB)
|
|
191
|
+
|
|
192
|
+
### Adding New Tools
|
|
193
|
+
When adding a new tool to server.js:
|
|
194
|
+
1. Import the tool class from `src/tools/`
|
|
195
|
+
2. Instantiate the tool (with config if needed)
|
|
196
|
+
3. Register with `server.registerTool(name, { description, inputSchema }, withAuth(name, handler))`
|
|
197
|
+
4. Ensure tool implements `execute(params)` method
|
|
198
|
+
5. Add to cleanup array in gracefulShutdown if it has `destroy()` or `cleanup()` methods
|
|
199
|
+
6. Update tool count in console log at server startup (line 1860)
|
|
166
200
|
|
|
167
201
|
## CI/CD Security Integration
|
|
168
202
|
|
|
@@ -313,3 +347,66 @@ The security integration ensures that:
|
|
|
313
347
|
- Comprehensive audit trails are maintained
|
|
314
348
|
- Automated remediation guidance is provided
|
|
315
349
|
|
|
350
|
+
## Important Implementation Patterns
|
|
351
|
+
|
|
352
|
+
### Tool Structure
|
|
353
|
+
All tools follow a consistent class-based pattern:
|
|
354
|
+
```javascript
|
|
355
|
+
export class ToolName {
|
|
356
|
+
constructor(config) {
|
|
357
|
+
this.config = config;
|
|
358
|
+
// Initialize resources
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
async execute(params) {
|
|
362
|
+
// Validate params (Zod validation done in server.js)
|
|
363
|
+
// Execute tool logic
|
|
364
|
+
// Return structured result
|
|
365
|
+
return { success: true, data: {...} };
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
async destroy() {
|
|
369
|
+
// Cleanup resources (browsers, connections, etc.)
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
### Search Provider Architecture
|
|
375
|
+
Search providers implement a factory pattern:
|
|
376
|
+
- `searchProviderFactory.js` selects provider based on config
|
|
377
|
+
- Providers implement common interface: `search(query, options)`
|
|
378
|
+
- Auto-fallback: Google → DuckDuckGo if Google credentials missing
|
|
379
|
+
- Each provider in `src/tools/search/adapters/`
|
|
380
|
+
|
|
381
|
+
### Browser Management
|
|
382
|
+
- Playwright used for browser automation (ActionExecutor, ScrapeWithActionsTool)
|
|
383
|
+
- Stealth features in StealthBrowserManager
|
|
384
|
+
- Always cleanup browsers in error handlers
|
|
385
|
+
- Context isolation per operation for security
|
|
386
|
+
|
|
387
|
+
### Memory Management
|
|
388
|
+
Critical for long-running processes:
|
|
389
|
+
- Graceful shutdown handlers registered for SIGINT/SIGTERM
|
|
390
|
+
- All tools with heavy resources must implement `destroy()` or `cleanup()`
|
|
391
|
+
- Memory monitoring in development mode (server.js lines 1955-1963)
|
|
392
|
+
- Force GC on shutdown if available
|
|
393
|
+
|
|
394
|
+
### Error Handling Pattern
|
|
395
|
+
```javascript
|
|
396
|
+
try {
|
|
397
|
+
const result = await tool.execute(params);
|
|
398
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
399
|
+
} catch (error) {
|
|
400
|
+
return {
|
|
401
|
+
content: [{ type: "text", text: `Operation failed: ${error.message}` }],
|
|
402
|
+
isError: true
|
|
403
|
+
};
|
|
404
|
+
}
|
|
405
|
+
```
|
|
406
|
+
|
|
407
|
+
### Configuration Validation
|
|
408
|
+
- All config in `src/constants/config.js` with defaults
|
|
409
|
+
- `validateConfig()` checks required settings
|
|
410
|
+
- Environment variables parsed with fallbacks
|
|
411
|
+
- Config errors only fail in production (warnings in dev)
|
|
412
|
+
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "3.0.
|
|
3
|
+
"version": "3.0.1",
|
|
4
4
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 16+ comprehensive web scraping, crawling, and content processing tools.",
|
|
5
5
|
"main": "server.js",
|
|
6
6
|
"bin": {
|
|
@@ -12,6 +12,9 @@
|
|
|
12
12
|
"setup": "node setup.js",
|
|
13
13
|
"dev": "cross-env NODE_ENV=development node server.js",
|
|
14
14
|
"test": "node tests/integration/mcp-protocol-compliance.test.js",
|
|
15
|
+
"test:tools": "node test-tools.js",
|
|
16
|
+
"test:real-world": "node test-real-world.js",
|
|
17
|
+
"test:all": "bash run-all-tests.sh",
|
|
15
18
|
"postinstall": "echo '\n🎉 CrawlForge MCP Server installed!\n\nRun \"npx crawlforge-setup\" to configure your API key and get started.\n'",
|
|
16
19
|
"docker:build": "docker build -t crawlforge .",
|
|
17
20
|
"docker:dev": "docker-compose up crawlforge-dev",
|