crawlforge-mcp-server 3.0.3 → 3.0.5
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- package/CLAUDE.md +31 -8
- package/README.md +14 -9
- package/package.json +2 -2
- package/server.js +1 -1
- package/src/core/AlertNotificationSystem.js +1 -1
- package/src/core/AuthManager.js +5 -4
- package/src/core/SnapshotManager.js +12 -2
- package/src/core/connections/ConnectionPool.js +1 -1
- package/src/core/integrations/PerformanceIntegration.js +2 -4
- package/src/core/processing/BrowserProcessor.js +4 -4
- package/src/tools/advanced/ScrapeWithActionsTool.js +10 -2
- package/src/tools/search/adapters/duckduckgoSearch.js +118 -16
- package/src/tools/tracking/trackChanges.js +38 -16
package/CLAUDE.md
CHANGED
|
@@ -4,7 +4,10 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
|
|
4
4
|
|
|
5
5
|
## Project Overview
|
|
6
6
|
|
|
7
|
-
CrawlForge MCP Server - A professional MCP (Model Context Protocol) server implementation providing 19 comprehensive web scraping, crawling, and content processing tools. Version 3.0 includes advanced content extraction, document processing, summarization, and analysis capabilities. Wave 2 adds asynchronous batch processing and browser automation features. Wave 3 introduces deep research orchestration, stealth scraping, localization, and change tracking.
|
|
7
|
+
CrawlForge MCP Server - A professional MCP (Model Context Protocol) server implementation providing 19 comprehensive web scraping, crawling, and content processing tools. Version 3.0.3 includes advanced content extraction, document processing, summarization, and analysis capabilities. Wave 2 adds asynchronous batch processing and browser automation features. Wave 3 introduces deep research orchestration, stealth scraping, localization, and change tracking.
|
|
8
|
+
|
|
9
|
+
**Current Version:** 3.0.3
|
|
10
|
+
**Security Status:** Secure (authentication bypass vulnerability fixed in v3.0.3)
|
|
8
11
|
|
|
9
12
|
## Development Commands
|
|
10
13
|
|
|
@@ -12,11 +15,16 @@ CrawlForge MCP Server - A professional MCP (Model Context Protocol) server imple
|
|
|
12
15
|
# Install dependencies
|
|
13
16
|
npm install
|
|
14
17
|
|
|
15
|
-
# Setup (required for first run)
|
|
18
|
+
# Setup (required for first run - users only)
|
|
16
19
|
npm run setup
|
|
17
20
|
# Or provide API key via environment:
|
|
18
21
|
export CRAWLFORGE_API_KEY="your_api_key_here"
|
|
19
22
|
|
|
23
|
+
# Creator Mode (for package maintainer only)
|
|
24
|
+
# Set your creator secret in .env file:
|
|
25
|
+
# CRAWLFORGE_CREATOR_SECRET=your-secret-uuid
|
|
26
|
+
# This enables unlimited access for development/testing
|
|
27
|
+
|
|
20
28
|
# Run the server (production)
|
|
21
29
|
npm start
|
|
22
30
|
|
|
@@ -104,33 +112,48 @@ Tools are organized in subdirectories by category:
|
|
|
104
112
|
|
|
105
113
|
The main server implementation is in `server.js` which:
|
|
106
114
|
|
|
107
|
-
1. **
|
|
115
|
+
1. **Secure Creator Mode** (server.js lines 3-25):
|
|
116
|
+
- Loads `.env` file early to check for `CRAWLFORGE_CREATOR_SECRET`
|
|
117
|
+
- Validates secret using SHA256 hash comparison
|
|
118
|
+
- Only creator with valid secret UUID can enable unlimited access
|
|
119
|
+
- Hash stored in code is safe to commit (one-way cryptographic hash)
|
|
120
|
+
|
|
121
|
+
2. **Authentication Flow**: Uses AuthManager for API key validation and credit tracking
|
|
108
122
|
- Checks for authentication on startup
|
|
109
123
|
- Auto-setup if CRAWLFORGE_API_KEY environment variable is present
|
|
110
|
-
|
|
124
|
+
- Creator mode bypasses credit checks for development/testing
|
|
125
|
+
|
|
126
|
+
3. **Tool Registration**: All tools registered via `server.registerTool()` pattern
|
|
111
127
|
- Wrapped with `withAuth()` function for credit tracking and authentication
|
|
112
128
|
- Each tool has inline Zod schema for parameter validation
|
|
113
129
|
- Response format uses `content` array with text objects
|
|
114
|
-
|
|
115
|
-
4. **
|
|
130
|
+
|
|
131
|
+
4. **Transport**: Uses stdio transport for MCP protocol communication
|
|
132
|
+
|
|
133
|
+
5. **Graceful Shutdown**: Cleans up browser instances, job managers, and other resources
|
|
116
134
|
|
|
117
135
|
### Tool Credit System
|
|
118
136
|
|
|
119
137
|
Each tool wrapped with `withAuth(toolName, handler)`:
|
|
120
138
|
|
|
121
|
-
- Checks credits before execution
|
|
139
|
+
- Checks credits before execution (skipped in creator mode)
|
|
122
140
|
- Reports usage with credit deduction on success
|
|
123
141
|
- Charges half credits on error
|
|
124
142
|
- Returns credit error if insufficient balance
|
|
143
|
+
- Creator mode: Unlimited access for package maintainer
|
|
125
144
|
|
|
126
145
|
### Key Configuration
|
|
127
146
|
|
|
128
147
|
Critical environment variables defined in `src/constants/config.js`:
|
|
129
148
|
|
|
130
149
|
```bash
|
|
131
|
-
# Authentication (required)
|
|
150
|
+
# Authentication (required for users)
|
|
132
151
|
CRAWLFORGE_API_KEY=your_api_key_here
|
|
133
152
|
|
|
153
|
+
# Creator Mode (maintainer only - KEEP SECRET!)
|
|
154
|
+
# CRAWLFORGE_CREATOR_SECRET=your-uuid-secret
|
|
155
|
+
# Enables unlimited access for development/testing
|
|
156
|
+
|
|
134
157
|
# Search Provider (auto, google, duckduckgo)
|
|
135
158
|
SEARCH_PROVIDER=auto
|
|
136
159
|
|
package/README.md
CHANGED
|
@@ -9,7 +9,7 @@ Professional web scraping and content extraction server implementing the Model C
|
|
|
9
9
|
|
|
10
10
|
## 🎯 Features
|
|
11
11
|
|
|
12
|
-
- **
|
|
12
|
+
- **18 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis
|
|
13
13
|
- **Free Tier**: 1,000 credits to get started instantly
|
|
14
14
|
- **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
|
|
15
15
|
- **Enterprise Ready**: Scale up with paid plans for production use
|
|
@@ -113,7 +113,7 @@ Or use the MCP plugin in Cursor settings.
|
|
|
113
113
|
| **Enterprise** | 250,000 | Large scale operations |
|
|
114
114
|
|
|
115
115
|
**All plans include:**
|
|
116
|
-
- Access to all
|
|
116
|
+
- Access to all 18 tools
|
|
117
117
|
- Credits never expire and roll over month-to-month
|
|
118
118
|
- API access and webhook notifications
|
|
119
119
|
|
|
@@ -125,7 +125,7 @@ Or use the MCP plugin in Cursor settings.
|
|
|
125
125
|
|
|
126
126
|
```bash
|
|
127
127
|
# Optional: Set API key via environment
|
|
128
|
-
export CRAWLFORGE_API_KEY="
|
|
128
|
+
export CRAWLFORGE_API_KEY="cf_live_your_api_key_here"
|
|
129
129
|
|
|
130
130
|
# Optional: Custom API endpoint (for enterprise)
|
|
131
131
|
export CRAWLFORGE_API_URL="https://api.crawlforge.dev"
|
|
@@ -137,7 +137,7 @@ Your configuration is stored at `~/.crawlforge/config.json`:
|
|
|
137
137
|
|
|
138
138
|
```json
|
|
139
139
|
{
|
|
140
|
-
"apiKey": "
|
|
140
|
+
"apiKey": "cf_live_...",
|
|
141
141
|
"userId": "user_...",
|
|
142
142
|
"email": "you@example.com"
|
|
143
143
|
}
|
|
@@ -157,11 +157,16 @@ Once configured, use these tools in your AI assistant:
|
|
|
157
157
|
|
|
158
158
|
## 🔒 Security & Privacy
|
|
159
159
|
|
|
160
|
-
- API keys
|
|
161
|
-
-
|
|
162
|
-
-
|
|
163
|
-
-
|
|
164
|
-
-
|
|
160
|
+
- **Secure Authentication**: API keys required for all operations (no bypass methods)
|
|
161
|
+
- **Local Storage**: API keys stored securely at `~/.crawlforge/config.json`
|
|
162
|
+
- **HTTPS Only**: All connections use encrypted HTTPS
|
|
163
|
+
- **No Data Retention**: We don't store scraped data, only usage logs
|
|
164
|
+
- **Rate Limiting**: Built-in protection against abuse
|
|
165
|
+
- **Compliance**: Respects robots.txt and GDPR requirements
|
|
166
|
+
|
|
167
|
+
### Security Updates
|
|
168
|
+
|
|
169
|
+
**v3.0.3 (2025-10-01)**: Removed authentication bypass vulnerability. All users must authenticate with valid API keys.
|
|
165
170
|
|
|
166
171
|
## 🆘 Support
|
|
167
172
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "3.0.3",
|
|
3
|
+
"version": "3.0.5",
|
|
4
4
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 19 comprehensive web scraping, crawling, and content processing tools.",
|
|
5
5
|
"main": "server.js",
|
|
6
6
|
"bin": {
|
|
@@ -95,12 +95,12 @@
|
|
|
95
95
|
"compromise": "^14.14.4",
|
|
96
96
|
"diff": "^8.0.2",
|
|
97
97
|
"dotenv": "^17.2.1",
|
|
98
|
+
"duck-duck-scrape": "^2.2.7",
|
|
98
99
|
"franc": "^6.2.0",
|
|
99
100
|
"isomorphic-dompurify": "^2.26.0",
|
|
100
101
|
"jsdom": "^26.1.0",
|
|
101
102
|
"lru-cache": "^11.1.0",
|
|
102
103
|
"node-cron": "^3.0.3",
|
|
103
|
-
"node-fetch": "^3.3.2",
|
|
104
104
|
"node-summarizer": "^1.0.7",
|
|
105
105
|
"p-queue": "^8.1.0",
|
|
106
106
|
"pdf-parse": "^1.1.1",
|
package/server.js
CHANGED
|
@@ -97,7 +97,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
|
|
|
97
97
|
}
|
|
98
98
|
|
|
99
99
|
// Create the server
|
|
100
|
-
const server = new McpServer({ name: "crawlforge", version: "3.0.
|
|
100
|
+
const server = new McpServer({ name: "crawlforge", version: "3.0.4" });
|
|
101
101
|
|
|
102
102
|
// Helper function to wrap tool handlers with authentication and credit tracking
|
|
103
103
|
function withAuth(toolName, handler) {
|
package/src/core/AuthManager.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* Handles API key validation, credit tracking, and usage reporting
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
// Using native fetch (Node.js 18+)
|
|
7
7
|
import fs from 'fs/promises';
|
|
8
8
|
import path from 'path';
|
|
9
9
|
|
|
@@ -221,7 +221,7 @@ class AuthManager {
|
|
|
221
221
|
responseStatus,
|
|
222
222
|
processingTime,
|
|
223
223
|
timestamp: new Date().toISOString(),
|
|
224
|
-
version: '3.0.
|
|
224
|
+
version: '3.0.3'
|
|
225
225
|
};
|
|
226
226
|
|
|
227
227
|
await fetch(`${this.apiEndpoint}/api/v1/usage`, {
|
|
@@ -268,12 +268,13 @@ class AuthManager {
|
|
|
268
268
|
deep_research: 10,
|
|
269
269
|
stealth_mode: 10,
|
|
270
270
|
|
|
271
|
-
// Heavy processing (
|
|
271
|
+
// Heavy processing (3-5 credits)
|
|
272
272
|
process_document: 3,
|
|
273
273
|
extract_content: 3,
|
|
274
274
|
scrape_with_actions: 5,
|
|
275
275
|
generate_llms_txt: 3,
|
|
276
|
-
localization: 5
|
|
276
|
+
localization: 5,
|
|
277
|
+
track_changes: 3
|
|
277
278
|
};
|
|
278
279
|
|
|
279
280
|
return costs[tool] || 1;
|
package/src/core/SnapshotManager.js
CHANGED
|
@@ -166,10 +166,20 @@ export class SnapshotManager extends EventEmitter {
|
|
|
166
166
|
*/
|
|
167
167
|
async storeSnapshot(url, content, metadata = {}, options = {}) {
|
|
168
168
|
const operationId = this.generateOperationId();
|
|
169
|
-
|
|
169
|
+
|
|
170
170
|
try {
|
|
171
|
+
// Validate content is not null/undefined
|
|
172
|
+
if (content === null || content === undefined) {
|
|
173
|
+
throw new Error('Content cannot be null or undefined');
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// Ensure content is a string
|
|
177
|
+
if (typeof content !== 'string') {
|
|
178
|
+
content = String(content);
|
|
179
|
+
}
|
|
180
|
+
|
|
171
181
|
this.activeOperations.set(operationId, { type: 'store', url, startTime: Date.now() });
|
|
172
|
-
|
|
182
|
+
|
|
173
183
|
const snapshotId = this.generateSnapshotId(url, metadata.timestamp || Date.now());
|
|
174
184
|
const contentHash = this.hashContent(content);
|
|
175
185
|
|
package/src/core/connections/ConnectionPool.js
CHANGED
|
@@ -218,7 +218,7 @@ export class ConnectionPool extends EventEmitter {
|
|
|
218
218
|
* @returns {Promise<Object>} - Request result
|
|
219
219
|
*/
|
|
220
220
|
async executeRequest(options, requestId) {
|
|
221
|
-
|
|
221
|
+
// Using native fetch (Node.js 18+)
|
|
222
222
|
|
|
223
223
|
const {
|
|
224
224
|
url,
|
package/src/core/integrations/PerformanceIntegration.js
CHANGED
|
@@ -109,8 +109,7 @@ export async function enhancedFetch(url, options = {}) {
|
|
|
109
109
|
const requestOptions = typeof url === 'string' ? { url, ...options } : url;
|
|
110
110
|
return await connectionPoolInstance.request(requestOptions);
|
|
111
111
|
} else {
|
|
112
|
-
// Fallback to node-fetch
|
|
113
|
-
const { default: fetch } = await import('node-fetch');
|
|
112
|
+
// Fallback to native fetch (Node.js 18+)
|
|
114
113
|
return await fetch(url, options);
|
|
115
114
|
}
|
|
116
115
|
}
|
|
@@ -182,8 +181,7 @@ export async function enhancedConcurrentRequests(requests, options = {}) {
|
|
|
182
181
|
if (connectionPoolInstance) {
|
|
183
182
|
return await connectionPoolInstance.requestBatch(requests, options);
|
|
184
183
|
} else {
|
|
185
|
-
// Fallback to Promise.all with node-fetch
|
|
186
|
-
const { default: fetch } = await import('node-fetch');
|
|
184
|
+
// Fallback to Promise.all with native fetch (Node.js 18+)
|
|
187
185
|
const promises = requests.map(request => fetch(request.url || request, request));
|
|
188
186
|
return await Promise.all(promises);
|
|
189
187
|
}
|
package/src/core/processing/BrowserProcessor.js
CHANGED
|
@@ -333,8 +333,8 @@ export class BrowserProcessor {
|
|
|
333
333
|
const { context, contextId } = await this.stealthManager.createStealthContext({
|
|
334
334
|
level: options.stealthMode.level,
|
|
335
335
|
customViewport: {
|
|
336
|
-
width: options.viewportWidth,
|
|
337
|
-
height: options.viewportHeight
|
|
336
|
+
width: options.viewportWidth || 1280,
|
|
337
|
+
height: options.viewportHeight || 720
|
|
338
338
|
}
|
|
339
339
|
});
|
|
340
340
|
|
|
@@ -475,8 +475,8 @@ export class BrowserProcessor {
|
|
|
475
475
|
async createPage(options) {
|
|
476
476
|
const contextOptions = {
|
|
477
477
|
viewport: {
|
|
478
|
-
width: options.viewportWidth,
|
|
479
|
-
height: options.viewportHeight
|
|
478
|
+
width: options.viewportWidth || 1280,
|
|
479
|
+
height: options.viewportHeight || 720
|
|
480
480
|
},
|
|
481
481
|
userAgent: options.userAgent,
|
|
482
482
|
extraHTTPHeaders: options.extraHeaders,
|
package/src/tools/advanced/ScrapeWithActionsTool.js
CHANGED
|
@@ -147,7 +147,12 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
147
147
|
enableLogging = true,
|
|
148
148
|
enableCaching = false,
|
|
149
149
|
maxConcurrentSessions = 3,
|
|
150
|
-
defaultBrowserOptions = {
|
|
150
|
+
defaultBrowserOptions = {
|
|
151
|
+
viewportWidth: 1280,
|
|
152
|
+
viewportHeight: 720,
|
|
153
|
+
headless: true,
|
|
154
|
+
timeout: 30000
|
|
155
|
+
},
|
|
151
156
|
screenshotPath = './screenshots'
|
|
152
157
|
} = options;
|
|
153
158
|
|
|
@@ -317,7 +322,10 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
317
322
|
sessionId: sessionContext.id,
|
|
318
323
|
url: params.url,
|
|
319
324
|
executionTime,
|
|
320
|
-
|
|
325
|
+
|
|
326
|
+
// Include error message if action chain failed
|
|
327
|
+
error: chainResult.error || undefined,
|
|
328
|
+
|
|
321
329
|
actionResults,
|
|
322
330
|
totalActions: params.actions.length,
|
|
323
331
|
successfulActions: actionResults.filter(r => r.success).length,
|
package/src/tools/search/adapters/duckduckgoSearch.js
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import * as cheerio from 'cheerio';
|
|
2
|
+
import { search as ddgSearch, SafeSearchType, SearchTimeType } from 'duck-duck-scrape';
|
|
2
3
|
|
|
3
4
|
export class DuckDuckGoSearchAdapter {
|
|
4
5
|
constructor(options = {}) {
|
|
5
6
|
this.timeout = options.timeout || 30000;
|
|
6
7
|
this.maxRetries = options.maxRetries || 3;
|
|
7
|
-
this.userAgent = options.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
|
8
|
-
this.retryDelay = options.retryDelay ||
|
|
8
|
+
this.userAgent = options.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
|
9
|
+
this.retryDelay = options.retryDelay || 2000; // Increased base delay
|
|
9
10
|
this.baseUrl = 'https://html.duckduckgo.com/html/';
|
|
10
11
|
}
|
|
11
12
|
|
|
@@ -19,19 +20,36 @@ export class DuckDuckGoSearchAdapter {
|
|
|
19
20
|
dateRestrict
|
|
20
21
|
} = params;
|
|
21
22
|
|
|
22
|
-
//
|
|
23
|
+
// Try duck-duck-scrape library first (more reliable API access)
|
|
24
|
+
try {
|
|
25
|
+
const results = await this.searchWithLibrary(query, num, safe, dateRestrict);
|
|
26
|
+
if (results.items && results.items.length > 0) {
|
|
27
|
+
return results;
|
|
28
|
+
}
|
|
29
|
+
} catch (libraryError) {
|
|
30
|
+
console.warn('DuckDuckGo library search failed:', libraryError.message);
|
|
31
|
+
// Check if it's a CAPTCHA/anomaly error
|
|
32
|
+
if (libraryError.message.includes('anomaly') || libraryError.message.includes('too quickly')) {
|
|
33
|
+
throw new Error(
|
|
34
|
+
'DuckDuckGo is blocking automated requests. ' +
|
|
35
|
+
'To use web search reliably, please configure Google Custom Search API by setting ' +
|
|
36
|
+
'GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID environment variables. ' +
|
|
37
|
+
'See: https://developers.google.com/custom-search/v1/introduction'
|
|
38
|
+
);
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
// Fallback to HTML scraping (legacy method)
|
|
23
43
|
const offset = (start - 1) * num;
|
|
24
44
|
|
|
25
|
-
// Build form data for POST request to DuckDuckGo HTML endpoint
|
|
26
45
|
const formData = new URLSearchParams({
|
|
27
46
|
q: query,
|
|
28
|
-
b: offset.toString(),
|
|
29
|
-
kl: 'us-en',
|
|
30
|
-
df: '',
|
|
31
|
-
safe: 'moderate'
|
|
47
|
+
b: offset.toString(),
|
|
48
|
+
kl: 'us-en',
|
|
49
|
+
df: '',
|
|
50
|
+
safe: 'moderate'
|
|
32
51
|
});
|
|
33
52
|
|
|
34
|
-
// Update safe search parameter
|
|
35
53
|
if (safe === 'active') {
|
|
36
54
|
formData.set('safe', 'strict');
|
|
37
55
|
} else if (safe === 'off') {
|
|
@@ -40,13 +58,11 @@ export class DuckDuckGoSearchAdapter {
|
|
|
40
58
|
formData.set('safe', 'moderate');
|
|
41
59
|
}
|
|
42
60
|
|
|
43
|
-
// Add language if specified
|
|
44
61
|
if (lr && lr.startsWith('lang_')) {
|
|
45
62
|
const lang = lr.replace('lang_', '');
|
|
46
63
|
formData.set('kl', this.mapLanguageCode(lang));
|
|
47
64
|
}
|
|
48
65
|
|
|
49
|
-
// Add date filter if specified
|
|
50
66
|
if (dateRestrict) {
|
|
51
67
|
const timeFilter = this.mapDateRestrict(dateRestrict);
|
|
52
68
|
if (timeFilter) {
|
|
@@ -57,15 +73,20 @@ export class DuckDuckGoSearchAdapter {
|
|
|
57
73
|
let lastError;
|
|
58
74
|
for (let attempt = 1; attempt <= this.maxRetries; attempt++) {
|
|
59
75
|
try {
|
|
76
|
+
// Add delay between attempts to avoid rate limiting
|
|
77
|
+
if (attempt > 1) {
|
|
78
|
+
await new Promise(resolve =>
|
|
79
|
+
setTimeout(resolve, this.retryDelay * Math.pow(2, attempt - 1))
|
|
80
|
+
);
|
|
81
|
+
}
|
|
82
|
+
|
|
60
83
|
const htmlResponse = await this.makeRequest(formData);
|
|
61
84
|
return this.parseHtmlResponse(htmlResponse, query, num, start);
|
|
62
85
|
} catch (error) {
|
|
63
86
|
lastError = error;
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
setTimeout(resolve, this.retryDelay * Math.pow(2, attempt - 1))
|
|
68
|
-
);
|
|
87
|
+
// If it's a CAPTCHA error, don't retry - it won't help
|
|
88
|
+
if (error.message.includes('CAPTCHA') || error.message.includes('automated requests')) {
|
|
89
|
+
throw error;
|
|
69
90
|
}
|
|
70
91
|
}
|
|
71
92
|
}
|
|
@@ -73,6 +94,67 @@ export class DuckDuckGoSearchAdapter {
|
|
|
73
94
|
throw new Error(`DuckDuckGo search failed after ${this.maxRetries} attempts: ${lastError.message}`);
|
|
74
95
|
}
|
|
75
96
|
|
|
97
|
+
async searchWithLibrary(query, num, safe, dateRestrict) {
|
|
98
|
+
// Map safe search settings
|
|
99
|
+
let safeSearch = SafeSearchType.MODERATE;
|
|
100
|
+
if (safe === 'active' || safe === 'strict') {
|
|
101
|
+
safeSearch = SafeSearchType.STRICT;
|
|
102
|
+
} else if (safe === 'off') {
|
|
103
|
+
safeSearch = SafeSearchType.OFF;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// Map time filter
|
|
107
|
+
let time = undefined;
|
|
108
|
+
if (dateRestrict) {
|
|
109
|
+
const timeMap = {
|
|
110
|
+
'd1': SearchTimeType.DAY,
|
|
111
|
+
'w1': SearchTimeType.WEEK,
|
|
112
|
+
'm1': SearchTimeType.MONTH,
|
|
113
|
+
'y1': SearchTimeType.YEAR
|
|
114
|
+
};
|
|
115
|
+
time = timeMap[dateRestrict];
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
const searchResults = await ddgSearch(query, {
|
|
119
|
+
safeSearch,
|
|
120
|
+
time,
|
|
121
|
+
locale: 'en-us'
|
|
122
|
+
});
|
|
123
|
+
|
|
124
|
+
// Transform results to match expected format
|
|
125
|
+
const items = (searchResults.results || []).slice(0, num).map(result => ({
|
|
126
|
+
title: result.title || '',
|
|
127
|
+
link: result.url || '',
|
|
128
|
+
snippet: result.description || '',
|
|
129
|
+
displayLink: this.extractDomain(result.url),
|
|
130
|
+
formattedUrl: result.url || '',
|
|
131
|
+
htmlSnippet: result.description || '',
|
|
132
|
+
pagemap: {
|
|
133
|
+
metatags: {
|
|
134
|
+
title: result.title || '',
|
|
135
|
+
description: result.description || ''
|
|
136
|
+
}
|
|
137
|
+
},
|
|
138
|
+
metadata: {
|
|
139
|
+
source: 'duckduckgo_api',
|
|
140
|
+
type: 'web_result',
|
|
141
|
+
hostname: result.hostname || '',
|
|
142
|
+
icon: result.icon || ''
|
|
143
|
+
}
|
|
144
|
+
}));
|
|
145
|
+
|
|
146
|
+
return {
|
|
147
|
+
kind: 'duckduckgo#search',
|
|
148
|
+
searchInformation: {
|
|
149
|
+
searchTime: 0.1,
|
|
150
|
+
formattedSearchTime: '0.10',
|
|
151
|
+
totalResults: items.length.toString(),
|
|
152
|
+
formattedTotalResults: items.length.toLocaleString()
|
|
153
|
+
},
|
|
154
|
+
items: items
|
|
155
|
+
};
|
|
156
|
+
}
|
|
157
|
+
|
|
76
158
|
async makeRequest(formData) {
|
|
77
159
|
const controller = new AbortController();
|
|
78
160
|
const timeoutId = setTimeout(() => controller.abort(), this.timeout);
|
|
@@ -121,6 +203,26 @@ export class DuckDuckGoSearchAdapter {
|
|
|
121
203
|
const $ = cheerio.load(html);
|
|
122
204
|
const items = [];
|
|
123
205
|
|
|
206
|
+
// Check for CAPTCHA challenge (DuckDuckGo bot protection)
|
|
207
|
+
const captchaIndicators = [
|
|
208
|
+
'anomaly-modal',
|
|
209
|
+
'Unfortunately, bots use DuckDuckGo too',
|
|
210
|
+
'Select all squares containing a duck',
|
|
211
|
+
'confirm this search was made by a human',
|
|
212
|
+
'challenge-form'
|
|
213
|
+
];
|
|
214
|
+
|
|
215
|
+
for (const indicator of captchaIndicators) {
|
|
216
|
+
if (html.includes(indicator)) {
|
|
217
|
+
throw new Error(
|
|
218
|
+
'DuckDuckGo CAPTCHA detected - automated requests are being blocked. ' +
|
|
219
|
+
'To use web search reliably, please configure Google Custom Search API by setting ' +
|
|
220
|
+
'GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID environment variables. ' +
|
|
221
|
+
'See: https://developers.google.com/custom-search/v1/introduction'
|
|
222
|
+
);
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
|
|
124
226
|
// Look for search result containers - DuckDuckGo uses various selectors
|
|
125
227
|
const resultSelectors = [
|
|
126
228
|
'.result', // Primary result class
|
package/src/tools/tracking/trackChanges.js
CHANGED
|
@@ -285,29 +285,40 @@ export class TrackChangesTool extends EventEmitter {
|
|
|
285
285
|
* @returns {Object} - Baseline creation results
|
|
286
286
|
*/
|
|
287
287
|
async createBaseline(params) {
|
|
288
|
-
const { url, content, html, trackingOptions, storageOptions } = params;
|
|
289
|
-
|
|
288
|
+
const { url, content, html, trackingOptions, storageOptions = {} } = params;
|
|
289
|
+
|
|
290
|
+
// Apply defaults for storageOptions fields
|
|
291
|
+
const enableSnapshots = storageOptions.enableSnapshots !== false; // Default to true
|
|
292
|
+
|
|
290
293
|
try {
|
|
291
294
|
// Fetch content if not provided
|
|
292
295
|
let sourceContent = content || html;
|
|
293
296
|
let fetchMetadata = {};
|
|
294
|
-
|
|
297
|
+
|
|
295
298
|
if (!sourceContent) {
|
|
296
299
|
const fetchResult = await this.fetchContent(url);
|
|
300
|
+
if (!fetchResult || !fetchResult.content) {
|
|
301
|
+
throw new Error('Failed to fetch content from URL');
|
|
302
|
+
}
|
|
297
303
|
sourceContent = fetchResult.content;
|
|
298
|
-
fetchMetadata = fetchResult.metadata;
|
|
304
|
+
fetchMetadata = fetchResult.metadata || {};
|
|
299
305
|
}
|
|
300
|
-
|
|
306
|
+
|
|
307
|
+
// Validate sourceContent
|
|
308
|
+
if (!sourceContent || typeof sourceContent !== 'string') {
|
|
309
|
+
throw new Error('Invalid content: content must be a non-empty string');
|
|
310
|
+
}
|
|
311
|
+
|
|
301
312
|
// Create baseline with change tracker
|
|
302
313
|
const baseline = await this.changeTracker.createBaseline(
|
|
303
314
|
url,
|
|
304
315
|
sourceContent,
|
|
305
316
|
trackingOptions
|
|
306
317
|
);
|
|
307
|
-
|
|
308
|
-
// Store snapshot if enabled
|
|
318
|
+
|
|
319
|
+
// Store snapshot if enabled (defaults to true)
|
|
309
320
|
let snapshotInfo = null;
|
|
310
|
-
if (
|
|
321
|
+
if (enableSnapshots) {
|
|
311
322
|
const snapshotResult = await this.snapshotManager.storeSnapshot(
|
|
312
323
|
url,
|
|
313
324
|
sourceContent,
|
|
@@ -347,29 +358,40 @@ export class TrackChangesTool extends EventEmitter {
|
|
|
347
358
|
* @returns {Object} - Comparison results
|
|
348
359
|
*/
|
|
349
360
|
async compareWithBaseline(params) {
|
|
350
|
-
const { url, content, html, trackingOptions, storageOptions, notificationOptions } = params;
|
|
351
|
-
|
|
361
|
+
const { url, content, html, trackingOptions, storageOptions = {}, notificationOptions } = params;
|
|
362
|
+
|
|
363
|
+
// Apply defaults for storageOptions fields
|
|
364
|
+
const enableSnapshots = storageOptions.enableSnapshots !== false; // Default to true
|
|
365
|
+
|
|
352
366
|
try {
|
|
353
367
|
// Fetch current content if not provided
|
|
354
368
|
let currentContent = content || html;
|
|
355
369
|
let fetchMetadata = {};
|
|
356
|
-
|
|
370
|
+
|
|
357
371
|
if (!currentContent) {
|
|
358
372
|
const fetchResult = await this.fetchContent(url);
|
|
373
|
+
if (!fetchResult || !fetchResult.content) {
|
|
374
|
+
throw new Error('Failed to fetch content from URL');
|
|
375
|
+
}
|
|
359
376
|
currentContent = fetchResult.content;
|
|
360
|
-
fetchMetadata = fetchResult.metadata;
|
|
377
|
+
fetchMetadata = fetchResult.metadata || {};
|
|
361
378
|
}
|
|
362
|
-
|
|
379
|
+
|
|
380
|
+
// Validate currentContent
|
|
381
|
+
if (!currentContent || typeof currentContent !== 'string') {
|
|
382
|
+
throw new Error('Invalid content: content must be a non-empty string');
|
|
383
|
+
}
|
|
384
|
+
|
|
363
385
|
// Perform comparison
|
|
364
386
|
const comparisonResult = await this.changeTracker.compareWithBaseline(
|
|
365
387
|
url,
|
|
366
388
|
currentContent,
|
|
367
389
|
trackingOptions
|
|
368
390
|
);
|
|
369
|
-
|
|
370
|
-
// Store snapshot if changes detected and storage enabled
|
|
391
|
+
|
|
392
|
+
// Store snapshot if changes detected and storage enabled (defaults to true)
|
|
371
393
|
let snapshotInfo = null;
|
|
372
|
-
if (comparisonResult.hasChanges &&
|
|
394
|
+
if (comparisonResult.hasChanges && enableSnapshots) {
|
|
373
395
|
const snapshotResult = await this.snapshotManager.storeSnapshot(
|
|
374
396
|
url,
|
|
375
397
|
currentContent,
|