crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,615 @@
|
|
|
1
|
+
import dotenv from 'dotenv';
|
|
2
|
+
import { fileURLToPath } from 'url';
|
|
3
|
+
import { dirname, join } from 'path';
|
|
4
|
+
|
|
5
|
+
// ES modules do not define __filename/__dirname, so derive them from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Load environment variables from the .env file two directories above this module.
const envFilePath = join(__dirname, '../../.env');
dotenv.config({ path: envFilePath, quiet: true });
|
|
9
|
+
|
|
10
|
+
/**
 * Read an integer setting from the environment.
 *
 * Unset or empty variables yield `fallback`. Values that do not parse as a
 * base-10 integer also yield `fallback` (with a warning) instead of leaking
 * `NaN` into the configuration.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset or invalid.
 * @returns {number} Parsed integer or `fallback`.
 */
function readIntEnv(name, fallback) {
  const raw = process.env[name];
  if (!raw) {
    return fallback;
  }
  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed)) {
    console.warn(`[config] ${name} is not a valid integer; using default ${fallback}`);
    return fallback;
  }
  return parsed;
}

/**
 * Read a floating-point setting from the environment.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset or invalid.
 * @returns {number} Parsed number or `fallback`.
 */
function readFloatEnv(name, fallback) {
  const raw = process.env[name];
  if (!raw) {
    return fallback;
  }
  const parsed = Number.parseFloat(raw);
  if (Number.isNaN(parsed)) {
    console.warn(`[config] ${name} is not a valid number; using default ${fallback}`);
    return fallback;
  }
  return parsed;
}

/**
 * Read a comma-separated list from the environment.
 *
 * Entries are trimmed and empty entries are dropped, so values such as
 * "a.com, b.com," produce ['a.com', 'b.com'].
 *
 * @param {string} name - Environment variable name.
 * @param {string} fallback - Comma-separated default used when unset or empty.
 * @returns {string[]} List of non-empty, trimmed entries.
 */
function readListEnv(name, fallback) {
  const raw = process.env[name] || fallback;
  return raw
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);
}

/**
 * Read a JSON-encoded setting from the environment.
 *
 * Malformed JSON yields `fallback` (with a warning) rather than throwing
 * while this module is being imported.
 *
 * @param {string} name - Environment variable name.
 * @param {*} fallback - Value used when the variable is unset or malformed.
 * @returns {*} Parsed JSON value or `fallback`.
 */
function readJsonEnv(name, fallback) {
  const raw = process.env[name];
  if (!raw) {
    return fallback;
  }
  try {
    return JSON.parse(raw);
  } catch (err) {
    console.warn(`[config] ${name} is not valid JSON (${err.message}); using default`);
    return fallback;
  }
}

/**
 * Application configuration, resolved once from environment variables when
 * this module is first evaluated. Boolean flags compared with `!== 'false'`
 * default to enabled; flags compared with `=== 'true'` default to disabled.
 */
export const config = {
  // Search Provider Configuration
  search: {
    provider: process.env.SEARCH_PROVIDER || 'auto', // 'google', 'duckduckgo', or 'auto'

    // Google Search API
    google: {
      apiKey: process.env.GOOGLE_API_KEY || '',
      searchEngineId: process.env.GOOGLE_SEARCH_ENGINE_ID || ''
    },

    // DuckDuckGo Configuration
    duckduckgo: {
      timeout: readIntEnv('DUCKDUCKGO_TIMEOUT', 30000),
      maxRetries: readIntEnv('DUCKDUCKGO_MAX_RETRIES', 3),
      retryDelay: readIntEnv('DUCKDUCKGO_RETRY_DELAY', 1000),
      userAgent: process.env.DUCKDUCKGO_USER_AGENT || process.env.USER_AGENT || 'CrawlForge/1.0'
    }
  },

  // Performance
  performance: {
    maxWorkers: readIntEnv('MAX_WORKERS', 10),
    queueConcurrency: readIntEnv('QUEUE_CONCURRENCY', 10),
    cacheMaxSize: readIntEnv('CACHE_MAX_SIZE', 1000),
    cacheTTL: readIntEnv('CACHE_TTL', 3600000),
    cacheEnableDisk: process.env.CACHE_ENABLE_DISK !== 'false',
    cacheDir: process.env.CACHE_DIR || './cache'
  },

  // Rate Limiting
  rateLimit: {
    requestsPerSecond: readIntEnv('RATE_LIMIT_REQUESTS_PER_SECOND', 10),
    requestsPerMinute: readIntEnv('RATE_LIMIT_REQUESTS_PER_MINUTE', 100),
    perDomain: process.env.RATE_LIMIT_PER_DOMAIN !== 'false'
  },

  // Crawling
  crawling: {
    maxDepth: readIntEnv('MAX_CRAWL_DEPTH', 5),
    maxPages: readIntEnv('MAX_PAGES_PER_CRAWL', 100),
    respectRobots: process.env.RESPECT_ROBOTS_TXT !== 'false',
    userAgent: process.env.USER_AGENT || 'CrawlForge/1.0',
    timeout: readIntEnv('CRAWL_TIMEOUT', 30000),
    followExternal: process.env.FOLLOW_EXTERNAL_LINKS === 'true'
  },

  // Search ranking and deduplication
  searchProcessing: {
    enableRanking: process.env.ENABLE_SEARCH_RANKING !== 'false',
    enableDeduplication: process.env.ENABLE_SEARCH_DEDUPLICATION !== 'false',

    // Ranking configuration
    ranking: {
      weights: {
        bm25: readFloatEnv('RANKING_WEIGHT_BM25', 0.4),
        semantic: readFloatEnv('RANKING_WEIGHT_SEMANTIC', 0.3),
        authority: readFloatEnv('RANKING_WEIGHT_AUTHORITY', 0.2),
        freshness: readFloatEnv('RANKING_WEIGHT_FRESHNESS', 0.1)
      },
      bm25: {
        k1: readFloatEnv('BM25_K1', 1.5),
        b: readFloatEnv('BM25_B', 0.75)
      }
    },

    // Deduplication configuration
    deduplication: {
      thresholds: {
        url: readFloatEnv('DEDUP_THRESHOLD_URL', 0.8),
        title: readFloatEnv('DEDUP_THRESHOLD_TITLE', 0.75),
        content: readFloatEnv('DEDUP_THRESHOLD_CONTENT', 0.7),
        combined: readFloatEnv('DEDUP_THRESHOLD_COMBINED', 0.6)
      },
      strategies: {
        urlNormalization: process.env.DEDUP_URL_NORMALIZATION !== 'false',
        titleFuzzy: process.env.DEDUP_TITLE_FUZZY !== 'false',
        contentSimhash: process.env.DEDUP_CONTENT_SIMHASH !== 'false',
        domainClustering: process.env.DEDUP_DOMAIN_CLUSTERING !== 'false'
      }
    }
  },

  // Security Configuration
  security: {
    // SSRF Protection
    ssrfProtection: {
      enabled: process.env.SSRF_PROTECTION_ENABLED !== 'false',
      allowedProtocols: readListEnv('ALLOWED_PROTOCOLS', 'http:,https:'),
      maxRequestSize: readIntEnv('MAX_REQUEST_SIZE', 104857600), // 100MB
      maxTimeout: readIntEnv('MAX_REQUEST_TIMEOUT', 60000), // 60s
      maxRedirects: readIntEnv('MAX_REDIRECTS', 5),
      allowedDomains: readListEnv('ALLOWED_DOMAINS', ''),
      blockedDomains: readListEnv(
        'BLOCKED_DOMAINS',
        'localhost,127.0.0.1,0.0.0.0,metadata.google.internal,169.254.169.254,metadata.azure.com'
      )
    },

    // Input Validation
    inputValidation: {
      enabled: process.env.INPUT_VALIDATION_ENABLED !== 'false',
      maxStringLength: readIntEnv('MAX_STRING_LENGTH', 10000),
      maxArrayLength: readIntEnv('MAX_ARRAY_LENGTH', 1000),
      maxObjectDepth: readIntEnv('MAX_OBJECT_DEPTH', 10),
      maxRegexLength: readIntEnv('MAX_REGEX_LENGTH', 500),
      strictMode: process.env.STRICT_VALIDATION_MODE === 'true'
    },

    // API Security
    apiSecurity: {
      requireAuthentication: process.env.REQUIRE_AUTHENTICATION === 'true',
      apiKeyHeader: process.env.API_KEY_HEADER || 'X-API-Key',
      apiKey: process.env.API_KEY || '',
      rateLimitByKey: process.env.RATE_LIMIT_BY_KEY === 'true',
      auditLogging: process.env.AUDIT_LOGGING !== 'false'
    },

    // Content Security
    contentSecurity: {
      sanitizeHTML: process.env.SANITIZE_HTML !== 'false',
      allowedHTMLTags: readListEnv('ALLOWED_HTML_TAGS', 'p,br,strong,em,u,h1,h2,h3,h4,h5,h6'),
      blockScripts: process.env.BLOCK_SCRIPTS !== 'false',
      blockIframes: process.env.BLOCK_IFRAMES !== 'false'
    }
  },

  // Monitoring
  monitoring: {
    enableMetrics: process.env.ENABLE_METRICS === 'true',
    logLevel: process.env.LOG_LEVEL || 'info',
    securityLogging: process.env.SECURITY_LOGGING !== 'false',
    violationLogging: process.env.VIOLATION_LOGGING !== 'false'
  },

  // Server
  server: {
    nodeEnv: process.env.NODE_ENV || 'development',
    port: readIntEnv('PORT', 3000),
    enableSecurityHeaders: process.env.ENABLE_SECURITY_HEADERS !== 'false'
  },

  // Stealth Mode Configuration
  stealth: {
    // Global stealth settings
    enabled: process.env.STEALTH_MODE_ENABLED === 'true',
    defaultLevel: process.env.STEALTH_LEVEL || 'medium', // 'basic', 'medium', 'advanced'

    // Browser fingerprinting
    fingerprinting: {
      randomizeUserAgent: process.env.STEALTH_RANDOMIZE_USER_AGENT !== 'false',
      randomizeViewport: process.env.STEALTH_RANDOMIZE_VIEWPORT !== 'false',
      spoofTimezone: process.env.STEALTH_SPOOF_TIMEZONE !== 'false',
      hideWebDriver: process.env.STEALTH_HIDE_WEBDRIVER !== 'false',
      blockWebRTC: process.env.STEALTH_BLOCK_WEBRTC !== 'false',
      customUserAgent: process.env.STEALTH_CUSTOM_USER_AGENT || null
    },

    // Human behavior simulation
    humanBehavior: {
      enabled: process.env.STEALTH_HUMAN_BEHAVIOR_ENABLED !== 'false',
      mouseMovements: process.env.STEALTH_MOUSE_MOVEMENTS !== 'false',
      naturalTyping: process.env.STEALTH_NATURAL_TYPING !== 'false',
      scrollBehavior: process.env.STEALTH_SCROLL_BEHAVIOR !== 'false',
      idlePeriods: process.env.STEALTH_IDLE_PERIODS !== 'false',
      readingSimulation: process.env.STEALTH_READING_SIMULATION !== 'false',

      // Timing configurations
      mouseSpeed: process.env.STEALTH_MOUSE_SPEED || 'normal', // 'slow', 'normal', 'fast'
      typingSpeed: process.env.STEALTH_TYPING_SPEED || 'normal', // 'slow', 'normal', 'fast'
      typingVariability: readFloatEnv('STEALTH_TYPING_VARIABILITY', 0.3), // 0.0 to 1.0
      mistakeFrequency: readFloatEnv('STEALTH_MISTAKE_FREQUENCY', 0.02), // 2% mistake rate

      // Idle period settings
      idleFrequency: readFloatEnv('STEALTH_IDLE_FREQUENCY', 0.1), // 10% chance
      idleMinDuration: readIntEnv('STEALTH_IDLE_MIN_DURATION', 1000), // 1 second
      idleMaxDuration: readIntEnv('STEALTH_IDLE_MAX_DURATION', 5000), // 5 seconds

      // Click behavior
      hoverBeforeClick: process.env.STEALTH_HOVER_BEFORE_CLICK !== 'false',
      clickDelayMin: readIntEnv('STEALTH_CLICK_DELAY_MIN', 100),
      clickDelayMax: readIntEnv('STEALTH_CLICK_DELAY_MAX', 300)
    },

    // Advanced anti-detection
    antiDetection: {
      bypassHeadlessDetection: process.env.STEALTH_BYPASS_HEADLESS !== 'false',
      spoofPlugins: process.env.STEALTH_SPOOF_PLUGINS !== 'false',
      spoofPermissions: process.env.STEALTH_SPOOF_PERMISSIONS !== 'false',
      mockBattery: process.env.STEALTH_MOCK_BATTERY !== 'false',
      preventCanvasFingerprinting: process.env.STEALTH_PREVENT_CANVAS !== 'false',
      preventWebGLFingerprinting: process.env.STEALTH_PREVENT_WEBGL !== 'false',
      networkEmulation: process.env.STEALTH_NETWORK_EMULATION === 'true'
    },

    // Resource optimization for stealth
    resources: {
      blockImages: process.env.STEALTH_BLOCK_IMAGES === 'true',
      blockFonts: process.env.STEALTH_BLOCK_FONTS === 'true',
      blockStylesheets: process.env.STEALTH_BLOCK_CSS === 'true',
      allowTrackingPixels: process.env.STEALTH_ALLOW_TRACKING === 'true', // Allow some tracking to appear normal
      maxConcurrentContexts: readIntEnv('STEALTH_MAX_CONTEXTS', 5)
    },

    // Geolocation spoofing
    geolocation: {
      enabled: process.env.STEALTH_SPOOF_GEOLOCATION === 'true',
      latitude: readFloatEnv('STEALTH_LATITUDE', 40.7128), // NYC default
      longitude: readFloatEnv('STEALTH_LONGITUDE', -74.006),
      accuracy: readIntEnv('STEALTH_LOCATION_ACCURACY', 100)
    }
  },

  // Localization Configuration
  localization: {
    // Global localization settings
    enabled: process.env.LOCALIZATION_ENABLED === 'true',
    defaultCountry: process.env.DEFAULT_COUNTRY_CODE || 'US',
    defaultLanguage: process.env.DEFAULT_LANGUAGE || 'en-US',

    // Proxy configuration for geo-specific access
    proxy: {
      enabled: process.env.LOCALIZATION_PROXY_ENABLED === 'true',
      rotation: {
        enabled: process.env.PROXY_ROTATION_ENABLED === 'true',
        interval: readIntEnv('PROXY_ROTATION_INTERVAL', 300000), // 5 minutes
        strategy: process.env.PROXY_ROTATION_STRATEGY || 'round-robin'
      },
      healthCheck: {
        enabled: process.env.PROXY_HEALTH_CHECK_ENABLED !== 'false',
        interval: readIntEnv('PROXY_HEALTH_CHECK_INTERVAL', 300000), // 5 minutes
        timeout: readIntEnv('PROXY_HEALTH_CHECK_TIMEOUT', 10000)
      },
      fallback: {
        enabled: process.env.PROXY_FALLBACK_ENABLED !== 'false',
        maxRetries: readIntEnv('PROXY_MAX_RETRIES', 3),
        timeout: readIntEnv('PROXY_TIMEOUT', 10000)
      }
    },

    // Translation services
    translation: {
      enabled: process.env.TRANSLATION_ENABLED === 'true',
      defaultProvider: process.env.TRANSLATION_PROVIDER || 'google',
      autoDetect: process.env.TRANSLATION_AUTO_DETECT !== 'false',
      preserveFormatting: process.env.TRANSLATION_PRESERVE_FORMAT !== 'false',
      cacheEnabled: process.env.TRANSLATION_CACHE_ENABLED !== 'false',
      cacheTTL: readIntEnv('TRANSLATION_CACHE_TTL', 86400000) // 24 hours
    },

    // Geo-blocking bypass
    geoBlocking: {
      autoBypass: process.env.GEO_BLOCKING_AUTO_BYPASS === 'true',
      maxRetries: readIntEnv('GEO_BLOCKING_MAX_RETRIES', 3),
      retryDelay: readIntEnv('GEO_BLOCKING_RETRY_DELAY', 2000),
      fallbackCountries: readListEnv('GEO_BLOCKING_FALLBACK_COUNTRIES', 'US,GB,DE,CA'),
      detectionSensitivity: process.env.GEO_BLOCKING_DETECTION_SENSITIVITY || 'medium'
    },

    // Cultural browsing simulation
    cultural: {
      enabled: process.env.CULTURAL_SIMULATION_ENABLED === 'true',
      adaptBehavior: process.env.CULTURAL_ADAPT_BEHAVIOR !== 'false',
      adaptTiming: process.env.CULTURAL_ADAPT_TIMING !== 'false',
      respectRTL: process.env.CULTURAL_RESPECT_RTL !== 'false'
    },

    // DNS configuration
    dns: {
      enabled: process.env.LOCALIZATION_DNS_ENABLED === 'true',
      overHttps: process.env.DNS_OVER_HTTPS === 'true',
      // JSON-encoded; malformed input falls back to {} instead of crashing the import
      customResolvers: readJsonEnv('CUSTOM_DNS_RESOLVERS', {}),
      preferredCountry: process.env.DNS_PREFERRED_COUNTRY || null
    }
  }
};
|
|
284
|
+
|
|
285
|
+
/**
 * Validate the resolved configuration.
 *
 * Checks, in order: search-provider credentials (production only), the
 * SEARCH_PROVIDER value, numeric limits, and the localization section.
 * Numeric settings that resolved to NaN (non-numeric environment values) are
 * reported explicitly; a plain `value > limit` comparison is always false for
 * NaN and would let them through unnoticed.
 *
 * @returns {string[]} Human-readable error messages; empty when valid.
 */
export function validateConfig() {
  const errors = [];

  // Check search provider configuration
  const provider = getActiveSearchProvider();

  if (config.server.nodeEnv === 'production') {
    if (provider === 'google') {
      if (!config.search.google.apiKey) {
        errors.push('GOOGLE_API_KEY is required when using Google search provider in production');
      }
      if (!config.search.google.searchEngineId) {
        errors.push('GOOGLE_SEARCH_ENGINE_ID is required when using Google search provider in production');
      }
    }

    if (!isSearchConfigured()) {
      errors.push('Search provider is not properly configured');
    }
  }

  // Validate search provider setting
  const validProviders = ['google', 'duckduckgo', 'auto'];
  if (!validProviders.includes(config.search.provider.toLowerCase())) {
    errors.push(`Invalid SEARCH_PROVIDER value. Must be one of: ${validProviders.join(', ')}`);
  }

  // Validate numeric ranges
  const numericLimits = [
    {
      name: 'MAX_CRAWL_DEPTH',
      value: config.crawling.maxDepth,
      max: 10,
      message: 'MAX_CRAWL_DEPTH should not exceed 10 for performance reasons'
    },
    {
      name: 'MAX_PAGES_PER_CRAWL',
      value: config.crawling.maxPages,
      max: 10000,
      message: 'MAX_PAGES_PER_CRAWL should not exceed 10000 for memory reasons'
    },
    {
      name: 'QUEUE_CONCURRENCY',
      value: config.performance.queueConcurrency,
      max: 50,
      message: 'QUEUE_CONCURRENCY should not exceed 50 to avoid overwhelming servers'
    }
  ];

  for (const { name, value, max, message } of numericLimits) {
    if (Number.isNaN(value)) {
      errors.push(`${name} must be a valid number`);
    } else if (value > max) {
      errors.push(message);
    }
  }

  // Validate localization configuration
  errors.push(...validateLocalizationConfig());

  return errors;
}
|
|
332
|
+
|
|
333
|
+
/**
 * Report whether the active search provider has everything it needs.
 *
 * DuckDuckGo needs no credentials; Google needs both an API key and a search
 * engine ID. Any other provider value is treated as not configured.
 *
 * @returns {boolean} True when the active provider is usable.
 */
export function isSearchConfigured() {
  const activeProvider = getActiveSearchProvider();

  if (activeProvider === 'duckduckgo') {
    return true; // DuckDuckGo doesn't require API credentials
  }
  if (activeProvider !== 'google') {
    return false;
  }

  const { apiKey, searchEngineId } = config.search.google;
  return Boolean(apiKey && searchEngineId);
}
|
|
346
|
+
|
|
347
|
+
/**
 * Resolve which search provider should be used.
 *
 * An explicit 'google' or 'duckduckgo' setting (case-insensitive) is returned
 * as-is. 'auto' — and any unrecognised value — prefers Google when both
 * Google credentials are present and otherwise falls back to DuckDuckGo.
 *
 * @returns {'google'|'duckduckgo'} The provider to use.
 */
export function getActiveSearchProvider() {
  const requested = config.search.provider.toLowerCase();

  if (requested === 'google' || requested === 'duckduckgo') {
    return requested;
  }

  // Auto mode: prefer Google if credentials available, otherwise use DuckDuckGo
  const { apiKey, searchEngineId } = config.search.google;
  return apiKey && searchEngineId ? 'google' : 'duckduckgo';
}
|
|
365
|
+
|
|
366
|
+
/**
 * Build the configuration slice for a specific tool.
 *
 * Known tool names are 'search_web', 'crawl_deep', 'map_site',
 * 'process_document' and 'scrape_with_actions'. Any other name yields an
 * empty object. The lookup is an own-property check, so names that happen to
 * match inherited Object.prototype members (e.g. 'constructor') also yield
 * an empty object rather than the inherited value.
 *
 * @param {string} toolName - Tool identifier.
 * @returns {object} Tool configuration, or {} for an unknown tool.
 */
export function getToolConfig(toolName) {
  const provider = getActiveSearchProvider();

  // Cache settings are shared by the search tool and its ranking/dedup options
  const cacheEnabled = config.performance.cacheEnableDisk;
  const cacheTTL = config.performance.cacheTTL;

  const toolConfigs = {
    search_web: {
      provider,

      // Google-specific configuration
      google: {
        apiKey: config.search.google.apiKey,
        searchEngineId: config.search.google.searchEngineId
      },

      // DuckDuckGo-specific configuration
      duckduckgo: {
        timeout: config.search.duckduckgo.timeout,
        maxRetries: config.search.duckduckgo.maxRetries,
        retryDelay: config.search.duckduckgo.retryDelay,
        userAgent: config.search.duckduckgo.userAgent
      },

      // Common configuration
      cacheEnabled,
      cacheTTL,
      rankingOptions: {
        weights: config.searchProcessing.ranking.weights,
        bm25: config.searchProcessing.ranking.bm25,
        cacheEnabled,
        cacheTTL
      },
      deduplicationOptions: {
        thresholds: config.searchProcessing.deduplication.thresholds,
        strategies: config.searchProcessing.deduplication.strategies,
        cacheEnabled,
        cacheTTL
      }
    },
    crawl_deep: {
      maxDepth: config.crawling.maxDepth,
      maxPages: config.crawling.maxPages,
      respectRobots: config.crawling.respectRobots,
      userAgent: config.crawling.userAgent,
      timeout: config.crawling.timeout,
      followExternal: config.crawling.followExternal,
      concurrency: config.performance.queueConcurrency
    },
    map_site: {
      userAgent: config.crawling.userAgent,
      timeout: config.crawling.timeout
    },
    process_document: {
      stealthMode: config.stealth.enabled ? {
        enabled: true,
        level: config.stealth.defaultLevel,
        randomizeFingerprint: config.stealth.fingerprinting.randomizeUserAgent,
        hideWebDriver: config.stealth.fingerprinting.hideWebDriver,
        blockWebRTC: config.stealth.fingerprinting.blockWebRTC
      } : { enabled: false }
    },
    scrape_with_actions: {
      stealthMode: config.stealth.enabled ? {
        enabled: true,
        level: config.stealth.defaultLevel,
        randomizeFingerprint: config.stealth.fingerprinting.randomizeUserAgent,
        simulateHumanBehavior: config.stealth.humanBehavior.enabled,
        customUserAgent: config.stealth.fingerprinting.customUserAgent,
        hideWebDriver: config.stealth.fingerprinting.hideWebDriver,
        blockWebRTC: config.stealth.fingerprinting.blockWebRTC
      } : { enabled: false },
      humanBehavior: config.stealth.humanBehavior.enabled ? {
        enabled: true,
        mouseMovements: config.stealth.humanBehavior.mouseMovements,
        typingVariation: config.stealth.humanBehavior.naturalTyping,
        scrollBehavior: config.stealth.humanBehavior.scrollBehavior,
        idlePeriods: config.stealth.humanBehavior.idlePeriods,
        readingTime: config.stealth.humanBehavior.readingSimulation
      } : { enabled: false }
    }
  };

  // Own-property check: plain `toolConfigs[toolName]` would also resolve
  // inherited keys such as 'constructor' or 'toString'.
  return Object.prototype.hasOwnProperty.call(toolConfigs, toolName)
    ? toolConfigs[toolName]
    : {};
}
|
|
449
|
+
|
|
450
|
+
/**
 * Build the stealth settings for a given stealth level.
 *
 * The result always has `enabled: true` and echoes `level`. 'basic' turns
 * fingerprint randomisation off and forces WebRTC blocking and WebDriver
 * hiding on; 'advanced' forces all three on and adds timezone, canvas, WebGL
 * and network-emulation settings from the configuration. 'medium' and any
 * unrecognised level return the configured fingerprinting values unchanged.
 *
 * @param {string} [level='medium'] - 'basic', 'medium' or 'advanced'.
 * @returns {object} Stealth settings for the requested level.
 */
export function getStealthConfig(level = 'medium') {
  const { fingerprinting, antiDetection } = config.stealth;

  const defaults = {
    enabled: true,
    level,
    randomizeFingerprint: fingerprinting.randomizeUserAgent,
    hideWebDriver: fingerprinting.hideWebDriver,
    blockWebRTC: fingerprinting.blockWebRTC,
    customUserAgent: fingerprinting.customUserAgent
  };

  // Adjust settings based on level
  if (level === 'basic') {
    return {
      ...defaults,
      randomizeFingerprint: false,
      blockWebRTC: true,
      hideWebDriver: true
    };
  }

  if (level === 'advanced') {
    return {
      ...defaults,
      randomizeFingerprint: true,
      blockWebRTC: true,
      hideWebDriver: true,
      spoofTimezone: fingerprinting.spoofTimezone,
      preventCanvasFingerprinting: antiDetection.preventCanvasFingerprinting,
      preventWebGLFingerprinting: antiDetection.preventWebGLFingerprinting,
      networkEmulation: antiDetection.networkEmulation
    };
  }

  return defaults;
}
|
|
486
|
+
|
|
487
|
+
/**
 * Build the human-behaviour simulation settings for a given stealth level.
 *
 * 'basic' disables mouse movements, typing variation and idle periods;
 * 'advanced' enables every behaviour flag. 'medium' and any unrecognised
 * level return the configured values unchanged. `enabled` always comes from
 * the configuration.
 *
 * @param {string} [level='medium'] - 'basic', 'medium' or 'advanced'.
 * @returns {object} Human-behaviour settings for the requested level.
 */
export function getHumanBehaviorConfig(level = 'medium') {
  const behavior = config.stealth.humanBehavior;

  const defaults = {
    enabled: behavior.enabled,
    mouseMovements: behavior.mouseMovements,
    typingVariation: behavior.naturalTyping,
    scrollBehavior: behavior.scrollBehavior,
    idlePeriods: behavior.idlePeriods,
    readingTime: behavior.readingSimulation
  };

  // Adjust behavior complexity based on level
  if (level === 'basic') {
    return {
      ...defaults,
      mouseMovements: false,
      typingVariation: false,
      idlePeriods: false
    };
  }

  if (level === 'advanced') {
    return {
      ...defaults,
      mouseMovements: true,
      typingVariation: true,
      scrollBehavior: true,
      idlePeriods: true,
      readingTime: true
    };
  }

  return defaults;
}
|
|
521
|
+
|
|
522
|
+
/**
 * Report whether stealth mode is switched on and at least one of user-agent
 * randomisation, WebDriver hiding or human-behaviour simulation is active.
 *
 * @returns {boolean} True when stealth mode is enabled and has an active feature.
 */
export function isStealthConfigured() {
  const { enabled, fingerprinting, humanBehavior } = config.stealth;

  if (!enabled) {
    return enabled;
  }

  return (
    fingerprinting.randomizeUserAgent ||
    fingerprinting.hideWebDriver ||
    humanBehavior.enabled
  );
}
|
|
530
|
+
|
|
531
|
+
/**
 * Return the `localization` section of the configuration (the same object,
 * not a copy).
 *
 * @returns {object} Localization settings.
 */
export function getLocalizationConfig() {
  const { localization } = config;
  return localization;
}
|
|
535
|
+
|
|
536
|
+
/**
 * Report whether localization is switched on and at least one of proxying,
 * translation or automatic geo-blocking bypass is enabled.
 *
 * @returns {boolean} True when localization is enabled and has an active feature.
 */
export function isLocalizationConfigured() {
  const { enabled, proxy, translation, geoBlocking } = config.localization;

  if (!enabled) {
    return enabled;
  }

  return proxy.enabled || translation.enabled || geoBlocking.autoBypass;
}
|
|
544
|
+
|
|
545
|
+
/**
 * Return the `localization.proxy` section of the configuration (the same
 * object, not a copy).
 *
 * @returns {object} Proxy settings.
 */
export function getProxyConfig() {
  const { proxy } = config.localization;
  return proxy;
}
|
|
549
|
+
|
|
550
|
+
/**
 * Return the `localization.translation` section of the configuration (the
 * same object, not a copy).
 *
 * @returns {object} Translation settings.
 */
export function getTranslationConfig() {
  const { translation } = config.localization;
  return translation;
}
|
|
554
|
+
|
|
555
|
+
/**
 * Return the `localization.geoBlocking` section of the configuration (the
 * same object, not a copy).
 *
 * @returns {object} Geo-blocking bypass settings.
 */
export function getGeoBlockingConfig() {
  const { geoBlocking } = config.localization;
  return geoBlocking;
}
|
|
559
|
+
|
|
560
|
+
/**
 * Return the `localization.cultural` section of the configuration (the same
 * object, not a copy).
 *
 * @returns {object} Cultural simulation settings.
 */
export function getCulturalConfig() {
  const { cultural } = config.localization;
  return cultural;
}
|
|
564
|
+
|
|
565
|
+
/**
 * Validate the `localization` section of the configuration.
 *
 * Nothing is checked unless localization is enabled. Proxy, translation and
 * geo-blocking settings are only checked when their own feature flag is on.
 * Numeric settings that resolved to NaN are reported explicitly, because a
 * `<` or `>` comparison against NaN is always false and would let a
 * non-numeric environment value pass.
 *
 * @returns {string[]} Human-readable error messages; empty when valid.
 */
export function validateLocalizationConfig() {
  const errors = [];
  const localizationConfig = config.localization;

  if (!localizationConfig.enabled) {
    return errors;
  }

  // Record a NaN error, or the range error when the value is out of range
  const checkNumber = (name, value, isOutOfRange, rangeMessage) => {
    if (Number.isNaN(value)) {
      errors.push(`${name} must be a valid number`);
    } else if (isOutOfRange(value)) {
      errors.push(rangeMessage);
    }
  };

  // Validate country code
  if (!localizationConfig.defaultCountry || localizationConfig.defaultCountry.length !== 2) {
    errors.push('DEFAULT_COUNTRY_CODE must be a valid 2-letter country code');
  }

  // Validate language code
  if (!localizationConfig.defaultLanguage || !localizationConfig.defaultLanguage.includes('-')) {
    errors.push('DEFAULT_LANGUAGE must be in format language-country (e.g., en-US)');
  }

  // Validate proxy configuration
  if (localizationConfig.proxy.enabled) {
    checkNumber(
      'PROXY_ROTATION_INTERVAL',
      localizationConfig.proxy.rotation.interval,
      (value) => value < 60000,
      'PROXY_ROTATION_INTERVAL should be at least 60000ms (1 minute)'
    );
    checkNumber(
      'PROXY_HEALTH_CHECK_INTERVAL',
      localizationConfig.proxy.healthCheck.interval,
      (value) => value < 60000,
      'PROXY_HEALTH_CHECK_INTERVAL should be at least 60000ms (1 minute)'
    );
  }

  // Validate translation configuration
  if (localizationConfig.translation.enabled) {
    const validProviders = ['google', 'azure', 'libre'];
    if (!validProviders.includes(localizationConfig.translation.defaultProvider)) {
      errors.push(`TRANSLATION_PROVIDER must be one of: ${validProviders.join(', ')}`);
    }
  }

  // Validate geo-blocking configuration
  if (localizationConfig.geoBlocking.autoBypass) {
    checkNumber(
      'GEO_BLOCKING_MAX_RETRIES',
      localizationConfig.geoBlocking.maxRetries,
      (value) => value > 10,
      'GEO_BLOCKING_MAX_RETRIES should not exceed 10'
    );
    checkNumber(
      'GEO_BLOCKING_RETRY_DELAY',
      localizationConfig.geoBlocking.retryDelay,
      (value) => value < 1000,
      'GEO_BLOCKING_RETRY_DELAY should be at least 1000ms'
    );
  }

  return errors;
}
|
|
614
|
+
|
|
615
|
+
export default config;
|