crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75):
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,615 @@
1
+ import dotenv from 'dotenv';
2
+ import { fileURLToPath } from 'url';
3
+ import { dirname, join } from 'path';
4
+
5
// ES modules have no built-in __filename/__dirname, so derive them from
// import.meta.url, then load environment variables from the .env file that
// sits two directories above this module.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const envFilePath = join(__dirname, '../../.env');
dotenv.config({ path: envFilePath, quiet: true });
9
+
10
/**
 * Read an integer setting from the environment.
 *
 * The value is parsed as decimal. When the variable is unset, empty, or does
 * not start with a number, `fallback` is returned instead of `NaN`, so the
 * range checks performed later never have to compare against `NaN`.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is missing or invalid.
 * @returns {number} The parsed integer or `fallback`.
 */
function envInt(name, fallback) {
  const raw = process.env[name];
  if (!raw) {
    return fallback;
  }
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Read a floating-point setting from the environment.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is missing or invalid.
 * @returns {number} The parsed number or `fallback`.
 */
function envFloat(name, fallback) {
  const raw = process.env[name];
  if (!raw) {
    return fallback;
  }
  const parsed = Number.parseFloat(raw);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Read a comma-separated list from the environment.
 *
 * Entries are trimmed and empty entries are dropped, so "a, b," yields
 * ['a', 'b'] rather than ['a', ' b', ''].
 *
 * @param {string} name - Environment variable name.
 * @param {string} fallback - Comma-separated default used when the variable is unset or empty.
 * @returns {string[]} The list entries.
 */
function envList(name, fallback) {
  return (process.env[name] || fallback)
    .split(',')
    .map((entry) => entry.trim())
    .filter(Boolean);
}

/**
 * Read a JSON-encoded setting from the environment.
 *
 * Malformed JSON no longer aborts module loading with a bare SyntaxError:
 * a warning naming the variable is written to stderr and `fallback` is used.
 *
 * @param {string} name - Environment variable name.
 * @param {*} fallback - Value used when the variable is unset, empty, or not valid JSON.
 * @returns {*} The parsed value or `fallback`.
 */
function envJson(name, fallback) {
  const raw = process.env[name];
  if (!raw) {
    return fallback;
  }
  try {
    return JSON.parse(raw);
  } catch (err) {
    console.warn(`Ignoring ${name}: value is not valid JSON (${err.message})`);
    return fallback;
  }
}

/**
 * Application configuration assembled once, at module load, from environment
 * variables. Every setting has a built-in default that applies when its
 * variable is unset. Boolean settings compared with `!== 'false'` default to
 * on; those compared with `=== 'true'` default to off.
 */
export const config = {
  // Search Provider Configuration
  search: {
    provider: process.env.SEARCH_PROVIDER || 'auto', // 'google', 'duckduckgo', or 'auto'

    // Google Search API
    google: {
      apiKey: process.env.GOOGLE_API_KEY || '',
      searchEngineId: process.env.GOOGLE_SEARCH_ENGINE_ID || ''
    },

    // DuckDuckGo Configuration
    duckduckgo: {
      timeout: envInt('DUCKDUCKGO_TIMEOUT', 30000),
      maxRetries: envInt('DUCKDUCKGO_MAX_RETRIES', 3),
      retryDelay: envInt('DUCKDUCKGO_RETRY_DELAY', 1000),
      userAgent: process.env.DUCKDUCKGO_USER_AGENT || process.env.USER_AGENT || 'CrawlForge/1.0'
    }
  },

  // Performance
  performance: {
    maxWorkers: envInt('MAX_WORKERS', 10),
    queueConcurrency: envInt('QUEUE_CONCURRENCY', 10),
    cacheMaxSize: envInt('CACHE_MAX_SIZE', 1000),
    cacheTTL: envInt('CACHE_TTL', 3600000),
    cacheEnableDisk: process.env.CACHE_ENABLE_DISK !== 'false',
    cacheDir: process.env.CACHE_DIR || './cache'
  },

  // Rate Limiting
  rateLimit: {
    requestsPerSecond: envInt('RATE_LIMIT_REQUESTS_PER_SECOND', 10),
    requestsPerMinute: envInt('RATE_LIMIT_REQUESTS_PER_MINUTE', 100),
    perDomain: process.env.RATE_LIMIT_PER_DOMAIN !== 'false'
  },

  // Crawling
  crawling: {
    maxDepth: envInt('MAX_CRAWL_DEPTH', 5),
    maxPages: envInt('MAX_PAGES_PER_CRAWL', 100),
    respectRobots: process.env.RESPECT_ROBOTS_TXT !== 'false',
    userAgent: process.env.USER_AGENT || 'CrawlForge/1.0',
    timeout: envInt('CRAWL_TIMEOUT', 30000),
    followExternal: process.env.FOLLOW_EXTERNAL_LINKS === 'true'
  },

  // Search ranking and deduplication
  searchProcessing: {
    enableRanking: process.env.ENABLE_SEARCH_RANKING !== 'false',
    enableDeduplication: process.env.ENABLE_SEARCH_DEDUPLICATION !== 'false',

    // Ranking configuration
    ranking: {
      weights: {
        bm25: envFloat('RANKING_WEIGHT_BM25', 0.4),
        semantic: envFloat('RANKING_WEIGHT_SEMANTIC', 0.3),
        authority: envFloat('RANKING_WEIGHT_AUTHORITY', 0.2),
        freshness: envFloat('RANKING_WEIGHT_FRESHNESS', 0.1)
      },
      bm25: {
        k1: envFloat('BM25_K1', 1.5),
        b: envFloat('BM25_B', 0.75)
      }
    },

    // Deduplication configuration
    deduplication: {
      thresholds: {
        url: envFloat('DEDUP_THRESHOLD_URL', 0.8),
        title: envFloat('DEDUP_THRESHOLD_TITLE', 0.75),
        content: envFloat('DEDUP_THRESHOLD_CONTENT', 0.7),
        combined: envFloat('DEDUP_THRESHOLD_COMBINED', 0.6)
      },
      strategies: {
        urlNormalization: process.env.DEDUP_URL_NORMALIZATION !== 'false',
        titleFuzzy: process.env.DEDUP_TITLE_FUZZY !== 'false',
        contentSimhash: process.env.DEDUP_CONTENT_SIMHASH !== 'false',
        domainClustering: process.env.DEDUP_DOMAIN_CLUSTERING !== 'false'
      }
    }
  },

  // Security Configuration
  security: {
    // SSRF Protection
    ssrfProtection: {
      enabled: process.env.SSRF_PROTECTION_ENABLED !== 'false',
      allowedProtocols: envList('ALLOWED_PROTOCOLS', 'http:,https:'),
      maxRequestSize: envInt('MAX_REQUEST_SIZE', 104857600), // 100MB
      maxTimeout: envInt('MAX_REQUEST_TIMEOUT', 60000), // 60s
      maxRedirects: envInt('MAX_REDIRECTS', 5),
      allowedDomains: envList('ALLOWED_DOMAINS', ''),
      blockedDomains: envList(
        'BLOCKED_DOMAINS',
        'localhost,127.0.0.1,0.0.0.0,metadata.google.internal,169.254.169.254,metadata.azure.com'
      )
    },

    // Input Validation
    inputValidation: {
      enabled: process.env.INPUT_VALIDATION_ENABLED !== 'false',
      maxStringLength: envInt('MAX_STRING_LENGTH', 10000),
      maxArrayLength: envInt('MAX_ARRAY_LENGTH', 1000),
      maxObjectDepth: envInt('MAX_OBJECT_DEPTH', 10),
      maxRegexLength: envInt('MAX_REGEX_LENGTH', 500),
      strictMode: process.env.STRICT_VALIDATION_MODE === 'true'
    },

    // API Security
    apiSecurity: {
      requireAuthentication: process.env.REQUIRE_AUTHENTICATION === 'true',
      apiKeyHeader: process.env.API_KEY_HEADER || 'X-API-Key',
      apiKey: process.env.API_KEY || '',
      rateLimitByKey: process.env.RATE_LIMIT_BY_KEY === 'true',
      auditLogging: process.env.AUDIT_LOGGING !== 'false'
    },

    // Content Security
    contentSecurity: {
      sanitizeHTML: process.env.SANITIZE_HTML !== 'false',
      allowedHTMLTags: envList('ALLOWED_HTML_TAGS', 'p,br,strong,em,u,h1,h2,h3,h4,h5,h6'),
      blockScripts: process.env.BLOCK_SCRIPTS !== 'false',
      blockIframes: process.env.BLOCK_IFRAMES !== 'false'
    }
  },

  // Monitoring
  monitoring: {
    enableMetrics: process.env.ENABLE_METRICS === 'true',
    logLevel: process.env.LOG_LEVEL || 'info',
    securityLogging: process.env.SECURITY_LOGGING !== 'false',
    violationLogging: process.env.VIOLATION_LOGGING !== 'false'
  },

  // Server
  server: {
    nodeEnv: process.env.NODE_ENV || 'development',
    port: envInt('PORT', 3000),
    enableSecurityHeaders: process.env.ENABLE_SECURITY_HEADERS !== 'false'
  },

  // Stealth Mode Configuration
  stealth: {
    // Global stealth settings
    enabled: process.env.STEALTH_MODE_ENABLED === 'true',
    defaultLevel: process.env.STEALTH_LEVEL || 'medium', // 'basic', 'medium', 'advanced'

    // Browser fingerprinting
    fingerprinting: {
      randomizeUserAgent: process.env.STEALTH_RANDOMIZE_USER_AGENT !== 'false',
      randomizeViewport: process.env.STEALTH_RANDOMIZE_VIEWPORT !== 'false',
      spoofTimezone: process.env.STEALTH_SPOOF_TIMEZONE !== 'false',
      hideWebDriver: process.env.STEALTH_HIDE_WEBDRIVER !== 'false',
      blockWebRTC: process.env.STEALTH_BLOCK_WEBRTC !== 'false',
      customUserAgent: process.env.STEALTH_CUSTOM_USER_AGENT || null
    },

    // Human behavior simulation
    humanBehavior: {
      enabled: process.env.STEALTH_HUMAN_BEHAVIOR_ENABLED !== 'false',
      mouseMovements: process.env.STEALTH_MOUSE_MOVEMENTS !== 'false',
      naturalTyping: process.env.STEALTH_NATURAL_TYPING !== 'false',
      scrollBehavior: process.env.STEALTH_SCROLL_BEHAVIOR !== 'false',
      idlePeriods: process.env.STEALTH_IDLE_PERIODS !== 'false',
      readingSimulation: process.env.STEALTH_READING_SIMULATION !== 'false',

      // Timing configurations
      mouseSpeed: process.env.STEALTH_MOUSE_SPEED || 'normal', // 'slow', 'normal', 'fast'
      typingSpeed: process.env.STEALTH_TYPING_SPEED || 'normal', // 'slow', 'normal', 'fast'
      typingVariability: envFloat('STEALTH_TYPING_VARIABILITY', 0.3), // 0.0 to 1.0
      mistakeFrequency: envFloat('STEALTH_MISTAKE_FREQUENCY', 0.02), // 2% mistake rate

      // Idle period settings
      idleFrequency: envFloat('STEALTH_IDLE_FREQUENCY', 0.1), // 10% chance
      idleMinDuration: envInt('STEALTH_IDLE_MIN_DURATION', 1000), // 1 second
      idleMaxDuration: envInt('STEALTH_IDLE_MAX_DURATION', 5000), // 5 seconds

      // Click behavior
      hoverBeforeClick: process.env.STEALTH_HOVER_BEFORE_CLICK !== 'false',
      clickDelayMin: envInt('STEALTH_CLICK_DELAY_MIN', 100),
      clickDelayMax: envInt('STEALTH_CLICK_DELAY_MAX', 300)
    },

    // Advanced anti-detection
    antiDetection: {
      bypassHeadlessDetection: process.env.STEALTH_BYPASS_HEADLESS !== 'false',
      spoofPlugins: process.env.STEALTH_SPOOF_PLUGINS !== 'false',
      spoofPermissions: process.env.STEALTH_SPOOF_PERMISSIONS !== 'false',
      mockBattery: process.env.STEALTH_MOCK_BATTERY !== 'false',
      preventCanvasFingerprinting: process.env.STEALTH_PREVENT_CANVAS !== 'false',
      preventWebGLFingerprinting: process.env.STEALTH_PREVENT_WEBGL !== 'false',
      networkEmulation: process.env.STEALTH_NETWORK_EMULATION === 'true'
    },

    // Resource optimization for stealth
    resources: {
      blockImages: process.env.STEALTH_BLOCK_IMAGES === 'true',
      blockFonts: process.env.STEALTH_BLOCK_FONTS === 'true',
      blockStylesheets: process.env.STEALTH_BLOCK_CSS === 'true',
      allowTrackingPixels: process.env.STEALTH_ALLOW_TRACKING === 'true', // Allow some tracking to appear normal
      maxConcurrentContexts: envInt('STEALTH_MAX_CONTEXTS', 5)
    },

    // Geolocation spoofing
    geolocation: {
      enabled: process.env.STEALTH_SPOOF_GEOLOCATION === 'true',
      latitude: envFloat('STEALTH_LATITUDE', 40.7128), // NYC default
      longitude: envFloat('STEALTH_LONGITUDE', -74.006),
      accuracy: envInt('STEALTH_LOCATION_ACCURACY', 100)
    }
  },

  // Localization Configuration
  localization: {
    // Global localization settings
    enabled: process.env.LOCALIZATION_ENABLED === 'true',
    defaultCountry: process.env.DEFAULT_COUNTRY_CODE || 'US',
    defaultLanguage: process.env.DEFAULT_LANGUAGE || 'en-US',

    // Proxy configuration for geo-specific access
    proxy: {
      enabled: process.env.LOCALIZATION_PROXY_ENABLED === 'true',
      rotation: {
        enabled: process.env.PROXY_ROTATION_ENABLED === 'true',
        interval: envInt('PROXY_ROTATION_INTERVAL', 300000), // 5 minutes
        strategy: process.env.PROXY_ROTATION_STRATEGY || 'round-robin'
      },
      healthCheck: {
        enabled: process.env.PROXY_HEALTH_CHECK_ENABLED !== 'false',
        interval: envInt('PROXY_HEALTH_CHECK_INTERVAL', 300000), // 5 minutes
        timeout: envInt('PROXY_HEALTH_CHECK_TIMEOUT', 10000)
      },
      fallback: {
        enabled: process.env.PROXY_FALLBACK_ENABLED !== 'false',
        maxRetries: envInt('PROXY_MAX_RETRIES', 3),
        timeout: envInt('PROXY_TIMEOUT', 10000)
      }
    },

    // Translation services
    translation: {
      enabled: process.env.TRANSLATION_ENABLED === 'true',
      defaultProvider: process.env.TRANSLATION_PROVIDER || 'google',
      autoDetect: process.env.TRANSLATION_AUTO_DETECT !== 'false',
      preserveFormatting: process.env.TRANSLATION_PRESERVE_FORMAT !== 'false',
      cacheEnabled: process.env.TRANSLATION_CACHE_ENABLED !== 'false',
      cacheTTL: envInt('TRANSLATION_CACHE_TTL', 86400000) // 24 hours
    },

    // Geo-blocking bypass
    geoBlocking: {
      autoBypass: process.env.GEO_BLOCKING_AUTO_BYPASS === 'true',
      maxRetries: envInt('GEO_BLOCKING_MAX_RETRIES', 3),
      retryDelay: envInt('GEO_BLOCKING_RETRY_DELAY', 2000),
      fallbackCountries: envList('GEO_BLOCKING_FALLBACK_COUNTRIES', 'US,GB,DE,CA'),
      detectionSensitivity: process.env.GEO_BLOCKING_DETECTION_SENSITIVITY || 'medium'
    },

    // Cultural browsing simulation
    cultural: {
      enabled: process.env.CULTURAL_SIMULATION_ENABLED === 'true',
      adaptBehavior: process.env.CULTURAL_ADAPT_BEHAVIOR !== 'false',
      adaptTiming: process.env.CULTURAL_ADAPT_TIMING !== 'false',
      respectRTL: process.env.CULTURAL_RESPECT_RTL !== 'false'
    },

    // DNS configuration
    dns: {
      enabled: process.env.LOCALIZATION_DNS_ENABLED === 'true',
      overHttps: process.env.DNS_OVER_HTTPS === 'true',
      customResolvers: envJson('CUSTOM_DNS_RESOLVERS', {}),
      preferredCountry: process.env.DNS_PREFERRED_COUNTRY || null
    }
  }
};
284
+
285
/**
 * Validate the loaded configuration and collect every problem found.
 *
 * Nothing is thrown; callers receive a list of human-readable messages and
 * decide how to react. Credential checks for the search provider only run
 * when `config.server.nodeEnv` is 'production'.
 *
 * @returns {string[]} Error messages; empty when no problem was detected.
 */
export function validateConfig() {
  const errors = [];

  // Check search provider configuration
  const provider = getActiveSearchProvider();

  if (config.server.nodeEnv === 'production') {
    if (provider === 'google') {
      if (!config.search.google.apiKey) {
        errors.push('GOOGLE_API_KEY is required when using Google search provider in production');
      }
      if (!config.search.google.searchEngineId) {
        errors.push('GOOGLE_SEARCH_ENGINE_ID is required when using Google search provider in production');
      }
    }

    if (!isSearchConfigured()) {
      errors.push('Search provider is not properly configured');
    }
  }

  // Validate search provider setting
  const validProviders = ['google', 'duckduckgo', 'auto'];
  if (!validProviders.includes(config.search.provider.toLowerCase())) {
    errors.push(`Invalid SEARCH_PROVIDER value. Must be one of: ${validProviders.join(', ')}`);
  }

  // Validate numeric ranges. A plain `value > max` test is false for NaN and
  // for negative numbers, so those are rejected explicitly first.
  const numericLimits = [
    {
      name: 'MAX_CRAWL_DEPTH',
      value: config.crawling.maxDepth,
      max: 10,
      tooHigh: 'MAX_CRAWL_DEPTH should not exceed 10 for performance reasons'
    },
    {
      name: 'MAX_PAGES_PER_CRAWL',
      value: config.crawling.maxPages,
      max: 10000,
      tooHigh: 'MAX_PAGES_PER_CRAWL should not exceed 10000 for memory reasons'
    },
    {
      name: 'QUEUE_CONCURRENCY',
      value: config.performance.queueConcurrency,
      max: 50,
      tooHigh: 'QUEUE_CONCURRENCY should not exceed 50 to avoid overwhelming servers'
    }
  ];

  for (const { name, value, max, tooHigh } of numericLimits) {
    if (!Number.isFinite(value) || value < 0) {
      errors.push(`${name} must be a non-negative number`);
    } else if (value > max) {
      errors.push(tooHigh);
    }
  }

  // Validate localization configuration
  errors.push(...validateLocalizationConfig());

  return errors;
}
332
+
333
/**
 * Report whether the active search provider has what it needs to run.
 *
 * @returns {boolean} `true` for DuckDuckGo, `true` for Google only when both
 *   the API key and the search engine id are set, `false` otherwise.
 */
export function isSearchConfigured() {
  const activeProvider = getActiveSearchProvider();

  if (activeProvider === 'duckduckgo') {
    // DuckDuckGo doesn't require API credentials
    return true;
  }

  if (activeProvider === 'google') {
    const googleSettings = config.search.google;
    return Boolean(googleSettings.apiKey && googleSettings.searchEngineId);
  }

  return false;
}
346
+
347
/**
 * Resolve which search provider is in effect.
 *
 * An explicit 'google' or 'duckduckgo' setting (compared case-insensitively)
 * is returned as-is. 'auto', and any other value, selects Google when both
 * Google credentials are present and DuckDuckGo otherwise.
 *
 * @returns {'google'|'duckduckgo'} The provider to use.
 */
export function getActiveSearchProvider() {
  const requested = config.search.provider.toLowerCase();

  if (requested === 'google' || requested === 'duckduckgo') {
    return requested;
  }

  // Auto mode: prefer Google if credentials available, otherwise use DuckDuckGo
  const hasGoogleCredentials =
    config.search.google.apiKey && config.search.google.searchEngineId;
  return hasGoogleCredentials ? 'google' : 'duckduckgo';
}
365
+
366
/**
 * Build the settings object for one tool from the global configuration.
 *
 * Known tool names are 'search_web', 'crawl_deep', 'map_site',
 * 'process_document' and 'scrape_with_actions'.
 *
 * @param {string} toolName - Name of the tool whose settings are wanted.
 * @returns {object} The tool's settings, or an empty object for an unknown name.
 */
export function getToolConfig(toolName) {
  const provider = getActiveSearchProvider();

  const toolConfigs = {
    search_web: {
      provider: provider,

      // Google-specific configuration
      google: {
        apiKey: config.search.google.apiKey,
        searchEngineId: config.search.google.searchEngineId
      },

      // DuckDuckGo-specific configuration
      duckduckgo: {
        timeout: config.search.duckduckgo.timeout,
        maxRetries: config.search.duckduckgo.maxRetries,
        retryDelay: config.search.duckduckgo.retryDelay,
        userAgent: config.search.duckduckgo.userAgent
      },

      // Common configuration
      cacheEnabled: config.performance.cacheEnableDisk,
      cacheTTL: config.performance.cacheTTL,
      rankingOptions: {
        weights: config.searchProcessing.ranking.weights,
        bm25: config.searchProcessing.ranking.bm25,
        cacheEnabled: config.performance.cacheEnableDisk,
        cacheTTL: config.performance.cacheTTL
      },
      deduplicationOptions: {
        thresholds: config.searchProcessing.deduplication.thresholds,
        strategies: config.searchProcessing.deduplication.strategies,
        cacheEnabled: config.performance.cacheEnableDisk,
        cacheTTL: config.performance.cacheTTL
      }
    },
    crawl_deep: {
      maxDepth: config.crawling.maxDepth,
      maxPages: config.crawling.maxPages,
      respectRobots: config.crawling.respectRobots,
      userAgent: config.crawling.userAgent,
      timeout: config.crawling.timeout,
      followExternal: config.crawling.followExternal,
      concurrency: config.performance.queueConcurrency
    },
    map_site: {
      userAgent: config.crawling.userAgent,
      timeout: config.crawling.timeout
    },
    process_document: {
      stealthMode: config.stealth.enabled ? {
        enabled: true,
        level: config.stealth.defaultLevel,
        randomizeFingerprint: config.stealth.fingerprinting.randomizeUserAgent,
        hideWebDriver: config.stealth.fingerprinting.hideWebDriver,
        blockWebRTC: config.stealth.fingerprinting.blockWebRTC
      } : { enabled: false }
    },
    scrape_with_actions: {
      stealthMode: config.stealth.enabled ? {
        enabled: true,
        level: config.stealth.defaultLevel,
        randomizeFingerprint: config.stealth.fingerprinting.randomizeUserAgent,
        simulateHumanBehavior: config.stealth.humanBehavior.enabled,
        customUserAgent: config.stealth.fingerprinting.customUserAgent,
        hideWebDriver: config.stealth.fingerprinting.hideWebDriver,
        blockWebRTC: config.stealth.fingerprinting.blockWebRTC
      } : { enabled: false },
      humanBehavior: config.stealth.humanBehavior.enabled ? {
        enabled: true,
        mouseMovements: config.stealth.humanBehavior.mouseMovements,
        typingVariation: config.stealth.humanBehavior.naturalTyping,
        scrollBehavior: config.stealth.humanBehavior.scrollBehavior,
        idlePeriods: config.stealth.humanBehavior.idlePeriods,
        readingTime: config.stealth.humanBehavior.readingSimulation
      } : { enabled: false }
    }
  };

  // Look up own properties only: a plain `toolConfigs[toolName]` would hand
  // back inherited Object.prototype members for names such as 'constructor'
  // or '__proto__' instead of the documented empty object.
  return Object.hasOwn(toolConfigs, toolName) ? toolConfigs[toolName] : {};
}
449
+
450
/**
 * Build the stealth settings for a given stealth level.
 *
 * 'basic' turns fingerprint randomization off and forces WebRTC blocking and
 * WebDriver hiding on. 'advanced' forces all three on and adds the timezone,
 * canvas, WebGL and network-emulation settings. Any other level, including
 * the default 'medium', uses the configured fingerprinting values unchanged.
 *
 * @param {string} [level='medium'] - 'basic', 'medium' or 'advanced'.
 * @returns {object} Stealth settings with `enabled` set to `true`.
 */
export function getStealthConfig(level = 'medium') {
  const { fingerprinting, antiDetection } = config.stealth;

  const shared = {
    enabled: true,
    level,
    randomizeFingerprint: fingerprinting.randomizeUserAgent,
    hideWebDriver: fingerprinting.hideWebDriver,
    blockWebRTC: fingerprinting.blockWebRTC,
    customUserAgent: fingerprinting.customUserAgent
  };

  if (level === 'basic') {
    return {
      ...shared,
      randomizeFingerprint: false,
      blockWebRTC: true,
      hideWebDriver: true
    };
  }

  if (level === 'advanced') {
    return {
      ...shared,
      randomizeFingerprint: true,
      blockWebRTC: true,
      hideWebDriver: true,
      spoofTimezone: fingerprinting.spoofTimezone,
      preventCanvasFingerprinting: antiDetection.preventCanvasFingerprinting,
      preventWebGLFingerprinting: antiDetection.preventWebGLFingerprinting,
      networkEmulation: antiDetection.networkEmulation
    };
  }

  // 'medium' and unrecognized levels
  return shared;
}
486
+
487
/**
 * Build the human-behavior simulation settings for a given stealth level.
 *
 * 'basic' switches mouse movements, typing variation and idle periods off;
 * 'advanced' switches every behavior on. Any other level, including the
 * default 'medium', uses the configured values unchanged. `enabled` always
 * comes from the configuration.
 *
 * @param {string} [level='medium'] - 'basic', 'medium' or 'advanced'.
 * @returns {object} Human-behavior settings.
 */
export function getHumanBehaviorConfig(level = 'medium') {
  const behavior = config.stealth.humanBehavior;

  const configured = {
    enabled: behavior.enabled,
    mouseMovements: behavior.mouseMovements,
    typingVariation: behavior.naturalTyping,
    scrollBehavior: behavior.scrollBehavior,
    idlePeriods: behavior.idlePeriods,
    readingTime: behavior.readingSimulation
  };

  if (level === 'basic') {
    return {
      ...configured,
      mouseMovements: false,
      typingVariation: false,
      idlePeriods: false
    };
  }

  if (level === 'advanced') {
    return {
      ...configured,
      mouseMovements: true,
      typingVariation: true,
      scrollBehavior: true,
      idlePeriods: true,
      readingTime: true
    };
  }

  // 'medium' and unrecognized levels
  return configured;
}
521
+
522
/**
 * Report whether stealth mode is switched on and at least one of user-agent
 * randomization, WebDriver hiding, or human-behavior simulation is active.
 *
 * @returns {boolean} Whether stealth mode is usable as configured.
 */
export function isStealthConfigured() {
  const { enabled, fingerprinting, humanBehavior } = config.stealth;

  if (!enabled) {
    return enabled;
  }

  return (
    fingerprinting.randomizeUserAgent ||
    fingerprinting.hideWebDriver ||
    humanBehavior.enabled
  );
}
530
+
531
/**
 * Return the localization section of the configuration.
 *
 * @returns {object} The `config.localization` object itself, not a copy.
 */
export function getLocalizationConfig() {
  const { localization } = config;
  return localization;
}
535
+
536
/**
 * Report whether localization is switched on and at least one of proxying,
 * translation, or automatic geo-blocking bypass is enabled.
 *
 * @returns {boolean} Whether localization is usable as configured.
 */
export function isLocalizationConfigured() {
  const { enabled, proxy, translation, geoBlocking } = config.localization;

  if (!enabled) {
    return enabled;
  }

  return proxy.enabled || translation.enabled || geoBlocking.autoBypass;
}
544
+
545
/**
 * Return the proxy settings used for localization.
 *
 * @returns {object} The `config.localization.proxy` object itself, not a copy.
 */
export function getProxyConfig() {
  const { proxy } = config.localization;
  return proxy;
}
549
+
550
/**
 * Return the translation settings.
 *
 * @returns {object} The `config.localization.translation` object itself, not a copy.
 */
export function getTranslationConfig() {
  const { translation } = config.localization;
  return translation;
}
554
+
555
/**
 * Return the geo-blocking bypass settings.
 *
 * @returns {object} The `config.localization.geoBlocking` object itself, not a copy.
 */
export function getGeoBlockingConfig() {
  const { geoBlocking } = config.localization;
  return geoBlocking;
}
559
+
560
/**
 * Return the cultural browsing simulation settings.
 *
 * @returns {object} The `config.localization.cultural` object itself, not a copy.
 */
export function getCulturalConfig() {
  const { cultural } = config.localization;
  return cultural;
}
564
+
565
/**
 * Validate the localization section of the configuration.
 *
 * Nothing is checked while localization is disabled. The proxy, translation
 * and geo-blocking checks each run only when that feature is enabled.
 *
 * @returns {string[]} Error messages; empty when no problem was detected.
 */
export function validateLocalizationConfig() {
  const errors = [];
  const localizationConfig = config.localization;

  if (!localizationConfig.enabled) {
    return errors;
  }

  // Validate country code: exactly two letters, so values such as "12" or
  // two spaces no longer pass a bare length check.
  if (!/^[A-Za-z]{2}$/.test(String(localizationConfig.defaultCountry ?? ''))) {
    errors.push('DEFAULT_COUNTRY_CODE must be a valid 2-letter country code');
  }

  // Validate language code: a 2-3 letter language subtag followed by at least
  // one more non-empty subtag (en-US, es-419, zh-Hans-CN). A lone "-" or a
  // dangling "en-" is rejected.
  if (!/^[A-Za-z]{2,3}(-[A-Za-z0-9]+)+$/.test(String(localizationConfig.defaultLanguage ?? ''))) {
    errors.push('DEFAULT_LANGUAGE must be in format language-country (e.g., en-US)');
  }

  // The numeric checks below are written as !(value >= min) / !(value <= max)
  // so that NaN fails them; `NaN < min` and `NaN > max` are both false.

  // Validate proxy configuration
  if (localizationConfig.proxy.enabled) {
    if (!(localizationConfig.proxy.rotation.interval >= 60000)) {
      errors.push('PROXY_ROTATION_INTERVAL should be at least 60000ms (1 minute)');
    }

    if (!(localizationConfig.proxy.healthCheck.interval >= 60000)) {
      errors.push('PROXY_HEALTH_CHECK_INTERVAL should be at least 60000ms (1 minute)');
    }
  }

  // Validate translation configuration
  if (localizationConfig.translation.enabled) {
    const validProviders = ['google', 'azure', 'libre'];
    if (!validProviders.includes(localizationConfig.translation.defaultProvider)) {
      errors.push(`TRANSLATION_PROVIDER must be one of: ${validProviders.join(', ')}`);
    }
  }

  // Validate geo-blocking configuration
  if (localizationConfig.geoBlocking.autoBypass) {
    if (!(localizationConfig.geoBlocking.maxRetries <= 10)) {
      errors.push('GEO_BLOCKING_MAX_RETRIES should not exceed 10');
    }

    if (!(localizationConfig.geoBlocking.retryDelay >= 1000)) {
      errors.push('GEO_BLOCKING_RETRY_DELAY should be at least 1000ms');
    }
  }

  return errors;
}
614
+
615
+ export default config;