crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,482 @@
1
+ import { z } from 'zod';
2
+ import { SearchProviderFactory } from './adapters/searchProviderFactory.js';
3
+ import { CacheManager } from '../../core/cache/CacheManager.js';
4
+ import { QueryExpander } from './queryExpander.js';
5
+ import { ResultRanker } from './ranking/ResultRanker.js';
6
+ import { ResultDeduplicator } from './ranking/ResultDeduplicator.js';
7
+ import LocalizationManager from '../../core/LocalizationManager.js';
8
+
9
// Builders for the two field shapes that recur throughout the schema.
// Each call creates a fresh zod schema instance.
const optionalFlag = (fallback) => z.boolean().optional().default(fallback);
const optionalFraction = () => z.number().min(0).max(1).optional();

// Per-request switches forwarded to the query expander.
const expansionOptionsSchema = z.object({
  enableSynonyms: z.boolean().optional(),
  enableSpellCheck: z.boolean().optional(),
  enableStemming: z.boolean().optional(),
  enablePhraseDetection: z.boolean().optional(),
  enableBooleanOperators: z.boolean().optional(),
  maxExpansions: z.number().min(1).max(10).optional()
});

// Relative weights of the ranking signals, each within [0, 1].
const rankingWeightsSchema = z.object({
  bm25: optionalFraction(),
  semantic: optionalFraction(),
  authority: optionalFraction(),
  freshness: optionalFraction()
});

// Similarity thresholds used by deduplication, each within [0, 1].
const deduplicationThresholdsSchema = z.object({
  url: optionalFraction(),
  title: optionalFraction(),
  content: optionalFraction(),
  combined: optionalFraction()
});

// Optional localization request: country, language, timezone and geo-targeting.
const localizationSchema = z.object({
  countryCode: z.string().length(2).optional(),
  language: z.string().optional(),
  timezone: z.string().optional(),
  enableGeoTargeting: z.boolean().default(false),
  customLocation: z.object({
    latitude: z.number().min(-90).max(90),
    longitude: z.number().min(-180).max(180)
  }).optional()
});

/**
 * Validation schema for the parameters accepted by SearchWebTool#execute.
 * Only `query` is required; the remaining fields are optional and several
 * carry defaults that zod fills in during parsing.
 */
const SearchWebSchema = z.object({
  query: z.string().min(1),
  limit: z.number().min(1).max(100).optional().default(10),
  offset: z.number().min(0).optional().default(0),
  lang: z.string().optional().default('en'),
  safe_search: optionalFlag(true),
  time_range: z.enum(['day', 'week', 'month', 'year', 'all']).optional().default('all'),
  site: z.string().optional(),
  file_type: z.string().optional(),
  expand_query: optionalFlag(true),
  expansion_options: expansionOptionsSchema.optional(),

  // Ranking options
  enable_ranking: optionalFlag(true),
  ranking_weights: rankingWeightsSchema.optional(),

  // Deduplication options
  enable_deduplication: optionalFlag(true),
  deduplication_thresholds: deduplicationThresholdsSchema.optional(),

  // Output options
  include_ranking_details: optionalFlag(false),
  include_deduplication_details: optionalFlag(false),

  // Localization options
  localization: localizationSchema.optional()
});
62
+
63
/**
 * Web search tool.
 *
 * Validates the request, optionally localizes and expands the query, runs it
 * through the configured search adapter, then deduplicates / ranks the results
 * and caches the final response.
 */
export class SearchWebTool {
  /**
   * @param {object} [options]
   * @param {string} [options.provider='auto'] - 'google', 'duckduckgo' or 'auto'.
   * @param {object} [options.google] - Google adapter settings (apiKey, searchEngineId are checked here).
   * @param {object} [options.duckduckgo] - DuckDuckGo adapter settings.
   * @param {boolean} [options.cacheEnabled=true] - When false, no response cache is created.
   * @param {number} [options.cacheTTL=3600000] - Cache TTL passed to the cache, ranker and deduplicator.
   * @param {object} [options.expanderOptions] - Passed to QueryExpander.
   * @param {object} [options.rankingOptions] - Merged over the cache settings for ResultRanker.
   * @param {object} [options.deduplicationOptions] - Merged over the cache settings for ResultDeduplicator.
   * @param {boolean} [options.enableGeoBlockingBypass] - Forwarded to LocalizationManager; on unless exactly `false`.
   * @param {boolean} [options.dynamicFingerprinting] - Forwarded to LocalizationManager; on unless exactly `false`.
   * @throws {Error} If the provider cannot be resolved or its adapter cannot be created.
   */
  constructor(options = {}) {
    const {
      provider = 'auto',
      google = {},
      duckduckgo = {},
      cacheEnabled = true,
      cacheTTL = 3600000, // 1 hour
      expanderOptions = {},
      rankingOptions = {},
      deduplicationOptions = {}
    } = options;

    // Determine which provider to use
    this.provider = this.determineProvider(provider, { google, duckduckgo });

    // Create the search adapter
    try {
      this.searchAdapter = SearchProviderFactory.createAdapter(this.provider, {
        google,
        duckduckgo
      });
    } catch (error) {
      // Keep the original error reachable through `cause` for debugging.
      throw new Error(
        `Failed to initialize search provider '${this.provider}': ${error.message}`,
        { cause: error }
      );
    }

    this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;

    // Initialize query expander
    this.queryExpander = new QueryExpander(expanderOptions);

    // Initialize ranking and deduplication systems
    this.resultRanker = new ResultRanker({ cacheEnabled, cacheTTL, ...rankingOptions });
    this.resultDeduplicator = new ResultDeduplicator({ cacheEnabled, cacheTTL, ...deduplicationOptions });

    // Initialize localization manager
    this.localizationManager = new LocalizationManager({
      enableGeoBlockingBypass: options.enableGeoBlockingBypass !== false,
      dynamicFingerprinting: options.dynamicFingerprinting !== false
    });
  }

  /**
   * Resolve the configured provider name to a concrete provider.
   *
   * @param {string} configuredProvider - 'google', 'duckduckgo' or 'auto' (case-insensitive).
   *   Non-string or unrecognised values are treated as 'auto'.
   * @param {{google?: {apiKey?: string, searchEngineId?: string}}} providerOptions
   * @returns {'google'|'duckduckgo'}
   * @throws {Error} If 'google' is requested without both apiKey and searchEngineId.
   */
  determineProvider(configuredProvider, providerOptions) {
    // A non-string value (e.g. an explicit null) used to crash on toLowerCase();
    // it now falls back to auto, like any unknown name already did.
    const requested = typeof configuredProvider === 'string'
      ? configuredProvider.toLowerCase()
      : 'auto';
    const googleOptions = providerOptions?.google;
    const hasGoogleCredentials = Boolean(googleOptions?.apiKey && googleOptions?.searchEngineId);

    switch (requested) {
      case 'google':
        if (!hasGoogleCredentials) {
          throw new Error('Google provider requires apiKey and searchEngineId');
        }
        return 'google';

      case 'duckduckgo':
        return 'duckduckgo';

      case 'auto':
      default:
        // Auto mode: prefer Google if credentials available, otherwise use DuckDuckGo
        return hasGoogleCredentials ? 'google' : 'duckduckgo';
    }
  }

  /**
   * Run a search.
   *
   * @param {object} params - Raw parameters; validated against SearchWebSchema.
   * @returns {Promise<object>} Response with `results`, paging info, provider,
   *   localization and processing metadata. `cached` is true when served from cache.
   * @throws {Error} "Search failed: ..." wrapping any validation or search error;
   *   the original error is attached as `cause`.
   */
  async execute(params) {
    try {
      const validated = SearchWebSchema.parse(params);

      // Apply localization if specified; failure is non-fatal.
      let localizedParams = validated;
      if (validated.localization) {
        try {
          localizedParams = await this.localizationManager.localizeSearchQuery(
            validated,
            validated.localization.countryCode
          );
        } catch (localizationError) {
          console.warn('Localization failed, using original parameters:', localizationError.message);
          // Continue with original parameters
        }
      }

      // Expand query if enabled; failure is non-fatal.
      let searchQueries = [localizedParams.query];
      let expandedQueries = [];

      if (localizedParams.expand_query) {
        try {
          expandedQueries = await this.queryExpander.expandQuery(
            localizedParams.query,
            localizedParams.expansion_options || {}
          );

          // Only switch to the expanded list when it offers more than one candidate.
          if (expandedQueries.length > 1) {
            searchQueries = expandedQueries;
          }
        } catch (expansionError) {
          console.warn('Query expansion failed, using original query:', expansionError.message);
          // Continue with original query
        }
      }

      // Generate cache key (include expansion and localization info for accurate caching)
      const cacheKey = this.cache ? this.cache.generateKey('search', {
        ...localizedParams,
        expandedQueries: localizedParams.expand_query ? expandedQueries : undefined,
        localization: validated.localization
      }) : null;

      // Check cache
      if (this.cache) {
        const cached = await this.cache.get(cacheKey);
        if (cached) {
          return {
            ...cached,
            cached: true
          };
        }
      }

      // Try the candidate queries in order and stop at the first one with items.
      let bestResults = null;
      let usedQuery = validated.query;
      let searchError = null;

      for (let i = 0; i < searchQueries.length; i++) {
        try {
          // Build search query with modifiers
          let searchQuery = searchQueries[i];

          if (validated.site) {
            searchQuery = `site:${validated.site} ${searchQuery}`;
          }

          if (validated.file_type) {
            searchQuery = `filetype:${validated.file_type} ${searchQuery}`;
          }

          // Perform search with localized parameters
          const searchParams = {
            query: searchQuery,
            num: localizedParams.limit,
            start: localizedParams.offset + 1, // Google uses 1-based indexing
            lr: localizedParams.lr || `lang_${localizedParams.lang}`,
            safe: localizedParams.safe_search ? 'active' : 'off',
            dateRestrict: this.getDateRestrict(localizedParams.time_range),
            // Add localization-specific parameters
            ...localizedParams.headers && { headers: localizedParams.headers },
            cr: localizedParams.cr, // Country restrict
            uule: localizedParams.uule // Location encoding
          };

          const results = await this.searchAdapter.search(searchParams);

          if (results.items && results.items.length > 0) {
            bestResults = results;
            usedQuery = searchQueries[i];
            break;
          } else if (i === 0) {
            // Keep the first query's (empty) response as a fallback.
            bestResults = results;
            usedQuery = searchQueries[i];
          }
        } catch (error) {
          // Remember the latest failure; it is rethrown after the loop if no
          // query produced a usable response.
          searchError = error;
          console.warn(`Search failed for query "${searchQueries[i]}":`, error.message);
        }
      }

      if (!bestResults) {
        throw searchError || new Error('All search queries failed');
      }

      // Process and enrich results
      let processedResults = await this.processResults(bestResults);

      // Apply deduplication if enabled (needs at least two results)
      let deduplicationInfo = null;
      if (validated.enable_deduplication && processedResults.length > 1) {
        const dedupeOptions = validated.deduplication_thresholds ?
          { thresholds: validated.deduplication_thresholds } : {};

        const originalCount = processedResults.length;
        processedResults = await this.resultDeduplicator.deduplicateResults(
          processedResults,
          dedupeOptions
        );

        deduplicationInfo = {
          originalCount,
          finalCount: processedResults.length,
          duplicatesRemoved: originalCount - processedResults.length,
          deduplicationRate: ((originalCount - processedResults.length) / originalCount * 100).toFixed(1) + '%'
        };
      }

      // Apply ranking if enabled (needs at least two results)
      let rankingInfo = null;
      if (validated.enable_ranking && processedResults.length > 1) {
        const rankingOptions = validated.ranking_weights ?
          { weights: validated.ranking_weights } : {};

        processedResults = await this.resultRanker.rankResults(
          processedResults,
          validated.query,
          rankingOptions
        );

        rankingInfo = {
          algorithmsUsed: ['bm25', 'semantic', 'authority', 'freshness'],
          weightsApplied: this.resultRanker.options.weights,
          totalResults: processedResults.length
        };
      }

      // Strip per-result detail fields unless the caller asked for them.
      if (!validated.include_ranking_details) {
        processedResults = processedResults.map(result => {
          const { rankingDetails, ...cleanResult } = result;
          return cleanResult;
        });
      }

      if (!validated.include_deduplication_details) {
        processedResults = processedResults.map(result => {
          const { deduplicationInfo, ...cleanResult } = result;
          return cleanResult;
        });
      }

      const hasExpansion = localizedParams.expand_query && expandedQueries.length > 1;

      const response = {
        query: validated.query,
        effective_query: usedQuery !== validated.query ? usedQuery : undefined,
        expanded_queries: hasExpansion ? expandedQueries : undefined,
        results: processedResults,
        total_results: bestResults.searchInformation?.totalResults || 0,
        search_time: bestResults.searchInformation?.searchTime || 0,
        offset: localizedParams.offset,
        limit: localizedParams.limit,
        cached: false,

        // Add provider information
        provider: {
          name: this.provider,
          capabilities: SearchProviderFactory.getProviderCapabilities(this.provider)
        },

        // Add localization information
        localization: validated.localization ? {
          applied: true,
          countryCode: validated.localization.countryCode,
          language: localizedParams.lang,
          searchDomain: localizedParams.searchDomain,
          geoTargeting: validated.localization.enableGeoTargeting
        } : null,

        // Add processing information
        processing: {
          ranking: rankingInfo,
          deduplication: deduplicationInfo,
          query_expansion: hasExpansion ? {
            original_query: validated.query,
            expanded_count: expandedQueries.length,
            used_query: usedQuery
          } : null,
          localization_applied: !!validated.localization
        }
      };

      // Cache the results
      if (this.cache) {
        await this.cache.set(cacheKey, response);
      }

      return response;
    } catch (error) {
      // Same message as before; the original error stays reachable via `cause`.
      throw new Error(`Search failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Map raw adapter items to the tool's result shape.
   *
   * @param {{items?: Array<object>}} searchResults
   * @returns {Promise<Array<object>>} Empty array when there are no items.
   */
  async processResults(searchResults) {
    if (!searchResults.items || searchResults.items.length === 0) {
      return [];
    }

    return searchResults.items.map(item => ({
      title: item.title || '',
      link: item.link || '',
      snippet: item.snippet || '',
      displayLink: item.displayLink || '',
      formattedUrl: item.formattedUrl || '',
      htmlSnippet: item.htmlSnippet || '',
      pagemap: this.extractPagemap(item.pagemap),
      metadata: {
        mime: item.mime,
        fileFormat: item.fileFormat,
        cacheId: item.cacheId
      }
    }));
  }

  /**
   * Pick the metatags, thumbnail and image out of a result's pagemap.
   *
   * @param {object} [pagemap]
   * @returns {object} Object with optional `metatags`, `thumbnail` and `image` keys;
   *   empty when `pagemap` is missing.
   */
  extractPagemap(pagemap) {
    if (!pagemap) return {};

    const extracted = {};

    // Extract metatags (only the first entry is read)
    if (pagemap.metatags && pagemap.metatags[0]) {
      const meta = pagemap.metatags[0];
      extracted.metatags = {
        title: meta['og:title'] || meta['twitter:title'] || '',
        description: meta['og:description'] || meta['twitter:description'] || meta.description || '',
        image: meta['og:image'] || meta['twitter:image'] || '',
        author: meta.author || '',
        publishedTime: meta['article:published_time'] || '',
        modifiedTime: meta['article:modified_time'] || ''
      };
    }

    // Extract CSE thumbnail
    if (pagemap.cse_thumbnail && pagemap.cse_thumbnail[0]) {
      extracted.thumbnail = {
        src: pagemap.cse_thumbnail[0].src,
        width: pagemap.cse_thumbnail[0].width,
        height: pagemap.cse_thumbnail[0].height
      };
    }

    // Extract CSE image
    if (pagemap.cse_image && pagemap.cse_image[0]) {
      extracted.image = pagemap.cse_image[0].src;
    }

    return extracted;
  }

  /**
   * Translate a `time_range` value into the adapter's `dateRestrict` code.
   *
   * @param {string} timeRange - 'day', 'week', 'month', 'year' or 'all'.
   * @returns {string} 'd1', 'w1', 'm1', 'y1', or '' for 'all' and unknown values.
   */
  getDateRestrict(timeRange) {
    const ranges = {
      'day': 'd1',
      'week': 'w1',
      'month': 'm1',
      'year': 'y1',
      'all': ''
    };

    return ranges[timeRange] || '';
  }

  /**
   * Expand a query via QueryExpander, with a simple built-in fallback.
   *
   * @param {string} query
   * @param {object} [options] - Passed through to QueryExpander#expandQuery.
   * @returns {Promise<Array<string>>} Expanded queries. On expander failure: the
   *   query itself, a quoted exact-match form (if the query has no quotes) and an
   *   OR-joined form of its words longer than two characters (if more than one).
   */
  async expandQuery(query, options = {}) {
    // Enhanced query expansion using QueryExpander
    try {
      return await this.queryExpander.expandQuery(query, options);
    } catch (error) {
      console.warn('Advanced query expansion failed, falling back to simple expansion:', error.message);

      // Fallback to simple expansion for backward compatibility
      const expansions = [query];

      // Add quoted exact match
      if (!query.includes('"')) {
        expansions.push(`"${query}"`);
      }

      // Add OR variations for multi-word queries
      const words = query.split(' ').filter(w => w.length > 2);
      if (words.length > 1) {
        expansions.push(words.join(' OR '));
      }

      return expansions;
    }
  }

  /**
   * Generate query suggestions.
   *
   * @param {string} query
   * @returns {Promise<Array<string>>} Suggestions, or an empty array if generation fails.
   */
  async generateSuggestions(query) {
    try {
      // `await` is required here: without it a rejected promise from the
      // expander would skip this catch block and reject the caller instead.
      return await this.queryExpander.generateSuggestions(query);
    } catch (error) {
      console.warn('Suggestion generation failed:', error.message);
      return [];
    }
  }

  /**
   * Collect statistics from the tool's components.
   *
   * @returns {object} Provider info plus cache, expander, ranking and
   *   deduplication stats (null for any component that is not present).
   */
  getStats() {
    return {
      provider: {
        name: this.provider,
        capabilities: SearchProviderFactory.getProviderCapabilities(this.provider)
      },
      cacheStats: this.cache ? this.cache.getStats() : null,
      queryExpanderStats: this.queryExpander ? this.queryExpander.getStats() : null,
      rankingStats: this.resultRanker ? this.resultRanker.getStats() : null,
      deduplicationStats: this.resultDeduplicator ? this.resultDeduplicator.getStats() : null
    };
  }

  /**
   * Describe the active provider and the providers known to the factory.
   *
   * @returns {object} `activeProvider`, its `capabilities`, `supportedProviders`
   *   and `allProviders` as reported by SearchProviderFactory.
   */
  getProviderInfo() {
    return {
      activeProvider: this.provider,
      capabilities: SearchProviderFactory.getProviderCapabilities(this.provider),
      supportedProviders: SearchProviderFactory.getSupportedProviders(),
      allProviders: SearchProviderFactory.compareProviders()
    };
  }
}
481
+
482
+ export default SearchWebTool;