crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,449 @@
1
+ import { z } from 'zod';
2
+ import { BFSCrawler } from '../../core/crawlers/BFSCrawler.js';
3
+ import { DomainFilter } from '../../utils/domainFilter.js';
4
+
5
// ---------------------------------------------------------------------------
// Input schema for the crawl_deep tool, decomposed into named sub-schemas so
// each piece reads on its own. The composed schema validates exactly the same
// shape (same keys, bounds, and defaults) as before.
// ---------------------------------------------------------------------------

/** Tunables forwarded to the link analyzer. */
const LinkAnalysisOptionsSchema = z.object({
  dampingFactor: z.number().min(0).max(1).optional().default(0.85),
  maxIterations: z.number().min(1).max(1000).optional().default(100),
  enableCaching: z.boolean().optional().default(true)
}).optional().default({});

/** Whitelist entry: a bare domain string, or a domain plus per-domain options. */
const WhitelistEntrySchema = z.union([
  z.string(),
  z.object({
    domain: z.string(),
    options: z.object({
      includeSubdomains: z.boolean().optional(),
      maxDepth: z.number().optional(),
      rateLimit: z.number().optional(),
      customHeaders: z.record(z.string()).optional(),
      timeout: z.number().optional()
    }).optional()
  })
]);

/** Blacklist entry: a bare domain string, or a domain plus block metadata. */
const BlacklistEntrySchema = z.union([
  z.string(),
  z.object({
    domain: z.string(),
    options: z.object({
      includeSubdomains: z.boolean().optional(),
      reason: z.string().optional(),
      permanent: z.boolean().optional()
    }).optional()
  })
]);

/** Per-domain crawl overrides (keyed by domain in `domain_rules`). */
const DomainRuleSchema = z.object({
  maxDepth: z.number().optional(),
  rateLimit: z.number().optional(),
  respectRobots: z.boolean().optional(),
  allowedPaths: z.array(z.string()).optional(),
  blockedPaths: z.array(z.string()).optional(),
  customHeaders: z.record(z.string()).optional(),
  timeout: z.number().optional(),
  maxPages: z.number().optional(),
  concurrency: z.number().optional()
});

/** Inline domain-filter configuration (whitelist / blacklist / rules). */
const DomainFilterSchema = z.object({
  whitelist: z.array(WhitelistEntrySchema).optional().default([]),
  blacklist: z.array(BlacklistEntrySchema).optional().default([]),
  domain_rules: z.record(DomainRuleSchema).optional().default({})
}).optional();

/** Top-level parameters accepted by CrawlDeepTool.execute(). */
const CrawlDeepSchema = z.object({
  url: z.string().url(),
  max_depth: z.number().min(1).max(5).optional().default(3),
  max_pages: z.number().min(1).max(1000).optional().default(100),
  include_patterns: z.array(z.string()).optional().default([]),
  exclude_patterns: z.array(z.string()).optional().default([]),
  follow_external: z.boolean().optional().default(false),
  respect_robots: z.boolean().optional().default(true),
  extract_content: z.boolean().optional().default(true),
  concurrency: z.number().min(1).max(20).optional().default(10),
  enable_link_analysis: z.boolean().optional().default(true),
  link_analysis_options: LinkAnalysisOptionsSchema,
  domain_filter: DomainFilterSchema,
  import_filter_config: z.string().optional() // JSON string of exported config
});
61
+
62
+ export class CrawlDeepTool {
63
+ constructor(options = {}) {
64
+ const {
65
+ userAgent = 'MCP-WebScraper/1.0',
66
+ timeout = 30000
67
+ } = options;
68
+
69
+ this.userAgent = userAgent;
70
+ this.timeout = timeout;
71
+ }
72
+
73
+ async execute(params) {
74
+ try {
75
+ const validated = CrawlDeepSchema.parse(params);
76
+
77
+ // Create domain filter if configuration provided
78
+ let domainFilter = null;
79
+ if (validated.import_filter_config) {
80
+ // Import from exported configuration
81
+ domainFilter = new DomainFilter();
82
+ try {
83
+ const importConfig = JSON.parse(validated.import_filter_config);
84
+ domainFilter.importConfig(importConfig);
85
+ } catch (error) {
86
+ throw new Error(`Invalid filter configuration: ${error.message}`);
87
+ }
88
+ } else if (validated.domain_filter) {
89
+ // Create from inline configuration
90
+ domainFilter = new DomainFilter({
91
+ allowSubdomains: !validated.follow_external,
92
+ defaultMaxDepth: validated.max_depth,
93
+ defaultRateLimit: 10
94
+ });
95
+
96
+ // Configure whitelist
97
+ for (const item of validated.domain_filter.whitelist) {
98
+ if (typeof item === 'string') {
99
+ domainFilter.addWhitelistDomain(item);
100
+ } else {
101
+ domainFilter.addWhitelistDomain(item.domain, item.options || {});
102
+ }
103
+ }
104
+
105
+ // Configure blacklist
106
+ for (const item of validated.domain_filter.blacklist) {
107
+ if (typeof item === 'string') {
108
+ domainFilter.addBlacklistDomain(item);
109
+ } else {
110
+ domainFilter.addBlacklistDomain(item.domain, item.options || {});
111
+ }
112
+ }
113
+
114
+ // Configure domain rules
115
+ for (const [domain, rules] of Object.entries(validated.domain_filter.domain_rules)) {
116
+ domainFilter.setDomainRules(domain, rules);
117
+ }
118
+ }
119
+
120
+ // Create crawler instance
121
+ const crawler = new BFSCrawler({
122
+ maxDepth: validated.max_depth,
123
+ maxPages: validated.max_pages,
124
+ followExternal: validated.follow_external,
125
+ respectRobots: validated.respect_robots,
126
+ userAgent: this.userAgent,
127
+ timeout: this.timeout,
128
+ concurrency: validated.concurrency,
129
+ domainFilter: domainFilter,
130
+ enableLinkAnalysis: validated.enable_link_analysis,
131
+ linkAnalyzerOptions: validated.link_analysis_options
132
+ });
133
+
134
+ // Start crawling
135
+ const startTime = Date.now();
136
+ const results = await crawler.crawl(validated.url, {
137
+ includePatterns: validated.include_patterns,
138
+ excludePatterns: validated.exclude_patterns,
139
+ extractContent: validated.extract_content
140
+ });
141
+ const duration = Date.now() - startTime;
142
+
143
+ // Process and format results
144
+ const response = {
145
+ url: validated.url,
146
+ crawl_depth: validated.max_depth,
147
+ pages_crawled: results.urls.length,
148
+ pages_found: results.results.length,
149
+ errors: results.errors.length,
150
+ duration_ms: duration,
151
+ pages_per_second: results.urls.length / (duration / 1000),
152
+ results: this.formatResults(results.results, validated.extract_content),
153
+ errors: results.errors,
154
+ stats: results.stats,
155
+ site_structure: this.analyzeSiteStructure(results.urls),
156
+ domain_filter_config: domainFilter ? domainFilter.exportConfig() : null,
157
+ link_analysis: results.linkAnalysis
158
+ };
159
+
160
+ return response;
161
+ } catch (error) {
162
+ throw new Error(`Crawl failed: ${error.message}`);
163
+ }
164
+ }
165
+
166
+ formatResults(results, includeContent) {
167
+ return results.map(result => {
168
+ const formatted = {
169
+ url: result.url,
170
+ depth: result.depth,
171
+ title: result.title,
172
+ links_count: result.links,
173
+ content_length: result.contentLength,
174
+ timestamp: result.timestamp
175
+ };
176
+
177
+ if (includeContent) {
178
+ formatted.content = result.content ? result.content.substring(0, 500) + '...' : '';
179
+ formatted.metadata = result.metadata;
180
+ }
181
+
182
+ return formatted;
183
+ });
184
+ }
185
+
186
+ analyzeSiteStructure(urls) {
187
+ const structure = {
188
+ total_pages: urls.length,
189
+ depth_distribution: {},
190
+ path_patterns: {},
191
+ file_types: {},
192
+ subdomains: new Set()
193
+ };
194
+
195
+ for (const url of urls) {
196
+ try {
197
+ const urlObj = new URL(url);
198
+
199
+ // Analyze depth
200
+ const depth = urlObj.pathname.split('/').filter(s => s).length;
201
+ structure.depth_distribution[depth] = (structure.depth_distribution[depth] || 0) + 1;
202
+
203
+ // Analyze path patterns
204
+ const pathSegments = urlObj.pathname.split('/').filter(s => s);
205
+ if (pathSegments.length > 0) {
206
+ const firstSegment = pathSegments[0];
207
+ structure.path_patterns[firstSegment] = (structure.path_patterns[firstSegment] || 0) + 1;
208
+ }
209
+
210
+ // Analyze file types
211
+ const extension = this.getFileExtension(urlObj.pathname);
212
+ if (extension) {
213
+ structure.file_types[extension] = (structure.file_types[extension] || 0) + 1;
214
+ }
215
+
216
+ // Collect subdomains
217
+ structure.subdomains.add(urlObj.hostname);
218
+ } catch {
219
+ // Skip invalid URLs
220
+ }
221
+ }
222
+
223
+ structure.subdomains = Array.from(structure.subdomains);
224
+
225
+ return structure;
226
+ }
227
+
228
+ getFileExtension(pathname) {
229
+ const match = pathname.match(/\.([a-z0-9]+)$/i);
230
+ return match ? match[1].toLowerCase() : null;
231
+ }
232
+
233
+ async executeBatch(urls, options = {}) {
234
+ const results = [];
235
+
236
+ for (const url of urls) {
237
+ try {
238
+ const result = await this.execute({ ...options, url });
239
+ results.push({ url, success: true, result });
240
+ } catch (error) {
241
+ results.push({ url, success: false, error: error.message });
242
+ }
243
+ }
244
+
245
+ return results;
246
+ }
247
+
248
+ /**
249
+ * Export link graph from crawler results
250
+ * @param {Object} crawler - BFSCrawler instance
251
+ * @param {string} format - Export format ('json', 'dot', 'csv', 'adjacency')
252
+ * @param {Object} options - Export options
253
+ * @returns {string|Object} Exported graph data
254
+ */
255
+ exportLinkGraph(crawler, format = 'json', options = {}) {
256
+ if (!crawler || !crawler.getLinkAnalyzer()) {
257
+ throw new Error('Crawler with link analysis is required');
258
+ }
259
+
260
+ return crawler.exportLinkGraph(format, options);
261
+ }
262
+
263
+ /**
264
+ * Get relationship path between two URLs from crawler
265
+ * @param {Object} crawler - BFSCrawler instance
266
+ * @param {string} url1 - Starting URL
267
+ * @param {string} url2 - Target URL
268
+ * @param {Object} options - Path finding options
269
+ * @returns {Object|null} Path object or null
270
+ */
271
+ getRelationshipPath(crawler, url1, url2, options = {}) {
272
+ if (!crawler || !crawler.getLinkAnalyzer()) {
273
+ throw new Error('Crawler with link analysis is required');
274
+ }
275
+
276
+ const linkAnalyzer = crawler.getLinkAnalyzer();
277
+ return linkAnalyzer.getRelationshipPath(url1, url2, options);
278
+ }
279
+
280
+ /**
281
+ * Analyze site structure with enhanced link analysis
282
+ * @param {Array} urls - Crawled URLs
283
+ * @param {Object} linkAnalysis - Link analysis results
284
+ * @returns {Object} Enhanced site structure analysis
285
+ */
286
+ analyzeEnhancedSiteStructure(urls, linkAnalysis = null) {
287
+ const basicStructure = this.analyzeSiteStructure(urls);
288
+
289
+ if (!linkAnalysis) {
290
+ return basicStructure;
291
+ }
292
+
293
+ return {
294
+ ...basicStructure,
295
+ link_metrics: {
296
+ total_links: linkAnalysis.statistics?.links || 0,
297
+ link_density: linkAnalysis.statistics?.density || 0,
298
+ avg_outbound_links: linkAnalysis.statistics?.avgOutboundLinks || 0,
299
+ avg_inbound_links: linkAnalysis.statistics?.avgInboundLinks || 0
300
+ },
301
+ authority_pages: linkAnalysis.hubsAndAuthorities?.authorities?.slice(0, 5) || [],
302
+ hub_pages: linkAnalysis.hubsAndAuthorities?.hubs?.slice(0, 5) || [],
303
+ most_important_pages: linkAnalysis.importance?.topPages?.slice(0, 10) || [],
304
+ circular_references: linkAnalysis.cycles?.length || 0,
305
+ domain_connectivity: linkAnalysis.domainAnalysis?.domainConnectivity || {},
306
+ link_patterns: linkAnalysis.linkPatterns?.linkDistribution || {}
307
+ };
308
+ }
309
+
310
+ /**
311
+ * Generate link analysis summary report
312
+ * @param {Object} linkAnalysis - Link analysis results
313
+ * @returns {Object} Summary report
314
+ */
315
+ generateLinkAnalysisSummary(linkAnalysis) {
316
+ if (!linkAnalysis) {
317
+ return { error: 'No link analysis data provided' };
318
+ }
319
+
320
+ const summary = {
321
+ overview: {
322
+ total_pages: linkAnalysis.statistics?.nodes || 0,
323
+ total_links: linkAnalysis.statistics?.links || 0,
324
+ link_density: (linkAnalysis.statistics?.density || 0).toFixed(4),
325
+ analysis_time: linkAnalysis.analysisTime || 0,
326
+ generated_at: linkAnalysis.generatedAt
327
+ },
328
+ key_metrics: {
329
+ most_important_page: linkAnalysis.importance?.topPages?.[0] || null,
330
+ highest_authority: linkAnalysis.hubsAndAuthorities?.authorities?.[0] || null,
331
+ main_hub: linkAnalysis.hubsAndAuthorities?.hubs?.[0] || null,
332
+ circular_references: linkAnalysis.cycles?.length || 0,
333
+ domains_analyzed: linkAnalysis.domainAnalysis?.totalDomains || 0
334
+ },
335
+ link_patterns: {
336
+ internal_vs_external: {
337
+ internal: linkAnalysis.linkPatterns?.linkDistribution?.internal || 0,
338
+ external: linkAnalysis.linkPatterns?.linkDistribution?.external || 0
339
+ },
340
+ top_anchor_texts: linkAnalysis.linkPatterns?.topAnchorTexts?.slice(0, 5) || [],
341
+ top_path_patterns: linkAnalysis.linkPatterns?.topPathPatterns?.slice(0, 5) || []
342
+ },
343
+ recommendations: this.generateLinkRecommendations(linkAnalysis)
344
+ };
345
+
346
+ return summary;
347
+ }
348
+
349
+ /**
350
+ * Generate SEO and structural recommendations based on link analysis
351
+ * @param {Object} linkAnalysis - Link analysis results
352
+ * @returns {Array} Array of recommendations
353
+ */
354
+ generateLinkRecommendations(linkAnalysis) {
355
+ const recommendations = [];
356
+
357
+ if (!linkAnalysis || !linkAnalysis.statistics) {
358
+ return recommendations;
359
+ }
360
+
361
+ const stats = linkAnalysis.statistics;
362
+ const patterns = linkAnalysis.linkPatterns;
363
+ const cycles = linkAnalysis.cycles || [];
364
+
365
+ // Link density recommendations
366
+ if (stats.density < 0.01) {
367
+ recommendations.push({
368
+ type: 'link_density',
369
+ priority: 'medium',
370
+ issue: 'Low link density detected',
371
+ description: 'The site has very few internal links relative to the number of pages',
372
+ suggestion: 'Consider adding more internal links to improve navigation and SEO'
373
+ });
374
+ }
375
+
376
+ // Hub/Authority balance
377
+ const hubs = linkAnalysis.hubsAndAuthorities?.hubs || [];
378
+ const authorities = linkAnalysis.hubsAndAuthorities?.authorities || [];
379
+
380
+ if (hubs.length === 0) {
381
+ recommendations.push({
382
+ type: 'hub_pages',
383
+ priority: 'low',
384
+ issue: 'No hub pages identified',
385
+ description: 'No pages with many outbound links were found',
386
+ suggestion: 'Consider creating navigation or category pages that link to multiple content pages'
387
+ });
388
+ }
389
+
390
+ if (authorities.length === 0) {
391
+ recommendations.push({
392
+ type: 'authority_pages',
393
+ priority: 'medium',
394
+ issue: 'No authority pages identified',
395
+ description: 'No pages with many inbound links were found',
396
+ suggestion: 'Focus on creating high-quality content that other pages naturally link to'
397
+ });
398
+ }
399
+
400
+ // Circular reference warnings
401
+ if (cycles.length > 0) {
402
+ const strongCycles = cycles.filter(cycle => cycle.strength > 2);
403
+ if (strongCycles.length > 0) {
404
+ recommendations.push({
405
+ type: 'circular_references',
406
+ priority: 'low',
407
+ issue: `${strongCycles.length} circular reference chains detected`,
408
+ description: 'Some pages have circular linking patterns',
409
+ suggestion: 'Review circular links to ensure they provide value and don\'t confuse users'
410
+ });
411
+ }
412
+ }
413
+
414
+ // External vs internal link balance
415
+ if (patterns && patterns.linkDistribution) {
416
+ const internal = patterns.linkDistribution.internal || 0;
417
+ const external = patterns.linkDistribution.external || 0;
418
+ const total = internal + external;
419
+
420
+ if (total > 0) {
421
+ const externalRatio = external / total;
422
+ if (externalRatio > 0.3) {
423
+ recommendations.push({
424
+ type: 'external_links',
425
+ priority: 'low',
426
+ issue: 'High ratio of external links',
427
+ description: `${(externalRatio * 100).toFixed(1)}% of links are external`,
428
+ suggestion: 'Consider balancing with more internal links to keep users on your site'
429
+ });
430
+ }
431
+ }
432
+ }
433
+
434
+ // Orphaned pages (pages with no inbound links)
435
+ if (stats.nodes > 0 && stats.avgInboundLinks < 1) {
436
+ recommendations.push({
437
+ type: 'orphaned_pages',
438
+ priority: 'high',
439
+ issue: 'Potential orphaned pages detected',
440
+ description: 'Average inbound links per page is very low',
441
+ suggestion: 'Ensure all important pages can be reached through internal navigation'
442
+ });
443
+ }
444
+
445
+ return recommendations;
446
+ }
447
+ }
448
+
449
+ export default CrawlDeepTool;