crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,400 @@
1
+ import { z } from 'zod';
2
+ import { load } from 'cheerio';
3
+ import { DomainFilter } from '../../utils/domainFilter.js';
4
+ import { normalizeUrl, getBaseUrl } from '../../utils/urlNormalizer.js';
5
+
6
+ const MapSiteSchema = z.object({
7
+ url: z.string().url(),
8
+ include_sitemap: z.boolean().optional().default(true),
9
+ max_urls: z.number().min(1).max(10000).optional().default(1000),
10
+ group_by_path: z.boolean().optional().default(true),
11
+ include_metadata: z.boolean().optional().default(false),
12
+ // New domain filtering options
13
+ domain_filter: z.object({
14
+ whitelist: z.array(z.string()).optional().default([]),
15
+ blacklist: z.array(z.string()).optional().default([]),
16
+ include_patterns: z.array(z.string()).optional().default([]),
17
+ exclude_patterns: z.array(z.string()).optional().default([])
18
+ }).optional(),
19
+ import_filter_config: z.string().optional() // JSON string of exported config
20
+ });
21
+
22
+ export class MapSiteTool {
23
+ constructor(options = {}) {
24
+ const {
25
+ userAgent = 'MCP-WebScraper/1.0',
26
+ timeout = 10000
27
+ } = options;
28
+
29
+ this.userAgent = userAgent;
30
+ this.timeout = timeout;
31
+ }
32
+
33
+ async execute(params) {
34
+ try {
35
+ const validated = MapSiteSchema.parse(params);
36
+ const baseUrl = getBaseUrl(validated.url);
37
+ const urls = new Set();
38
+ const metadata = new Map();
39
+
40
+ // Create domain filter if configuration provided
41
+ let domainFilter = null;
42
+ if (validated.import_filter_config) {
43
+ // Import from exported configuration
44
+ domainFilter = new DomainFilter();
45
+ try {
46
+ const importConfig = JSON.parse(validated.import_filter_config);
47
+ domainFilter.importConfig(importConfig);
48
+ } catch (error) {
49
+ throw new Error(`Invalid filter configuration: ${error.message}`);
50
+ }
51
+ } else if (validated.domain_filter) {
52
+ // Create from inline configuration
53
+ domainFilter = new DomainFilter({ allowSubdomains: true });
54
+
55
+ // Configure domain filter
56
+ for (const domain of validated.domain_filter.whitelist) {
57
+ domainFilter.addWhitelistDomain(domain);
58
+ }
59
+ for (const domain of validated.domain_filter.blacklist) {
60
+ domainFilter.addBlacklistDomain(domain);
61
+ }
62
+ for (const pattern of validated.domain_filter.include_patterns) {
63
+ domainFilter.addPattern(pattern, 'include');
64
+ }
65
+ for (const pattern of validated.domain_filter.exclude_patterns) {
66
+ domainFilter.addPattern(pattern, 'exclude');
67
+ }
68
+ }
69
+
70
+ // Try to fetch sitemap first
71
+ if (validated.include_sitemap) {
72
+ const sitemapUrls = await this.fetchSitemapUrls(baseUrl, domainFilter);
73
+ sitemapUrls.forEach(url => urls.add(normalizeUrl(url)));
74
+ }
75
+
76
+ // Fetch and parse the main page for additional URLs
77
+ const pageUrls = await this.fetchPageUrls(validated.url, domainFilter);
78
+ pageUrls.forEach(url => {
79
+ if (urls.size < validated.max_urls) {
80
+ urls.add(normalizeUrl(url));
81
+ }
82
+ });
83
+
84
+ // Convert to array and limit
85
+ const urlArray = Array.from(urls).slice(0, validated.max_urls);
86
+
87
+ // Fetch metadata if requested
88
+ if (validated.include_metadata) {
89
+ await this.fetchMetadata(urlArray.slice(0, 50), metadata); // Limit metadata fetching
90
+ }
91
+
92
+ // Organize results
93
+ const organized = validated.group_by_path
94
+ ? this.groupByPath(urlArray)
95
+ : urlArray;
96
+
97
+ return {
98
+ base_url: baseUrl,
99
+ total_urls: urlArray.length,
100
+ urls: organized,
101
+ metadata: validated.include_metadata ? Object.fromEntries(metadata) : {},
102
+ site_map: this.generateSiteMap(urlArray),
103
+ statistics: this.generateStatistics(urlArray),
104
+ domain_filter_config: domainFilter ? domainFilter.exportConfig() : null,
105
+ filter_stats: domainFilter ? domainFilter.getStats() : null
106
+ };
107
+ } catch (error) {
108
+ throw new Error(`Site mapping failed: ${error.message}`);
109
+ }
110
+ }
111
+
112
+ async fetchSitemapUrls(baseUrl, domainFilter = null) {
113
+ const urls = new Set();
114
+ const sitemapUrls = [
115
+ `${baseUrl}/sitemap.xml`,
116
+ `${baseUrl}/sitemap_index.xml`,
117
+ `${baseUrl}/sitemap-index.xml`,
118
+ `${baseUrl}/sitemaps.xml`
119
+ ];
120
+
121
+ for (const sitemapUrl of sitemapUrls) {
122
+ try {
123
+ const response = await this.fetchWithTimeout(sitemapUrl);
124
+ if (response.ok) {
125
+ const xml = await response.text();
126
+ const extractedUrls = this.parseSitemap(xml);
127
+
128
+ // Apply domain filter if provided
129
+ extractedUrls.forEach(url => {
130
+ if (!domainFilter || domainFilter.isAllowed(url).allowed) {
131
+ urls.add(url);
132
+ }
133
+ });
134
+
135
+ // If we found a sitemap, don't try others
136
+ if (urls.size > 0) break;
137
+ }
138
+ } catch {
139
+ // Continue to next sitemap URL
140
+ }
141
+ }
142
+
143
+ return Array.from(urls);
144
+ }
145
+
146
+ parseSitemap(xml) {
147
+ const urls = new Set();
148
+
149
+ // Extract URLs from sitemap
150
+ const urlMatches = xml.match(/<loc>([^<]+)<\/loc>/g);
151
+ if (urlMatches) {
152
+ urlMatches.forEach(match => {
153
+ const url = match.replace(/<\/?loc>/g, '').trim();
154
+ if (url) urls.add(url);
155
+ });
156
+ }
157
+
158
+ // Check for nested sitemaps (sitemap index)
159
+ const sitemapMatches = xml.match(/<sitemap>[\s\S]*?<\/sitemap>/g);
160
+ if (sitemapMatches) {
161
+ for (const sitemapMatch of sitemapMatches) {
162
+ const locMatch = sitemapMatch.match(/<loc>([^<]+)<\/loc>/);
163
+ if (locMatch && locMatch[1]) {
164
+ // We could recursively fetch nested sitemaps here
165
+ // For now, just add the sitemap URL itself
166
+ urls.add(locMatch[1]);
167
+ }
168
+ }
169
+ }
170
+
171
+ return Array.from(urls);
172
+ }
173
+
174
+ async fetchPageUrls(url, domainFilter = null) {
175
+ try {
176
+ const response = await this.fetchWithTimeout(url);
177
+ if (!response.ok) {
178
+ return [];
179
+ }
180
+
181
+ const html = await response.text();
182
+ const $ = load(html);
183
+ const urls = new Set();
184
+ const baseUrl = getBaseUrl(url);
185
+
186
+ // Extract all links
187
+ $('a[href]').each((_, element) => {
188
+ const href = $(element).attr('href');
189
+ if (href && !href.startsWith('#') && !href.startsWith('javascript:')) {
190
+ try {
191
+ const absoluteUrl = new URL(href, url);
192
+ // Only include URLs from the same domain
193
+ if (absoluteUrl.origin === new URL(baseUrl).origin) {
194
+ const urlString = absoluteUrl.toString();
195
+
196
+ // Apply domain filter if provided
197
+ if (!domainFilter || domainFilter.isAllowed(urlString).allowed) {
198
+ urls.add(urlString);
199
+ }
200
+ }
201
+ } catch {
202
+ // Invalid URL, skip
203
+ }
204
+ }
205
+ });
206
+
207
+ return Array.from(urls);
208
+ } catch {
209
+ return [];
210
+ }
211
+ }
212
+
213
+ async fetchMetadata(urls, metadataMap) {
214
+ const promises = urls.slice(0, 10).map(async (url) => {
215
+ try {
216
+ const response = await this.fetchWithTimeout(url);
217
+ if (response.ok) {
218
+ const html = await response.text();
219
+ const $ = load(html);
220
+
221
+ metadataMap.set(url, {
222
+ title: $('title').text().trim(),
223
+ description: $('meta[name="description"]').attr('content') || '',
224
+ keywords: $('meta[name="keywords"]').attr('content') || '',
225
+ h1: $('h1').first().text().trim(),
226
+ canonical: $('link[rel="canonical"]').attr('href') || ''
227
+ });
228
+ }
229
+ } catch {
230
+ // Skip metadata for failed URLs
231
+ }
232
+ });
233
+
234
+ await Promise.allSettled(promises);
235
+ }
236
+
237
+ async fetchWithTimeout(url) {
238
+ const controller = new AbortController();
239
+ const timeoutId = setTimeout(() => controller.abort(), this.timeout);
240
+
241
+ try {
242
+ const response = await fetch(url, {
243
+ signal: controller.signal,
244
+ headers: {
245
+ 'User-Agent': this.userAgent
246
+ }
247
+ });
248
+ clearTimeout(timeoutId);
249
+ return response;
250
+ } catch (error) {
251
+ clearTimeout(timeoutId);
252
+ throw error;
253
+ }
254
+ }
255
+
256
+ groupByPath(urls) {
257
+ const grouped = {};
258
+
259
+ for (const url of urls) {
260
+ try {
261
+ const urlObj = new URL(url);
262
+ const pathSegments = urlObj.pathname.split('/').filter(s => s);
263
+
264
+ if (pathSegments.length === 0) {
265
+ if (!grouped['/']) grouped['/'] = [];
266
+ grouped['/'].push(url);
267
+ } else {
268
+ const firstSegment = '/' + pathSegments[0];
269
+ if (!grouped[firstSegment]) grouped[firstSegment] = [];
270
+ grouped[firstSegment].push(url);
271
+ }
272
+ } catch {
273
+ // Skip invalid URLs
274
+ }
275
+ }
276
+
277
+ // Sort URLs within each group
278
+ for (const path in grouped) {
279
+ grouped[path].sort();
280
+ }
281
+
282
+ return grouped;
283
+ }
284
+
285
+ generateSiteMap(urls) {
286
+ const siteMap = {
287
+ root: [],
288
+ sections: {},
289
+ depth_levels: {}
290
+ };
291
+
292
+ for (const url of urls) {
293
+ try {
294
+ const urlObj = new URL(url);
295
+ const pathSegments = urlObj.pathname.split('/').filter(s => s);
296
+ const depth = pathSegments.length;
297
+
298
+ // Add to depth levels
299
+ if (!siteMap.depth_levels[depth]) {
300
+ siteMap.depth_levels[depth] = [];
301
+ }
302
+ siteMap.depth_levels[depth].push(url);
303
+
304
+ // Add to sections
305
+ if (depth === 0) {
306
+ siteMap.root.push(url);
307
+ } else {
308
+ const section = pathSegments[0];
309
+ if (!siteMap.sections[section]) {
310
+ siteMap.sections[section] = {
311
+ urls: [],
312
+ subsections: {}
313
+ };
314
+ }
315
+ siteMap.sections[section].urls.push(url);
316
+
317
+ // Add subsections
318
+ if (depth > 1) {
319
+ const subsection = pathSegments[1];
320
+ if (!siteMap.sections[section].subsections[subsection]) {
321
+ siteMap.sections[section].subsections[subsection] = [];
322
+ }
323
+ siteMap.sections[section].subsections[subsection].push(url);
324
+ }
325
+ }
326
+ } catch {
327
+ // Skip invalid URLs
328
+ }
329
+ }
330
+
331
+ return siteMap;
332
+ }
333
+
334
+ generateStatistics(urls) {
335
+ const stats = {
336
+ total_urls: urls.length,
337
+ unique_paths: new Set(),
338
+ file_extensions: {},
339
+ query_parameters: 0,
340
+ secure_urls: 0,
341
+ max_depth: 0,
342
+ average_depth: 0,
343
+ url_lengths: {
344
+ min: Infinity,
345
+ max: 0,
346
+ average: 0
347
+ }
348
+ };
349
+
350
+ let totalDepth = 0;
351
+ let totalLength = 0;
352
+
353
+ for (const url of urls) {
354
+ try {
355
+ const urlObj = new URL(url);
356
+
357
+ // Count secure URLs
358
+ if (urlObj.protocol === 'https:') {
359
+ stats.secure_urls++;
360
+ }
361
+
362
+ // Count query parameters
363
+ if (urlObj.search) {
364
+ stats.query_parameters++;
365
+ }
366
+
367
+ // Track unique paths
368
+ stats.unique_paths.add(urlObj.pathname);
369
+
370
+ // Calculate depth
371
+ const depth = urlObj.pathname.split('/').filter(s => s).length;
372
+ totalDepth += depth;
373
+ stats.max_depth = Math.max(stats.max_depth, depth);
374
+
375
+ // Track URL lengths
376
+ const length = url.length;
377
+ totalLength += length;
378
+ stats.url_lengths.min = Math.min(stats.url_lengths.min, length);
379
+ stats.url_lengths.max = Math.max(stats.url_lengths.max, length);
380
+
381
+ // Track file extensions
382
+ const match = urlObj.pathname.match(/\.([a-z0-9]+)$/i);
383
+ if (match) {
384
+ const ext = match[1].toLowerCase();
385
+ stats.file_extensions[ext] = (stats.file_extensions[ext] || 0) + 1;
386
+ }
387
+ } catch {
388
+ // Skip invalid URLs
389
+ }
390
+ }
391
+
392
+ stats.unique_paths = stats.unique_paths.size;
393
+ stats.average_depth = urls.length > 0 ? totalDepth / urls.length : 0;
394
+ stats.url_lengths.average = urls.length > 0 ? totalLength / urls.length : 0;
395
+
396
+ return stats;
397
+ }
398
+ }
399
+
400
+ export default MapSiteTool;