crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { load } from 'cheerio';
|
|
3
|
+
import { DomainFilter } from '../../utils/domainFilter.js';
|
|
4
|
+
import { normalizeUrl, getBaseUrl } from '../../utils/urlNormalizer.js';
|
|
5
|
+
|
|
6
|
+
// Builds a fresh "optional list of strings, defaulting to []" field schema.
// A factory is used so every field keeps its own schema instance.
const optionalStringList = () => z.array(z.string()).optional().default([]);

// Inline domain-filter options: domains to whitelist/blacklist plus
// include/exclude patterns. Every list defaults to an empty array.
const DomainFilterOptionsSchema = z.object({
  whitelist: optionalStringList(),
  blacklist: optionalStringList(),
  include_patterns: optionalStringList(),
  exclude_patterns: optionalStringList()
});

/**
 * Input contract for MapSiteTool#execute.
 *
 * - url: page to map; must parse as a URL.
 * - include_sitemap: probe the site's sitemap files first (default true).
 * - max_urls: cap on returned URLs, 1..10000 (default 1000).
 * - group_by_path: group the result by first path segment (default true).
 * - include_metadata: also fetch per-page metadata (default false).
 * - domain_filter: optional inline filter configuration.
 * - import_filter_config: optional JSON string of an exported filter config.
 */
const MapSiteSchema = z.object({
  url: z.string().url(),
  include_sitemap: z.boolean().optional().default(true),
  max_urls: z.number().min(1).max(10000).optional().default(1000),
  group_by_path: z.boolean().optional().default(true),
  include_metadata: z.boolean().optional().default(false),
  domain_filter: DomainFilterOptionsSchema.optional(),
  import_filter_config: z.string().optional()
});
|
|
21
|
+
|
|
22
|
+
// Predefined XML entities that can appear inside a sitemap <loc> value.
const SITEMAP_XML_ENTITIES = Object.freeze({
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'"
});

/**
 * Decode predefined and numeric XML character references in a single pass,
 * so already-decoded output is never decoded a second time.
 *
 * @param {string} text - Raw text taken from between XML tags.
 * @returns {string} The text with entity references replaced.
 */
function decodeSitemapEntities(text) {
  return text.replace(/&(amp|lt|gt|quot|apos|#\d+|#[xX][0-9a-fA-F]+);/g, (reference, name) => {
    if (name[0] !== '#') {
      return SITEMAP_XML_ENTITIES[name];
    }
    const isHex = name[1] === 'x' || name[1] === 'X';
    const codePoint = Number.parseInt(name.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws outside the Unicode range; keep the raw text instead.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : reference;
  });
}

/**
 * Split a URL's pathname into its non-empty segments.
 *
 * @param {string} url - Absolute URL string.
 * @returns {string[]} Path segments without empty entries.
 * @throws {TypeError} When `url` cannot be parsed by the URL constructor.
 */
function splitPathSegments(url) {
  return new URL(url).pathname.split('/').filter((segment) => segment);
}

/**
 * Discovers the URLs of a site from its sitemap files and from the links on
 * a starting page, then returns them together with a grouped site map and
 * summary statistics.
 */
export class MapSiteTool {
  /**
   * @param {object} [options]
   * @param {string} [options.userAgent='MCP-WebScraper/1.0'] - Sent as the User-Agent header.
   * @param {number} [options.timeout=10000] - Milliseconds before a request is aborted.
   */
  constructor(options = {}) {
    const {
      userAgent = 'MCP-WebScraper/1.0',
      timeout = 10000
    } = options;

    this.userAgent = userAgent;
    this.timeout = timeout;
  }

  /**
   * Map a site starting from `params.url`.
   *
   * @param {object} params - Raw input, validated against MapSiteSchema.
   * @returns {Promise<object>} Object with base_url, total_urls, urls,
   *   metadata, site_map, statistics, domain_filter_config and filter_stats.
   * @throws {Error} "Site mapping failed: ..." wrapping any validation,
   *   filter-configuration or processing error (original error kept as `cause`).
   */
  async execute(params) {
    try {
      const validated = MapSiteSchema.parse(params);
      const baseUrl = getBaseUrl(validated.url);
      const urls = new Set();
      const metadata = new Map();

      // Create domain filter if configuration provided.
      // An imported configuration takes precedence over the inline one.
      let domainFilter = null;
      if (validated.import_filter_config) {
        domainFilter = new DomainFilter();
        try {
          const importConfig = JSON.parse(validated.import_filter_config);
          domainFilter.importConfig(importConfig);
        } catch (error) {
          throw new Error(`Invalid filter configuration: ${error.message}`, { cause: error });
        }
      } else if (validated.domain_filter) {
        domainFilter = new DomainFilter({ allowSubdomains: true });

        for (const domain of validated.domain_filter.whitelist) {
          domainFilter.addWhitelistDomain(domain);
        }
        for (const domain of validated.domain_filter.blacklist) {
          domainFilter.addBlacklistDomain(domain);
        }
        for (const pattern of validated.domain_filter.include_patterns) {
          domainFilter.addPattern(pattern, 'include');
        }
        for (const pattern of validated.domain_filter.exclude_patterns) {
          domainFilter.addPattern(pattern, 'exclude');
        }
      }

      // Try to fetch sitemap first. These URLs are not capped here; the
      // slice below enforces max_urls on the combined set.
      if (validated.include_sitemap) {
        const sitemapUrls = await this.fetchSitemapUrls(baseUrl, domainFilter);
        sitemapUrls.forEach((url) => urls.add(normalizeUrl(url)));
      }

      // Fetch and parse the main page for additional URLs
      const pageUrls = await this.fetchPageUrls(validated.url, domainFilter);
      pageUrls.forEach((url) => {
        if (urls.size < validated.max_urls) {
          urls.add(normalizeUrl(url));
        }
      });

      // Convert to array and limit
      const urlArray = Array.from(urls).slice(0, validated.max_urls);

      // Metadata costs one request per page, so only the first 50 URLs are inspected.
      if (validated.include_metadata) {
        await this.fetchMetadata(urlArray.slice(0, 50), metadata);
      }

      const organized = validated.group_by_path
        ? this.groupByPath(urlArray)
        : urlArray;

      return {
        base_url: baseUrl,
        total_urls: urlArray.length,
        urls: organized,
        metadata: validated.include_metadata ? Object.fromEntries(metadata) : {},
        site_map: this.generateSiteMap(urlArray),
        statistics: this.generateStatistics(urlArray),
        domain_filter_config: domainFilter ? domainFilter.exportConfig() : null,
        filter_stats: domainFilter ? domainFilter.getStats() : null
      };
    } catch (error) {
      throw new Error(`Site mapping failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Probe the conventional sitemap locations under `baseUrl` and return the
   * URLs listed by the first one that yields any allowed URL.
   *
   * @param {string} baseUrl - Prefix the sitemap file names are appended to.
   * @param {object|null} [domainFilter=null] - Optional filter; a URL is kept
   *   when `domainFilter.isAllowed(url).allowed` is truthy.
   * @returns {Promise<string[]>} Allowed URLs (empty when nothing was found).
   */
  async fetchSitemapUrls(baseUrl, domainFilter = null) {
    const urls = new Set();
    const sitemapUrls = [
      `${baseUrl}/sitemap.xml`,
      `${baseUrl}/sitemap_index.xml`,
      `${baseUrl}/sitemap-index.xml`,
      `${baseUrl}/sitemaps.xml`
    ];

    for (const sitemapUrl of sitemapUrls) {
      try {
        const response = await this.fetchWithTimeout(sitemapUrl);
        if (!response.ok) {
          continue;
        }

        const xml = await response.text();
        for (const url of this.parseSitemap(xml)) {
          if (!domainFilter || domainFilter.isAllowed(url).allowed) {
            urls.add(url);
          }
        }

        // If we found a sitemap, don't try others
        if (urls.size > 0) break;
      } catch {
        // Best effort: a missing or unreachable candidate just means we try the next one.
      }
    }

    return Array.from(urls);
  }

  /**
   * Extract every `<loc>` value from sitemap XML.
   *
   * Values are entity-decoded (e.g. `&amp;` becomes `&`), CDATA-wrapped
   * values are accepted, and surrounding whitespace is trimmed. For a sitemap
   * index the nested sitemap locations are returned as-is; they are not
   * fetched here.
   *
   * @param {string} xml - Sitemap or sitemap-index document text.
   * @returns {string[]} Unique location strings in document order.
   */
  parseSitemap(xml) {
    const urls = new Set();
    // Group 1: CDATA payload (literal text); group 2: plain text needing entity decoding.
    const locPattern = /<loc>(?:\s*<!\[CDATA\[([^<]*?)\]\]>\s*|([^<]+))<\/loc>/g;

    for (const match of xml.matchAll(locPattern)) {
      const rawValue = match[1] !== undefined ? match[1] : decodeSitemapEntities(match[2]);
      const url = rawValue.trim();
      if (url) urls.add(url);
    }

    return Array.from(urls);
  }

  /**
   * Fetch a page and collect the links that share the origin of the page's base URL.
   *
   * @param {string} url - Page to fetch; relative hrefs are resolved against it.
   * @param {object|null} [domainFilter=null] - Optional filter; a URL is kept
   *   when `domainFilter.isAllowed(url).allowed` is truthy.
   * @returns {Promise<string[]>} Unique absolute URLs; empty on a non-OK
   *   response or on any fetch/parse failure.
   */
  async fetchPageUrls(url, domainFilter = null) {
    try {
      const response = await this.fetchWithTimeout(url);
      if (!response.ok) {
        return [];
      }

      const html = await response.text();
      const $ = load(html);
      const urls = new Set();
      // Parsed once here instead of once per link.
      const siteOrigin = new URL(getBaseUrl(url)).origin;

      $('a[href]').each((_, element) => {
        const href = $(element).attr('href');
        if (!href || href.startsWith('#') || href.startsWith('javascript:')) {
          return;
        }

        try {
          const absoluteUrl = new URL(href, url);
          // Only include URLs from the same domain
          if (absoluteUrl.origin !== siteOrigin) {
            return;
          }

          const urlString = absoluteUrl.toString();
          if (!domainFilter || domainFilter.isAllowed(urlString).allowed) {
            urls.add(urlString);
          }
        } catch {
          // Invalid URL, skip
        }
      });

      return Array.from(urls);
    } catch {
      // Best effort: an unreachable start page contributes no URLs.
      return [];
    }
  }

  /**
   * Fetch each URL and record title, description, keywords, first h1 and
   * canonical link in `metadataMap`, keyed by URL.
   *
   * All URLs passed in are processed, at most `concurrency` requests at a time.
   * Failed or non-OK requests are skipped without an entry.
   *
   * @param {string[]} urls - Pages to inspect.
   * @param {Map<string, object>} metadataMap - Populated in place.
   * @param {number} [concurrency=10] - Maximum simultaneous requests.
   * @returns {Promise<void>}
   */
  async fetchMetadata(urls, metadataMap, concurrency = 10) {
    const batchSize = Math.max(1, Math.floor(concurrency) || 1);

    for (let start = 0; start < urls.length; start += batchSize) {
      const batch = urls.slice(start, start + batchSize);

      await Promise.allSettled(batch.map(async (url) => {
        try {
          const response = await this.fetchWithTimeout(url);
          if (!response.ok) {
            return;
          }

          const html = await response.text();
          const $ = load(html);

          metadataMap.set(url, {
            title: $('title').text().trim(),
            description: $('meta[name="description"]').attr('content') || '',
            keywords: $('meta[name="keywords"]').attr('content') || '',
            h1: $('h1').first().text().trim(),
            canonical: $('link[rel="canonical"]').attr('href') || ''
          });
        } catch {
          // Skip metadata for failed URLs
        }
      }));
    }
  }

  /**
   * `fetch` with this tool's User-Agent, aborted after `this.timeout` ms.
   *
   * The timer is cleared as soon as `fetch` settles, so it does not cover
   * reading the response body afterwards.
   *
   * @param {string} url - Resource to request.
   * @returns {Promise<Response>} The fetch response.
   * @throws Whatever `fetch` rejects with, including the abort error on timeout.
   */
  async fetchWithTimeout(url) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.timeout);

    try {
      return await fetch(url, {
        signal: controller.signal,
        headers: {
          'User-Agent': this.userAgent
        }
      });
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Group URLs by their first path segment.
   *
   * @param {string[]} urls - Absolute URLs; unparseable entries are skipped.
   * @returns {Object<string, string[]>} Keys are '/' for root-level URLs or
   *   '/<segment>'; each list is sorted with the default string ordering.
   */
  groupByPath(urls) {
    const groups = new Map();

    for (const url of urls) {
      let segments;
      try {
        segments = splitPathSegments(url);
      } catch {
        // Skip invalid URLs
        continue;
      }

      const key = segments.length === 0 ? '/' : `/${segments[0]}`;
      const bucket = groups.get(key);
      if (bucket) {
        bucket.push(url);
      } else {
        groups.set(key, [url]);
      }
    }

    // Sort URLs within each group
    for (const bucket of groups.values()) {
      bucket.sort();
    }

    return Object.fromEntries(groups);
  }

  /**
   * Build a hierarchical view of the URLs.
   *
   * Buckets are collected in Maps and converted with Object.fromEntries, so
   * path segments named like Object.prototype members ("constructor",
   * "__proto__", ...) become ordinary keys instead of being dropped.
   *
   * @param {string[]} urls - Absolute URLs; unparseable entries are skipped.
   * @returns {{root: string[], sections: Object, depth_levels: Object}}
   *   `root` holds URLs with no path segments; `sections[first]` holds
   *   `{ urls, subsections }` where `subsections[second]` lists URLs at
   *   depth > 1; `depth_levels[n]` lists URLs with n path segments.
   */
  generateSiteMap(urls) {
    const root = [];
    const depthLevels = {};
    const sections = new Map();

    for (const url of urls) {
      let segments;
      try {
        segments = splitPathSegments(url);
      } catch {
        // Skip invalid URLs
        continue;
      }

      const depth = segments.length;

      // Depth keys are numeric, so a plain object is safe here.
      if (!depthLevels[depth]) {
        depthLevels[depth] = [];
      }
      depthLevels[depth].push(url);

      if (depth === 0) {
        root.push(url);
        continue;
      }

      const [sectionName, subsectionName] = segments;
      let section = sections.get(sectionName);
      if (!section) {
        section = { urls: [], subsections: new Map() };
        sections.set(sectionName, section);
      }
      section.urls.push(url);

      if (depth > 1) {
        const subsectionUrls = section.subsections.get(subsectionName);
        if (subsectionUrls) {
          subsectionUrls.push(url);
        } else {
          section.subsections.set(subsectionName, [url]);
        }
      }
    }

    return {
      root,
      sections: Object.fromEntries(
        Array.from(sections, ([name, section]) => [
          name,
          {
            urls: section.urls,
            subsections: Object.fromEntries(section.subsections)
          }
        ])
      ),
      depth_levels: depthLevels
    };
  }

  /**
   * Summarise a URL list.
   *
   * @param {string[]} urls - Absolute URLs; unparseable entries are ignored
   *   by every figure except `total_urls`.
   * @returns {object} total_urls, unique_paths (count of distinct pathnames),
   *   file_extensions (lower-cased extension -> count), query_parameters
   *   (URLs having a query string), secure_urls (https URLs), max_depth,
   *   average_depth and url_lengths {min, max, average}. Averages are taken
   *   over the parseable URLs; all figures are 0 when there are none.
   */
  generateStatistics(urls) {
    const uniquePaths = new Set();
    // A Map avoids clashes with inherited keys (e.g. a ".constructor" extension).
    const extensionCounts = new Map();
    let secureUrls = 0;
    let urlsWithQuery = 0;
    let maxDepth = 0;
    let minLength = Infinity;
    let maxLength = 0;
    let totalDepth = 0;
    let totalLength = 0;
    let parsedCount = 0;

    for (const url of urls) {
      let urlObj;
      try {
        urlObj = new URL(url);
      } catch {
        // Skip invalid URLs
        continue;
      }
      parsedCount++;

      if (urlObj.protocol === 'https:') {
        secureUrls++;
      }
      if (urlObj.search) {
        urlsWithQuery++;
      }
      uniquePaths.add(urlObj.pathname);

      const depth = urlObj.pathname.split('/').filter((segment) => segment).length;
      totalDepth += depth;
      maxDepth = Math.max(maxDepth, depth);

      const length = url.length;
      totalLength += length;
      minLength = Math.min(minLength, length);
      maxLength = Math.max(maxLength, length);

      const match = urlObj.pathname.match(/\.([a-z0-9]+)$/i);
      if (match) {
        const ext = match[1].toLowerCase();
        extensionCounts.set(ext, (extensionCounts.get(ext) || 0) + 1);
      }
    }

    return {
      total_urls: urls.length,
      unique_paths: uniquePaths.size,
      file_extensions: Object.fromEntries(extensionCounts),
      query_parameters: urlsWithQuery,
      secure_urls: secureUrls,
      max_depth: maxDepth,
      average_depth: parsedCount > 0 ? totalDepth / parsedCount : 0,
      url_lengths: {
        // Infinity would serialise to null in JSON; report 0 when nothing was measured.
        min: parsedCount > 0 ? minLength : 0,
        max: maxLength,
        average: parsedCount > 0 ? totalLength / parsedCount : 0
      }
    };
  }
}

export default MapSiteTool;
|