crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,845 @@
|
|
|
1
|
+
import { load } from 'cheerio';
|
|
2
|
+
import { QueueManager } from '../queue/QueueManager.js';
|
|
3
|
+
import { CacheManager } from '../cache/CacheManager.js';
|
|
4
|
+
import { RateLimiter } from '../../utils/rateLimiter.js';
|
|
5
|
+
import { RobotsChecker } from '../../utils/robotsChecker.js';
|
|
6
|
+
import { DomainFilter } from '../../utils/domainFilter.js';
|
|
7
|
+
import { LinkAnalyzer } from '../analysis/LinkAnalyzer.js';
|
|
8
|
+
import { normalizeUrl, extractLinks, isValidUrl } from '../../utils/urlNormalizer.js';
|
|
9
|
+
|
|
10
|
+
// Requests per second used when a domain rule does not specify its own rate limit.
const DEFAULT_REQUESTS_PER_SECOND = 10;

/**
 * Queue-driven website crawler.
 *
 * Starting from one URL, pages are fetched, parsed with cheerio, and their
 * links are scheduled on a QueueManager until `maxDepth` or `maxPages` is
 * reached. Every candidate URL is checked against the domain filter, the
 * legacy include/exclude patterns and (optionally) robots.txt. When link
 * analysis is enabled, every discovered link is also recorded in a
 * LinkAnalyzer so that graph statistics can be produced after the crawl.
 */
export class BFSCrawler {
  /**
   * @param {Object} [options]
   * @param {number} [options.maxDepth=5] - Deepest link level that is processed.
   * @param {number} [options.maxPages=100] - Maximum number of visited URLs.
   * @param {boolean} [options.followExternal=false] - Follow links whose origin differs from the start URL.
   * @param {boolean} [options.respectRobots=true] - Consult a RobotsChecker before fetching.
   * @param {string} [options.userAgent='MCP-WebScraper/1.0'] - User-Agent header value.
   * @param {number} [options.timeout=30000] - Default request timeout in milliseconds.
   * @param {number} [options.concurrency=10] - Concurrency handed to the QueueManager.
   * @param {DomainFilter|null} [options.domainFilter=null] - Existing filter; a new one is created when omitted.
   * @param {boolean} [options.enableLinkAnalysis=true] - Record links in a LinkAnalyzer.
   * @param {Object} [options.linkAnalyzerOptions={}] - Options forwarded to the LinkAnalyzer constructor.
   */
  constructor(options = {}) {
    const {
      maxDepth = 5,
      maxPages = 100,
      followExternal = false,
      respectRobots = true,
      userAgent = 'MCP-WebScraper/1.0',
      timeout = 30000,
      concurrency = 10,
      domainFilter = null,
      enableLinkAnalysis = true,
      linkAnalyzerOptions = {}
    } = options;

    this.maxDepth = maxDepth;
    this.maxPages = maxPages;
    this.followExternal = followExternal;
    this.respectRobots = respectRobots;
    this.userAgent = userAgent;
    this.timeout = timeout;

    this.visited = new Set();
    this.results = [];
    this.errors = [];
    this.filterDecisions = []; // Every domain-filter verdict, kept for the statistics

    // Link analysis
    this.enableLinkAnalysis = enableLinkAnalysis;
    this.linkAnalyzer = enableLinkAnalysis ? new LinkAnalyzer(linkAnalyzerOptions) : null;
    // Parsed form of the page most recently handed to extractLinkMetadata().
    this.linkDocumentCache = null;

    this.queue = new QueueManager({ concurrency, timeout });
    this.cache = new CacheManager({ ttl: 3600000 }); // 1 hour cache
    this.rateLimiter = new RateLimiter({ requestsPerSecond: DEFAULT_REQUESTS_PER_SECOND });
    // One limiter per distinct rate, so switching domains never discards limiter state.
    this.rateLimitersByRate = new Map([[DEFAULT_REQUESTS_PER_SECOND, this.rateLimiter]]);
    this.robotsChecker = respectRobots ? new RobotsChecker(userAgent) : null;

    // Use the supplied domain filter, or build a default one.
    this.domainFilter = domainFilter || new DomainFilter({
      allowSubdomains: !followExternal, // If not following external, allow subdomains by default
      defaultMaxDepth: maxDepth,
      defaultRateLimit: DEFAULT_REQUESTS_PER_SECOND
    });
  }

  /**
   * Crawl a site starting at `startUrl`.
   *
   * Only `filterDecisions` is reset here; `visited`, `results` and `errors`
   * keep whatever earlier calls on the same instance accumulated.
   *
   * @param {string} startUrl - URL the crawl starts from.
   * @param {Object} [options]
   * @param {string[]} [options.includePatterns=[]] - Regex sources a URL must match (legacy).
   * @param {string[]} [options.excludePatterns=[]] - Regex sources that exclude a URL (legacy).
   * @param {boolean} [options.extractContent=true] - Attach page text and metadata to each result.
   * @param {Object|null} [options.domainFilterConfig=null] - `{ whitelist, blacklist }` objects mapping domain to options.
   * @returns {Promise<Object>} `{ urls, results, errors, stats, linkAnalysis }`.
   * @throws {Error} When the start URL is rejected by the domain filter.
   */
  async crawl(startUrl, options = {}) {
    const {
      includePatterns = [],
      excludePatterns = [],
      extractContent = true,
      domainFilterConfig = null
    } = options;

    // Backward compatibility: the legacy patterns are still compiled and checked directly.
    this.includePatterns = includePatterns.map((source) => new RegExp(source));
    this.excludePatterns = excludePatterns.map((source) => new RegExp(source));

    // The same patterns are also registered with the domain filter for unified processing.
    for (const pattern of includePatterns) {
      this.domainFilter.addPattern(pattern, 'include', { description: 'Legacy include pattern' });
    }
    for (const pattern of excludePatterns) {
      this.domainFilter.addPattern(pattern, 'exclude', { description: 'Legacy exclude pattern' });
    }

    // Apply additional domain filter configuration if provided
    if (domainFilterConfig) {
      if (domainFilterConfig.whitelist) {
        for (const [domain, domainOptions] of Object.entries(domainFilterConfig.whitelist)) {
          this.domainFilter.addWhitelistDomain(domain, domainOptions);
        }
      }
      if (domainFilterConfig.blacklist) {
        for (const [domain, domainOptions] of Object.entries(domainFilterConfig.blacklist)) {
          this.domainFilter.addBlacklistDomain(domain, domainOptions);
        }
      }
    }

    this.extractContent = extractContent;
    this.filterDecisions = []; // Reset filter decisions

    const normalizedStart = normalizeUrl(startUrl);
    this.baseUrl = new URL(normalizedStart);

    // Refuse to start when the entry point itself is filtered out.
    const startUrlDecision = this.domainFilter.isAllowed(normalizedStart);
    if (!startUrlDecision.allowed) {
      throw new Error(`Start URL blocked by domain filter: ${startUrlDecision.reason}`);
    }

    // Seed the queue, then wait until it reports idle.
    await this.queue.add(() => this.processUrl(normalizedStart, 0));
    await this.queue.onIdle();

    let linkAnalysisResults = null;
    if (this.enableLinkAnalysis && this.linkAnalyzer) {
      linkAnalysisResults = this.performLinkAnalysis();
    }

    // The parsed document kept for link metadata is no longer needed.
    this.linkDocumentCache = null;

    return {
      urls: Array.from(this.visited),
      results: this.results,
      errors: this.errors,
      stats: this.getStats(),
      linkAnalysis: linkAnalysisResults
    };
  }

  /**
   * Process one URL: apply limits and filters, fetch (or read from cache),
   * record the result and schedule the page's links one level deeper.
   * Fetch and processing failures are appended to `this.errors`, not thrown.
   *
   * @param {string} url - URL to process.
   * @param {number} depth - Link depth of `url` relative to the start URL.
   * @returns {Promise<void>}
   */
  async processUrl(url, depth) {
    if (depth > this.maxDepth || this.visited.size >= this.maxPages) {
      return;
    }

    const normalizedUrl = normalizeUrl(url);
    if (this.visited.has(normalizedUrl)) {
      return;
    }

    // Check domain filter (replaces old pattern checking)
    const filterDecision = this.domainFilter.isAllowed(normalizedUrl);
    this.filterDecisions.push({
      url: normalizedUrl,
      decision: filterDecision,
      timestamp: new Date().toISOString()
    });

    if (!filterDecision.allowed) {
      console.log(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
      return;
    }

    // Backward compatibility: also check legacy patterns
    if (!this.shouldCrawlUrl(normalizedUrl)) {
      console.log(`Legacy pattern blocks: ${normalizedUrl}`);
      return;
    }

    if (this.respectRobots && this.robotsChecker) {
      const canFetch = await this.robotsChecker.canFetch(normalizedUrl);
      if (!canFetch) {
        console.log(`Robots.txt blocks: ${normalizedUrl}`);
        return;
      }

      // Other tasks may have run during the await above: re-check so the same
      // URL is not fetched twice and maxPages is not exceeded.
      if (this.visited.has(normalizedUrl) || this.visited.size >= this.maxPages) {
        return;
      }
    }

    this.visited.add(normalizedUrl);

    try {
      const cacheKey = this.cache.generateKey(normalizedUrl);
      let pageData = await this.cache.get(cacheKey);

      if (!pageData) {
        // Use the domain-specific rate limit if one is configured.
        const { hostname } = new URL(normalizedUrl);
        const domainRules = this.domainFilter.getDomainRules(hostname);
        const effectiveRateLimit = domainRules.rateLimit || DEFAULT_REQUESTS_PER_SECOND;

        await this.selectRateLimiter(effectiveRateLimit).checkLimit(normalizedUrl);

        pageData = await this.fetchPage(normalizedUrl);
        await this.cache.set(cacheKey, pageData);
      }

      // Record every resolvable link in the link graph.
      if (this.enableLinkAnalysis && this.linkAnalyzer && pageData.links) {
        for (const link of pageData.links) {
          const absoluteUrl = this.resolveUrl(link, normalizedUrl);
          if (absoluteUrl) {
            const linkMetadata = this.extractLinkMetadata(link, pageData.originalHtml, normalizedUrl);
            this.linkAnalyzer.addLink(normalizedUrl, absoluteUrl, linkMetadata);
          }
        }
      }

      const result = {
        url: normalizedUrl,
        depth,
        title: pageData.title,
        contentLength: pageData.content?.length || 0,
        links: pageData.links?.length || 0,
        timestamp: new Date().toISOString()
      };

      if (this.extractContent) {
        result.content = pageData.content;
        result.metadata = pageData.metadata;
      }

      this.results.push(result);

      // Add discovered links to queue (if not at max depth)
      if (depth < this.maxDepth && pageData.links) {
        for (const link of pageData.links) {
          if (this.visited.size >= this.maxPages) break;

          const absoluteUrl = this.resolveUrl(link, normalizedUrl);
          if (absoluteUrl && !this.visited.has(absoluteUrl)) {
            // NOTE(review): if QueueManager.add() only resolves once the task has
            // finished, this await processes children one at a time while the
            // parent keeps its queue slot - confirm against QueueManager.
            await this.queue.add(() => this.processUrl(absoluteUrl, depth + 1));
          }
        }
      }
    } catch (error) {
      this.errors.push({
        url: normalizedUrl,
        depth,
        error: error.message,
        timestamp: new Date().toISOString()
      });
    }
  }

  /**
   * Return the rate limiter for a requests-per-second value, creating it on
   * first use, and make it the current `this.rateLimiter` (the one reported by
   * getStats()). Limiters are reused so their state is not thrown away each
   * time consecutive requests target domains with different limits.
   *
   * @private
   * @param {number} requestsPerSecond - Rate the limiter is created with.
   * @returns {RateLimiter} Limiter dedicated to that rate.
   */
  selectRateLimiter(requestsPerSecond) {
    if (!this.rateLimitersByRate) {
      this.rateLimitersByRate = new Map();
    }

    let limiter = this.rateLimitersByRate.get(requestsPerSecond);
    if (!limiter) {
      limiter = new RateLimiter({ requestsPerSecond });
      this.rateLimitersByRate.set(requestsPerSecond, limiter);
    }

    this.rateLimiter = limiter;
    return limiter;
  }

  /**
   * Fetch a URL and parse it as HTML.
   *
   * The request is aborted when no response has arrived within the domain's
   * timeout (or `this.timeout` when the domain rules define none). As before,
   * the timer only guards the wait for the response, not reading the body.
   *
   * @param {string} url - Absolute URL to fetch.
   * @returns {Promise<Object>} Parsed page as returned by parsePage().
   * @throws {Error} On network failure or abort, a non-OK status, or a
   *   content type that does not contain `text/html`.
   */
  async fetchPage(url) {
    const { hostname } = new URL(url);
    const domainRules = this.domainFilter.getDomainRules(hostname);

    // Domain-specific headers override the defaults.
    const headers = {
      'User-Agent': this.userAgent,
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
      'Accept-Encoding': 'gzip, deflate',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      ...domainRules.customHeaders
    };
    const effectiveTimeout = domainRules.timeout || this.timeout;

    // Exactly one timer is armed, and its id is always the one that is cleared;
    // an uncleared timer would fire abort() later and keep the process alive.
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), effectiveTimeout);

    let response;
    try {
      response = await fetch(url, {
        signal: controller.signal,
        headers
      });
    } finally {
      clearTimeout(timeoutId);
    }

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const contentType = response.headers.get('content-type');
    if (!contentType || !contentType.includes('text/html')) {
      throw new Error(`Non-HTML content type: ${contentType}`);
    }

    const html = await response.text();
    return this.parsePage(html, url);
  }

  /**
   * Extract title, visible text, metadata and links from an HTML document.
   *
   * @param {string} html - Raw HTML.
   * @param {string} url - URL the HTML was fetched from (not used by the parser).
   * @returns {Object} `{ title, content, metadata, links, originalHtml }`;
   *   `links` holds the de-duplicated raw `href` values.
   */
  parsePage(html, url) {
    const $ = load(html);
    const metaContent = (selector) => $(selector).attr('content') || '';

    // <title> wins; the first <h1> is the fallback.
    const title = $('title').text().trim() || $('h1').first().text().trim() || '';

    // Drop non-visible elements before collecting the body text.
    $('script, style, noscript').remove();
    const content = $('body').text().replace(/\s+/g, ' ').trim();

    const metadata = {
      description: metaContent('meta[name="description"]'),
      keywords: metaContent('meta[name="keywords"]'),
      author: metaContent('meta[name="author"]'),
      ogTitle: metaContent('meta[property="og:title"]'),
      ogDescription: metaContent('meta[property="og:description"]')
    };

    // Fragment-only and javascript: hrefs are not links to other pages.
    const links = [];
    $('a[href]').each((_, element) => {
      const href = $(element).attr('href');
      if (href && !href.startsWith('#') && !href.startsWith('javascript:')) {
        links.push(href);
      }
    });

    return {
      title,
      content,
      metadata,
      links: [...new Set(links)], // Remove duplicates
      originalHtml: html // Store original HTML for link analysis
    };
  }

  /**
   * Turn an `href` into a normalized absolute URL the crawler may visit.
   *
   * @param {string} link - Raw href (absolute or relative).
   * @param {string} baseUrl - URL of the page containing the link.
   * @returns {string|null} Normalized URL, or null when the link cannot be
   *   parsed, is not HTTP(S), or leaves the start origin while
   *   `followExternal` is false.
   */
  resolveUrl(link, baseUrl) {
    try {
      const isAbsoluteHttp = link.startsWith('http://') || link.startsWith('https://');
      const target = isAbsoluteHttp ? new URL(link) : new URL(link, baseUrl);

      // mailto:, tel:, data:, ftp: ... can never be fetched as pages.
      if (target.protocol !== 'http:' && target.protocol !== 'https:') {
        return null;
      }

      if (!this.followExternal && target.origin !== this.baseUrl.origin) {
        return null;
      }

      return normalizeUrl(isAbsoluteHttp ? link : target.toString());
    } catch {
      // Unparseable link (or a failure while normalizing): treat as not crawlable.
      return null;
    }
  }

  /**
   * Apply the legacy include/exclude patterns compiled by crawl().
   *
   * @param {string} url - URL to test.
   * @returns {boolean} True when the URL matches at least one include pattern
   *   (or none are set) and matches no exclude pattern.
   */
  shouldCrawlUrl(url) {
    const matchesAny = (patterns) => patterns.some((pattern) => pattern.test(url));

    if (this.includePatterns.length > 0 && !matchesAny(this.includePatterns)) {
      return false;
    }

    return !matchesAny(this.excludePatterns);
  }

  /**
   * Collect crawl counters together with the statistics reported by the
   * cache, queue, current rate limiter and domain filter.
   *
   * @returns {Object} Statistics snapshot.
   */
  getStats() {
    const domainFilterStats = this.domainFilter.getStats();
    const filterDecisions = this.getFilterDecisionStats();

    return {
      visited: this.visited.size,
      results: this.results.length,
      errors: this.errors.length,
      cacheStats: this.cache.getStats(),
      queueStats: this.queue.getStats(),
      rateLimitStats: this.rateLimiter.getStats(),
      domainFilterStats,
      filterDecisions
    };
  }

  /**
   * Summarize the recorded domain-filter decisions.
   *
   * @returns {Object} `{ total, allowed, blocked, allowedPercentage, blockedReasons }`.
   *   `allowedPercentage` is a two-decimal string, or the number 0 when
   *   nothing has been recorded (kept for compatibility).
   */
  getFilterDecisionStats() {
    const total = this.filterDecisions.length;
    const blockedReasons = {};
    let allowed = 0;

    for (const { decision } of this.filterDecisions) {
      if (decision.allowed) {
        allowed += 1;
      } else {
        blockedReasons[decision.reason] = (blockedReasons[decision.reason] || 0) + 1;
      }
    }

    return {
      total,
      allowed,
      blocked: total - allowed,
      allowedPercentage: total > 0 ? (allowed / total * 100).toFixed(2) : 0,
      blockedReasons
    };
  }

  /** Pause the work queue. */
  pause() {
    this.queue.pause();
  }

  /** Start the work queue again after pause(). */
  resume() {
    this.queue.start();
  }

  /** Clear the work queue, then pause it. */
  stop() {
    this.queue.clear();
    this.queue.pause();
  }

  /**
   * Get the domain filter instance
   * @returns {DomainFilter} Current domain filter
   */
  getDomainFilter() {
    return this.domainFilter;
  }

  /**
   * Set a new domain filter instance and discard the recorded decisions.
   * @param {DomainFilter} domainFilter - New domain filter to use
   * @returns {BFSCrawler} This crawler, for chaining.
   * @throws {Error} When the argument is not a DomainFilter.
   */
  setDomainFilter(domainFilter) {
    if (!(domainFilter instanceof DomainFilter)) {
      throw new Error('Invalid domain filter: must be instance of DomainFilter');
    }
    this.domainFilter = domainFilter;
    this.filterDecisions = []; // Reset filter decisions
    return this;
  }

  /**
   * Configure domain filter with simple options.
   *
   * List entries are either plain strings or `{ domain | pattern, options }`
   * objects; anything else (including null) is skipped.
   *
   * @param {Object} [config] - Configuration object
   * @param {Array<string|Object>} [config.whitelist=[]] - Domains to whitelist.
   * @param {Array<string|Object>} [config.blacklist=[]] - Domains to blacklist.
   * @param {Array<string|Object>} [config.includePatterns=[]] - Include patterns.
   * @param {Array<string|Object>} [config.excludePatterns=[]] - Exclude patterns.
   * @param {Object} [config.domainRules={}] - Map of domain to rules object.
   * @returns {BFSCrawler} This crawler, for chaining.
   */
  configureDomainFilter(config = {}) {
    const {
      whitelist = [],
      blacklist = [],
      includePatterns = [],
      excludePatterns = [],
      domainRules = {}
    } = config;

    // Shared walker for the four lists. `entry &&` guards against null, for
    // which `typeof` also reports 'object'.
    const registerEach = (entries, key, register) => {
      for (const entry of entries) {
        if (typeof entry === 'string') {
          register(entry);
        } else if (entry && typeof entry === 'object' && entry[key]) {
          register(entry[key], entry.options || {});
        }
      }
    };

    registerEach(whitelist, 'domain',
      (...args) => this.domainFilter.addWhitelistDomain(...args));
    registerEach(blacklist, 'domain',
      (...args) => this.domainFilter.addBlacklistDomain(...args));
    registerEach(includePatterns, 'pattern',
      (pattern, ...rest) => this.domainFilter.addPattern(pattern, 'include', ...rest));
    registerEach(excludePatterns, 'pattern',
      (pattern, ...rest) => this.domainFilter.addPattern(pattern, 'exclude', ...rest));

    for (const [domain, rules] of Object.entries(domainRules)) {
      this.domainFilter.setDomainRules(domain, rules);
    }

    return this;
  }

  /**
   * Parse `html` for link-metadata lookups, reusing the previous parse when
   * the same document is requested again. processUrl() asks for the metadata
   * of every link on a page in turn, so without this the whole document would
   * be re-parsed once per link.
   *
   * @private
   * @param {string} html - Original HTML of a page.
   * @returns {{html: string, anchorsByHref: Map, bodyText: string}} The first
   *   anchor for each distinct href value, and the text of `<body>`.
   */
  getLinkDocument(html) {
    const cached = this.linkDocumentCache;
    if (cached && cached.html === html) {
      return cached;
    }

    const $ = load(html);
    const anchorsByHref = new Map();
    $('a[href]').each((_, element) => {
      const anchor = $(element);
      const value = anchor.attr('href');
      // Keep the first anchor in document order for each href.
      if (!anchorsByHref.has(value)) {
        anchorsByHref.set(value, anchor);
      }
    });

    const parsed = { html, anchorsByHref, bodyText: $('body').text() };
    this.linkDocumentCache = parsed;
    return parsed;
  }

  /**
   * Extract link metadata from HTML
   * @param {string} href - The href attribute value
   * @param {string} html - Original HTML content
   * @param {string} baseUrl - Base URL for context (currently unused)
   * @returns {Object} `{}` without HTML; `{ href }` when no anchor carries that
   *   href; `{ href, error }` when parsing fails; otherwise the anchor text,
   *   title, rel, class, surrounding text and extraction timestamp.
   */
  extractLinkMetadata(href, html, baseUrl) {
    if (!html) return {};

    try {
      const { anchorsByHref, bodyText } = this.getLinkDocument(html);

      // Looked up by exact attribute value rather than by building a CSS
      // selector from the href, which broke on quotes and backslashes.
      const linkElement = anchorsByHref.get(href);
      if (!linkElement) {
        return { href };
      }

      const anchorText = linkElement.text().trim();
      const title = linkElement.attr('title');
      const rel = linkElement.attr('rel');
      const className = linkElement.attr('class');

      // Surrounding context: up to 100 characters on either side of the first
      // occurrence of the anchor text in the body text.
      let context = '';
      const linkTextIndex = bodyText.indexOf(anchorText);
      if (linkTextIndex >= 0 && anchorText) {
        const start = Math.max(0, linkTextIndex - 100);
        const end = Math.min(bodyText.length, linkTextIndex + anchorText.length + 100);
        context = bodyText.substring(start, end).trim();
      }

      return {
        href,
        anchorText,
        title,
        rel,
        className,
        context,
        extractedAt: new Date().toISOString()
      };
    } catch (error) {
      return { href, error: error.message };
    }
  }

  /**
   * Perform comprehensive link analysis
   * @returns {Object|null} Link analysis results; null when link analysis is
   *   disabled; `{ error, analysisTime, generatedAt }` when a step throws.
   */
  performLinkAnalysis() {
    if (!this.enableLinkAnalysis || !this.linkAnalyzer) {
      return null;
    }

    const startTime = Date.now();

    try {
      // Per-URL importance scores (the original comment calls this PageRank).
      const importance = this.linkAnalyzer.calculateImportance();
      const cycles = this.linkAnalyzer.detectCycles({ maxCycleLength: 8, includeMetadata: true });
      const statistics = this.linkAnalyzer.getStatistics();

      const hubsAndAuthorities = this.findHubsAndAuthorities(importance);
      const linkPatterns = this.analyzeLinkPatterns();
      const domainAnalysis = this.analyzeDomainLinking();

      const analysisTime = Date.now() - startTime;

      return {
        statistics,
        importance: this.formatImportanceResults(importance),
        cycles: cycles.map((cycle) => ({
          ...cycle,
          urls: cycle.nodes,
          cycleLength: cycle.length,
          strength: cycle.strength
        })),
        hubsAndAuthorities,
        linkPatterns,
        domainAnalysis,
        analysisTime,
        generatedAt: new Date().toISOString()
      };
    } catch (error) {
      return {
        error: error.message,
        analysisTime: Date.now() - startTime,
        generatedAt: new Date().toISOString()
      };
    }
  }

  /**
   * Format importance results for output
   * @param {Map<string, number>} importance - Importance score per URL.
   * @returns {Object} `{ topPages, totalPages, averageImportance, importanceRange }`;
   *   all numeric fields are 0 when the map is empty.
   */
  formatImportanceResults(importance) {
    const ranked = Array.from(importance.entries())
      .map(([url, score]) => ({ url, importance: score }))
      .sort((a, b) => b.importance - a.importance);

    const totalImportance = ranked.reduce((sum, item) => sum + item.importance, 0);

    return {
      topPages: ranked.slice(0, 20),
      totalPages: ranked.length,
      // Guard the empty case: 0 / 0 would report NaN.
      averageImportance: ranked.length > 0 ? totalImportance / ranked.length : 0,
      importanceRange: {
        min: ranked[ranked.length - 1]?.importance || 0,
        max: ranked[0]?.importance || 0
      }
    };
  }

  /**
   * Find hub and authority pages
   * @param {Map<string, number>} importance - Importance score per URL.
   * @returns {Object} `{ hubs, authorities }`: up to ten pages with at least
   *   10 outbound links, and up to ten with at least 5 inbound links, each
   *   sorted by link count, highest first.
   */
  findHubsAndAuthorities(importance) {
    const hubs = [];
    const authorities = [];

    for (const node of this.linkAnalyzer.nodes.keys()) {
      const outboundCount = this.linkAnalyzer.getOutboundLinks(node).length;
      const inboundCount = this.linkAnalyzer.getInboundLinks(node).length;
      const importanceScore = importance.get(node) || 0;

      // Hubs: pages with many outbound links
      if (outboundCount >= 10) {
        hubs.push({ url: node, outboundLinks: outboundCount, importance: importanceScore });
      }

      // Authorities: pages with many inbound links
      if (inboundCount >= 5) {
        authorities.push({ url: node, inboundLinks: inboundCount, importance: importanceScore });
      }
    }

    return {
      hubs: hubs.sort((a, b) => b.outboundLinks - a.outboundLinks).slice(0, 10),
      authorities: authorities.sort((a, b) => b.inboundLinks - a.inboundLinks).slice(0, 10)
    };
  }

  /**
   * Analyze link patterns
   * @returns {Object} `{ linkDistribution, topPathPatterns, topAnchorTexts }`
   *   computed from the link analyzer's `linkMetadata` entries.
   */
  analyzeLinkPatterns() {
    const distribution = { internal: 0, external: 0, sameDomain: 0, crossDomain: 0 };
    const pathPatternCounts = new Map();
    const anchorTextCounts = new Map();
    const increment = (counts, key) => counts.set(key, (counts.get(key) || 0) + 1);

    for (const [linkKey, linkData] of this.linkAnalyzer.linkMetadata) {
      // Keys are read as "from|to". NOTE(review): a URL that itself contains
      // '|' would be cut short here - confirm the key format in LinkAnalyzer.
      const [from, to] = linkKey.split('|');

      try {
        const fromUrl = new URL(from);
        const toUrl = new URL(to);

        // NOTE(review): internal/external is decided by the SOURCE page's
        // origin, not the target's - confirm this is the intended meaning.
        if (fromUrl.origin === this.baseUrl.origin) {
          distribution.internal++;
        } else {
          distribution.external++;
        }

        if (fromUrl.hostname === toUrl.hostname) {
          distribution.sameDomain++;
        } else {
          distribution.crossDomain++;
        }

        increment(pathPatternCounts, this.getPathPattern(toUrl.pathname));

        const anchorText = linkData.anchorText?.toLowerCase().trim();
        if (anchorText && anchorText.length > 0) {
          increment(anchorTextCounts, anchorText);
        }
      } catch {
        // Skip malformed URLs
      }
    }

    const byCountDescending = (a, b) => b[1] - a[1];

    return {
      linkDistribution: { ...distribution },
      topPathPatterns: Array.from(pathPatternCounts.entries())
        .sort(byCountDescending)
        .slice(0, 10)
        .map(([pattern, count]) => ({ pattern, count })),
      topAnchorTexts: Array.from(anchorTextCounts.entries())
        .sort(byCountDescending)
        .slice(0, 15)
        .map(([text, count]) => ({ text, count }))
    };
  }

  /**
   * Analyze domain-level linking
   * @returns {Object} `{ totalDomains, topDomains, domainConnectivity }`;
   *   `topDomains` lists up to 20 hostnames by total (outbound + inbound) links.
   */
  analyzeDomainLinking() {
    const domainStats = new Map();
    const statsFor = (domain) => {
      if (!domainStats.has(domain)) {
        domainStats.set(domain, { outbound: 0, inbound: 0, internal: 0, external: 0 });
      }
      return domainStats.get(domain);
    };

    for (const [linkKey] of this.linkAnalyzer.linkMetadata) {
      const [from, to] = linkKey.split('|');

      try {
        const fromDomain = new URL(from).hostname;
        const toDomain = new URL(to).hostname;

        // Register both hostnames (source first) before counting.
        const fromStats = statsFor(fromDomain);
        const toStats = statsFor(toDomain);

        fromStats.outbound++;
        toStats.inbound++;

        if (fromDomain === toDomain) {
          fromStats.internal++;
        } else {
          fromStats.external++;
        }
      } catch {
        // Skip malformed URLs
      }
    }

    const topDomains = Array.from(domainStats.entries())
      .map(([domain, stats]) => ({ domain, ...stats }))
      .sort((a, b) => (b.outbound + b.inbound) - (a.outbound + a.inbound))
      .slice(0, 20);

    return {
      totalDomains: domainStats.size,
      topDomains,
      domainConnectivity: this.calculateDomainConnectivity(domainStats)
    };
  }

  /**
   * Calculate domain connectivity metrics
   * @param {Map<string, Object>} domainStats - Per-hostname statistics; only
   *   the number of entries is used.
   * @returns {Object} `{ density, averageConnections, uniqueConnections, maxPossibleConnections }`,
   *   where a connection is an unordered pair of distinct hostnames joined by
   *   at least one link. All values are 0 for fewer than two domains.
   */
  calculateDomainConnectivity(domainStats) {
    const totalDomains = domainStats.size;

    if (totalDomains <= 1) {
      return { density: 0, averageConnections: 0, uniqueConnections: 0, maxPossibleConnections: 0 };
    }

    const connections = new Set();

    for (const [linkKey] of this.linkAnalyzer.linkMetadata) {
      const [from, to] = linkKey.split('|');

      try {
        const fromDomain = new URL(from).hostname;
        const toDomain = new URL(to).hostname;

        if (fromDomain !== toDomain) {
          // Order the pair so A->B and B->A count once. The separator is a
          // space because '-' is legal inside hostnames and would let two
          // different pairs produce the same key.
          const pair = fromDomain < toDomain ? [fromDomain, toDomain] : [toDomain, fromDomain];
          connections.add(pair.join(' '));
        }
      } catch {
        // Skip malformed URLs
      }
    }

    const uniqueConnections = connections.size;
    const maxPossibleConnections = (totalDomains * (totalDomains - 1)) / 2;

    return {
      density: uniqueConnections / maxPossibleConnections,
      averageConnections: uniqueConnections / totalDomains,
      uniqueConnections,
      maxPossibleConnections
    };
  }

  /**
   * Get path pattern for analysis
   * @param {string} pathname - URL pathname.
   * @returns {string} `/` for an empty path, `/<first>/` for one segment,
   *   otherwise `/<first>/<second>/...`.
   */
  getPathPattern(pathname) {
    const [first, second] = pathname.split('/').filter((segment) => segment);

    if (first === undefined) return '/';
    if (second === undefined) return `/${first}/`;

    // Return first two segments as pattern
    return `/${first}/${second}/...`;
  }

  /**
   * Get link analyzer instance
   * @returns {LinkAnalyzer|null} The analyzer, or null when link analysis is disabled.
   */
  getLinkAnalyzer() {
    return this.linkAnalyzer;
  }

  /**
   * Export link graph
   * @param {string} [format='json'] - Format passed to the analyzer's exportGraph().
   * @param {Object} [options={}] - Options passed to the analyzer's exportGraph().
   * @returns {*} Whatever the analyzer's exportGraph() returns.
   * @throws {Error} When link analysis is not enabled.
   */
  exportLinkGraph(format = 'json', options = {}) {
    if (!this.enableLinkAnalysis || !this.linkAnalyzer) {
      throw new Error('Link analysis is not enabled');
    }

    return this.linkAnalyzer.exportGraph(format, options);
  }
}
|
|
844
|
+
|
|
845
|
+
export default BFSCrawler;
|