crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,845 @@
1
+ import { load } from 'cheerio';
2
+ import { QueueManager } from '../queue/QueueManager.js';
3
+ import { CacheManager } from '../cache/CacheManager.js';
4
+ import { RateLimiter } from '../../utils/rateLimiter.js';
5
+ import { RobotsChecker } from '../../utils/robotsChecker.js';
6
+ import { DomainFilter } from '../../utils/domainFilter.js';
7
+ import { LinkAnalyzer } from '../analysis/LinkAnalyzer.js';
8
+ import { normalizeUrl, extractLinks, isValidUrl } from '../../utils/urlNormalizer.js';
9
+
10
/**
 * Breadth-first web crawler.
 *
 * Pages are fetched through a shared task queue, filtered by a DomainFilter
 * (plus optional legacy regex patterns and robots.txt), cached, and
 * optionally fed into a LinkAnalyzer to build a link graph.
 */
export class BFSCrawler {
  /**
   * @param {Object} [options]
   * @param {number} [options.maxDepth=5] - Deepest depth processed (start URL is depth 0).
   * @param {number} [options.maxPages=100] - Upper bound on the number of visited URLs.
   * @param {boolean} [options.followExternal=false] - Follow links whose origin differs from the start URL.
   * @param {boolean} [options.respectRobots=true] - Consult a RobotsChecker before fetching.
   * @param {string} [options.userAgent='MCP-WebScraper/1.0'] - User-Agent header value.
   * @param {number} [options.timeout=30000] - Default per-request timeout in milliseconds.
   * @param {number} [options.concurrency=10] - Passed to the QueueManager.
   * @param {DomainFilter|null} [options.domainFilter=null] - Existing filter; a new one is created when omitted.
   * @param {boolean} [options.enableLinkAnalysis=true] - Build a link graph while crawling.
   * @param {Object} [options.linkAnalyzerOptions={}] - Passed to the LinkAnalyzer constructor.
   */
  constructor(options = {}) {
    const {
      maxDepth = 5,
      maxPages = 100,
      followExternal = false,
      respectRobots = true,
      userAgent = 'MCP-WebScraper/1.0',
      timeout = 30000,
      concurrency = 10,
      domainFilter = null,
      enableLinkAnalysis = true,
      linkAnalyzerOptions = {}
    } = options;

    this.maxDepth = maxDepth;
    this.maxPages = maxPages;
    this.followExternal = followExternal;
    this.respectRobots = respectRobots;
    this.userAgent = userAgent;
    this.timeout = timeout;

    this.visited = new Set();
    this.results = [];
    this.errors = [];
    this.filterDecisions = []; // Track filtering decisions for analysis

    // Legacy regex filters are replaced by crawl(); initialised here so
    // shouldCrawlUrl()/processUrl() do not crash when used before crawl().
    this.includePatterns = [];
    this.excludePatterns = [];

    // Link analysis
    this.enableLinkAnalysis = enableLinkAnalysis;
    this.linkAnalyzer = enableLinkAnalysis ? new LinkAnalyzer(linkAnalyzerOptions) : null;

    this.queue = new QueueManager({ concurrency, timeout });
    this.cache = new CacheManager({ ttl: 3600000 }); // 1 hour cache
    this.rateLimiter = new RateLimiter({ requestsPerSecond: 10 });
    // One limiter per distinct requests-per-second value; the default limiter
    // (reported by getStats) serves every domain without a custom rate.
    this.rateLimiters = new Map([[10, this.rateLimiter]]);
    this.robotsChecker = respectRobots ? new RobotsChecker(userAgent) : null;

    // Initialize domain filter (create new if not provided)
    this.domainFilter = domainFilter || new DomainFilter({
      allowSubdomains: !followExternal, // If not following external, allow subdomains by default
      defaultMaxDepth: maxDepth,
      defaultRateLimit: 10
    });
  }

  /**
   * Crawl starting from `startUrl` and resolve once the queue is idle.
   *
   * NOTE(review): `visited`, `results` and `errors` are not reset here, so a
   * second crawl() on the same instance accumulates onto the first, and the
   * legacy patterns are added to the domain filter again — confirm intent.
   *
   * @param {string} startUrl - URL to start from.
   * @param {Object} [options]
   * @param {string[]} [options.includePatterns=[]] - Regex sources a URL must match (legacy).
   * @param {string[]} [options.excludePatterns=[]] - Regex sources a URL must not match (legacy).
   * @param {boolean} [options.extractContent=true] - Include page text and metadata in results.
   * @param {Object|null} [options.domainFilterConfig=null] - `{ whitelist, blacklist }` maps of domain -> options.
   * @returns {Promise<{urls: string[], results: Object[], errors: Object[], stats: Object, linkAnalysis: Object|null}>}
   * @throws {Error} When the start URL is rejected by the domain filter.
   */
  async crawl(startUrl, options = {}) {
    const {
      includePatterns = [],
      excludePatterns = [],
      extractContent = true,
      domainFilterConfig = null
    } = options;

    // Backward compatibility: convert old patterns to domain filter patterns
    this.includePatterns = includePatterns.map((source) => new RegExp(source));
    this.excludePatterns = excludePatterns.map((source) => new RegExp(source));

    // Add legacy patterns to domain filter for unified processing
    for (const pattern of includePatterns) {
      this.domainFilter.addPattern(pattern, 'include', { description: 'Legacy include pattern' });
    }
    for (const pattern of excludePatterns) {
      this.domainFilter.addPattern(pattern, 'exclude', { description: 'Legacy exclude pattern' });
    }

    // Apply additional domain filter configuration if provided
    if (domainFilterConfig) {
      if (domainFilterConfig.whitelist) {
        for (const [domain, domainOptions] of Object.entries(domainFilterConfig.whitelist)) {
          this.domainFilter.addWhitelistDomain(domain, domainOptions);
        }
      }
      if (domainFilterConfig.blacklist) {
        for (const [domain, domainOptions] of Object.entries(domainFilterConfig.blacklist)) {
          this.domainFilter.addBlacklistDomain(domain, domainOptions);
        }
      }
    }

    this.extractContent = extractContent;
    this.filterDecisions = []; // Reset filter decisions

    const normalizedStart = normalizeUrl(startUrl);
    this.baseUrl = new URL(normalizedStart);

    // Check if start URL is allowed
    const startUrlDecision = this.domainFilter.isAllowed(normalizedStart);
    if (!startUrlDecision.allowed) {
      throw new Error(`Start URL blocked by domain filter: ${startUrlDecision.reason}`);
    }

    // Initialize queue with starting URL, then wait for the crawl to drain
    await this.queue.add(() => this.processUrl(normalizedStart, 0));
    await this.queue.onIdle();

    const linkAnalysis = this.enableLinkAnalysis && this.linkAnalyzer
      ? this.performLinkAnalysis()
      : null;

    return {
      urls: Array.from(this.visited),
      results: this.results,
      errors: this.errors,
      stats: this.getStats(),
      linkAnalysis
    };
  }

  /**
   * Process one URL: apply filters, fetch (or read from cache), record the
   * result and enqueue the links it contains.
   *
   * Fetch/parse failures are recorded in `this.errors` rather than thrown.
   *
   * @param {string} url - URL to process.
   * @param {number} depth - Depth of this URL (start URL is 0).
   * @returns {Promise<void>}
   */
  async processUrl(url, depth) {
    // Check limits
    if (depth > this.maxDepth || this.visited.size >= this.maxPages) {
      return;
    }

    // Check if already visited
    const normalizedUrl = normalizeUrl(url);
    if (this.visited.has(normalizedUrl)) {
      return;
    }

    // Check domain filter (replaces old pattern checking)
    const filterDecision = this.domainFilter.isAllowed(normalizedUrl);
    this.filterDecisions.push({
      url: normalizedUrl,
      decision: filterDecision,
      timestamp: new Date().toISOString()
    });

    // Diagnostics go to stderr: a stdio-based MCP server must keep stdout
    // free of anything that is not a protocol message.
    if (!filterDecision.allowed) {
      console.error(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
      return;
    }

    // Backward compatibility: also check legacy patterns
    if (!this.shouldCrawlUrl(normalizedUrl)) {
      console.error(`Legacy pattern blocks: ${normalizedUrl}`);
      return;
    }

    // Check robots.txt
    if (this.respectRobots && this.robotsChecker) {
      const canFetch = await this.robotsChecker.canFetch(normalizedUrl);
      if (!canFetch) {
        console.error(`Robots.txt blocks: ${normalizedUrl}`);
        return;
      }
    }

    // Other tasks may have run during the await above; re-validate so the
    // same URL is never processed twice and maxPages is not exceeded.
    if (this.visited.has(normalizedUrl) || this.visited.size >= this.maxPages) {
      return;
    }

    // Mark as visited
    this.visited.add(normalizedUrl);

    try {
      // Check cache first
      const cacheKey = this.cache.generateKey(normalizedUrl);
      let pageData = await this.cache.get(cacheKey);

      if (!pageData) {
        // Apply the domain-specific rate limit (default 10 requests/second)
        const { hostname } = new URL(normalizedUrl);
        const domainRules = this.domainFilter.getDomainRules(hostname);
        const limiter = this._getRateLimiter(domainRules.rateLimit || 10);
        await limiter.checkLimit(normalizedUrl);

        pageData = await this.fetchPage(normalizedUrl);
        await this.cache.set(cacheKey, pageData);
      }

      // Process links for analysis
      if (this.enableLinkAnalysis && this.linkAnalyzer && pageData.links) {
        for (const link of pageData.links) {
          const absoluteUrl = this.resolveUrl(link, normalizedUrl);
          if (absoluteUrl) {
            const linkMetadata = this.extractLinkMetadata(link, pageData.originalHtml, normalizedUrl);
            this.linkAnalyzer.addLink(normalizedUrl, absoluteUrl, linkMetadata);
          }
        }
      }

      // Process the page
      const result = {
        url: normalizedUrl,
        depth,
        title: pageData.title,
        contentLength: pageData.content?.length || 0,
        links: pageData.links?.length || 0,
        timestamp: new Date().toISOString()
      };

      if (this.extractContent) {
        result.content = pageData.content;
        result.metadata = pageData.metadata;
      }

      this.results.push(result);

      // Add discovered links to queue (if not at max depth)
      if (depth < this.maxDepth && pageData.links) {
        for (const link of pageData.links) {
          if (this.visited.size >= this.maxPages) break;

          const absoluteUrl = this.resolveUrl(link, normalizedUrl);
          if (absoluteUrl && !this.visited.has(absoluteUrl)) {
            this._enqueue(absoluteUrl, depth + 1);
          }
        }
      }
    } catch (error) {
      this.errors.push({
        url: normalizedUrl,
        depth,
        error: error.message,
        timestamp: new Date().toISOString()
      });
    }
  }

  /**
   * Schedule a child URL without waiting for it to finish.
   *
   * Awaiting the child from inside the parent task would keep the parent's
   * queue slot occupied until the whole subtree completed (assuming
   * `queue.add` resolves on task completion — QueueManager is defined
   * elsewhere). A rejected child is recorded against the child's own URL.
   *
   * @param {string} url - Normalized absolute URL to process.
   * @param {number} depth - Depth assigned to the child.
   */
  _enqueue(url, depth) {
    Promise.resolve(this.queue.add(() => this.processUrl(url, depth))).catch((error) => {
      this.errors.push({
        url,
        depth,
        error: error.message,
        timestamp: new Date().toISOString()
      });
    });
  }

  /**
   * Return the rate limiter for a requests-per-second value, creating and
   * caching it on first use so limiter state survives across requests.
   *
   * @param {number} requestsPerSecond
   * @returns {RateLimiter}
   */
  _getRateLimiter(requestsPerSecond) {
    if (!this.rateLimiters) {
      this.rateLimiters = new Map();
      if (this.rateLimiter) {
        this.rateLimiters.set(10, this.rateLimiter);
      }
    }

    let limiter = this.rateLimiters.get(requestsPerSecond);
    if (!limiter) {
      limiter = new RateLimiter({ requestsPerSecond });
      this.rateLimiters.set(requestsPerSecond, limiter);
    }
    return limiter;
  }

  /**
   * Fetch a URL and parse it as HTML.
   *
   * A single abort timer (domain-specific timeout if configured, otherwise
   * the crawler default) covers both the request and the body download and
   * is always cleared on exit.
   *
   * @param {string} url - Absolute URL to fetch.
   * @returns {Promise<Object>} The value returned by parsePage().
   * @throws {Error} On a non-2xx status, a non-HTML content type, abort or network failure.
   */
  async fetchPage(url) {
    // Get domain-specific headers and timeout
    const { hostname } = new URL(url);
    const domainRules = this.domainFilter.getDomainRules(hostname);

    const headers = {
      'User-Agent': this.userAgent,
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
      'Accept-Encoding': 'gzip, deflate',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      ...domainRules.customHeaders
    };
    const effectiveTimeout = domainRules.timeout || this.timeout;

    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), effectiveTimeout);

    try {
      const response = await fetch(url, {
        signal: controller.signal,
        headers
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      const contentType = response.headers.get('content-type');
      const mediaType = (contentType || '').toLowerCase();
      const isHtml = mediaType.includes('text/html') || mediaType.includes('application/xhtml+xml');
      if (!isHtml) {
        throw new Error(`Non-HTML content type: ${contentType}`);
      }

      const html = await response.text();
      return this.parsePage(html, url);
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Extract title, visible text, selected meta tags and unique hrefs from HTML.
   *
   * @param {string} html - Raw HTML.
   * @param {string} url - Page URL (currently unused).
   * @returns {{title: string, content: string, metadata: Object, links: string[], originalHtml: string}}
   */
  parsePage(html, url) {
    const $ = load(html);
    const metaContent = (selector) => $(selector).attr('content') || '';

    const title = $('title').text().trim() || $('h1').first().text().trim() || '';

    // Strip non-visible elements before reading the body text.
    $('script, style, noscript').remove();
    const content = $('body').text().replace(/\s+/g, ' ').trim();

    const metadata = {
      description: metaContent('meta[name="description"]'),
      keywords: metaContent('meta[name="keywords"]'),
      author: metaContent('meta[name="author"]'),
      ogTitle: metaContent('meta[property="og:title"]'),
      ogDescription: metaContent('meta[property="og:description"]')
    };

    // A Set keeps first-seen order while dropping duplicate hrefs.
    const uniqueLinks = new Set();
    $('a[href]').each((_, element) => {
      const href = $(element).attr('href');
      if (href && !href.startsWith('#') && !href.startsWith('javascript:')) {
        uniqueLinks.add(href);
      }
    });

    return {
      title,
      content,
      metadata,
      links: [...uniqueLinks],
      originalHtml: html // Store original HTML for link analysis
    };
  }

  /**
   * Resolve a link against the page it was found on.
   *
   * @param {string} link - Raw href value (absolute or relative).
   * @param {string} baseUrl - URL of the page containing the link.
   * @returns {string|null} Normalized absolute URL, or null when the link is
   *   malformed, not http(s), or external while `followExternal` is false.
   */
  resolveUrl(link, baseUrl) {
    try {
      // `new URL` handles absolute and relative inputs alike.
      const resolved = new URL(link, baseUrl);

      // Only http(s) pages can be fetched; skip mailto:, tel:, data:, etc.
      if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
        return null;
      }

      // Check if we should follow external links
      if (!this.followExternal && resolved.origin !== this.baseUrl.origin) {
        return null;
      }

      return normalizeUrl(resolved.href);
    } catch {
      return null;
    }
  }

  /**
   * Apply the legacy include/exclude regex patterns.
   *
   * @param {string} url
   * @returns {boolean} False when include patterns exist and none match, or
   *   when any exclude pattern matches; true otherwise.
   */
  shouldCrawlUrl(url) {
    const includes = this.includePatterns || [];
    const excludes = this.excludePatterns || [];

    if (includes.length > 0 && !includes.some((pattern) => pattern.test(url))) {
      return false;
    }
    return !excludes.some((pattern) => pattern.test(url));
  }

  /**
   * Collect counters from the crawler and its collaborators.
   * `rateLimitStats` reports the default limiter only.
   *
   * @returns {Object}
   */
  getStats() {
    const filterStats = this.domainFilter.getStats();
    const filterDecisionStats = this.getFilterDecisionStats();

    return {
      visited: this.visited.size,
      results: this.results.length,
      errors: this.errors.length,
      cacheStats: this.cache.getStats(),
      queueStats: this.queue.getStats(),
      rateLimitStats: this.rateLimiter.getStats(),
      domainFilterStats: filterStats,
      filterDecisions: filterDecisionStats
    };
  }

  /**
   * Summarise recorded domain-filter decisions.
   *
   * @returns {{total: number, allowed: number, blocked: number, allowedPercentage: (string|number), blockedReasons: Object}}
   *   `allowedPercentage` is a two-decimal string, or the number 0 when no
   *   decisions were recorded.
   */
  getFilterDecisionStats() {
    const total = this.filterDecisions.length;
    const reasonCounts = {};
    let allowed = 0;

    for (const { decision } of this.filterDecisions) {
      if (decision.allowed) {
        allowed += 1;
      } else {
        reasonCounts[decision.reason] = (reasonCounts[decision.reason] || 0) + 1;
      }
    }

    return {
      total,
      allowed,
      blocked: total - allowed,
      allowedPercentage: total > 0 ? (allowed / total * 100).toFixed(2) : 0,
      blockedReasons: reasonCounts
    };
  }

  /** Pause the task queue. */
  pause() {
    this.queue.pause();
  }

  /** Restart the task queue after pause(). */
  resume() {
    this.queue.start();
  }

  /** Clear the task queue, then pause it. */
  stop() {
    this.queue.clear();
    this.queue.pause();
  }

  /**
   * Get the domain filter instance
   * @returns {DomainFilter} Current domain filter
   */
  getDomainFilter() {
    return this.domainFilter;
  }

  /**
   * Set a new domain filter instance and reset recorded filter decisions.
   * @param {DomainFilter} domainFilter - New domain filter to use
   * @returns {BFSCrawler} This crawler, for chaining.
   * @throws {Error} When the argument is not a DomainFilter.
   */
  setDomainFilter(domainFilter) {
    if (!(domainFilter instanceof DomainFilter)) {
      throw new Error('Invalid domain filter: must be instance of DomainFilter');
    }
    this.domainFilter = domainFilter;
    this.filterDecisions = []; // Reset filter decisions
    return this;
  }

  /**
   * Configure domain filter with simple options.
   *
   * List entries may be plain strings or objects (`{ domain, options }` for
   * domains, `{ pattern, options }` for patterns); other entries are ignored.
   *
   * @param {Object} config - Configuration object
   * @param {Array} [config.whitelist=[]]
   * @param {Array} [config.blacklist=[]]
   * @param {Array} [config.includePatterns=[]]
   * @param {Array} [config.excludePatterns=[]]
   * @param {Object} [config.domainRules={}] - Map of domain -> rules.
   * @returns {BFSCrawler} This crawler, for chaining.
   */
  configureDomainFilter(config) {
    const {
      whitelist = [],
      blacklist = [],
      includePatterns = [],
      excludePatterns = [],
      domainRules = {}
    } = config;

    // `entry &&` guards against null, whose typeof is also 'object'.
    const isObject = (entry) => Boolean(entry) && typeof entry === 'object';

    for (const entry of whitelist) {
      if (typeof entry === 'string') {
        this.domainFilter.addWhitelistDomain(entry);
      } else if (isObject(entry) && entry.domain) {
        this.domainFilter.addWhitelistDomain(entry.domain, entry.options || {});
      }
    }

    for (const entry of blacklist) {
      if (typeof entry === 'string') {
        this.domainFilter.addBlacklistDomain(entry);
      } else if (isObject(entry) && entry.domain) {
        this.domainFilter.addBlacklistDomain(entry.domain, entry.options || {});
      }
    }

    const patternGroups = [
      ['include', includePatterns],
      ['exclude', excludePatterns]
    ];
    for (const [type, patterns] of patternGroups) {
      for (const entry of patterns) {
        if (typeof entry === 'string') {
          this.domainFilter.addPattern(entry, type);
        } else if (isObject(entry) && entry.pattern) {
          this.domainFilter.addPattern(entry.pattern, type, entry.options || {});
        }
      }
    }

    for (const [domain, rules] of Object.entries(domainRules)) {
      this.domainFilter.setDomainRules(domain, rules);
    }

    return this;
  }

  /**
   * Extract link metadata from HTML
   *
   * Describes the first `<a>` whose href attribute equals `href` exactly.
   * `context` is up to 100 characters of body text either side of the first
   * occurrence of the anchor text in the body.
   *
   * @param {string} href - The href attribute value
   * @param {string} html - Original HTML content
   * @param {string} baseUrl - Base URL for context (currently unused)
   * @returns {Object} Link metadata; `{}` without html, `{ href }` when no
   *   anchor matches, `{ href, error }` when parsing fails.
   */
  extractLinkMetadata(href, html, baseUrl) {
    if (!html) return {};

    try {
      const { anchors, bodyText } = this._getLinkIndex(html);
      const anchor = anchors.get(href);

      if (!anchor) {
        return { href };
      }

      const { anchorText } = anchor;
      const linkTextIndex = bodyText.indexOf(anchorText);
      let context = '';

      if (linkTextIndex >= 0 && anchorText) {
        const start = Math.max(0, linkTextIndex - 100);
        const end = Math.min(bodyText.length, linkTextIndex + anchorText.length + 100);
        context = bodyText.substring(start, end).trim();
      }

      return {
        href,
        anchorText,
        title: anchor.title,
        rel: anchor.rel,
        className: anchor.className,
        context,
        extractedAt: new Date().toISOString()
      };
    } catch (error) {
      return { href, error: error.message };
    }
  }

  /**
   * Parse `html` once and index its anchors by raw href value.
   *
   * The most recently parsed document is memoised, so the per-link calls
   * made for a single page reuse one parse. Looking anchors up by attribute
   * value also avoids building a CSS selector from an untrusted href.
   *
   * @param {string} html
   * @returns {{html: string, bodyText: string, anchors: Map<string, Object>}}
   */
  _getLinkIndex(html) {
    const cached = this._linkIndexCache;
    if (cached && cached.html === html) {
      return cached;
    }

    const $ = load(html);
    const anchors = new Map();
    $('a[href]').each((_, element) => {
      const node = $(element);
      const key = node.attr('href');
      // Keep the first anchor for each href.
      if (!anchors.has(key)) {
        anchors.set(key, {
          anchorText: node.text().trim(),
          title: node.attr('title'),
          rel: node.attr('rel'),
          className: node.attr('class')
        });
      }
    });

    const entry = { html, bodyText: $('body').text(), anchors };
    this._linkIndexCache = entry;
    return entry;
  }

  /**
   * Perform comprehensive link analysis
   * @returns {Object|null} Link analysis results, null when analysis is
   *   disabled, or `{ error, analysisTime, generatedAt }` when it fails.
   */
  performLinkAnalysis() {
    if (!this.enableLinkAnalysis || !this.linkAnalyzer) {
      return null;
    }

    const startTime = Date.now();

    try {
      const importance = this.linkAnalyzer.calculateImportance();
      const cycles = this.linkAnalyzer.detectCycles({ maxCycleLength: 8, includeMetadata: true });
      const statistics = this.linkAnalyzer.getStatistics();
      const hubsAndAuthorities = this.findHubsAndAuthorities(importance);
      const linkPatterns = this.analyzeLinkPatterns();
      const domainAnalysis = this.analyzeDomainLinking();
      const analysisTime = Date.now() - startTime;

      return {
        statistics,
        importance: this.formatImportanceResults(importance),
        cycles: cycles.map((cycle) => ({
          ...cycle,
          urls: cycle.nodes,
          cycleLength: cycle.length,
          strength: cycle.strength
        })),
        hubsAndAuthorities,
        linkPatterns,
        domainAnalysis,
        analysisTime,
        generatedAt: new Date().toISOString()
      };
    } catch (error) {
      return {
        error: error.message,
        analysisTime: Date.now() - startTime,
        generatedAt: new Date().toISOString()
      };
    }
  }

  /**
   * Format importance results for output
   *
   * @param {Map<string, number>} importance - URL -> score.
   * @returns {{topPages: Object[], totalPages: number, averageImportance: number, importanceRange: {min: number, max: number}}}
   *   Pages sorted by descending score (top 20); the average is 0 for an empty map.
   */
  formatImportanceResults(importance) {
    const ranked = Array.from(importance.entries())
      .map(([url, score]) => ({ url, importance: score }))
      .sort((a, b) => b.importance - a.importance);

    const totalImportance = ranked.reduce((sum, item) => sum + item.importance, 0);

    return {
      topPages: ranked.slice(0, 20),
      totalPages: ranked.length,
      // Guard the division so an empty graph yields 0 instead of NaN.
      averageImportance: ranked.length > 0 ? totalImportance / ranked.length : 0,
      importanceRange: {
        min: ranked[ranked.length - 1]?.importance || 0,
        max: ranked[0]?.importance || 0
      }
    };
  }

  /**
   * Find hub and authority pages
   *
   * @param {Map<string, number>} importance - URL -> score.
   * @param {Object} [thresholds]
   * @param {number} [thresholds.minOutboundLinks=10] - Outbound links needed to count as a hub.
   * @param {number} [thresholds.minInboundLinks=5] - Inbound links needed to count as an authority.
   * @param {number} [thresholds.limit=10] - Maximum entries returned per list.
   * @returns {{hubs: Object[], authorities: Object[]}} Each list sorted by descending link count.
   */
  findHubsAndAuthorities(importance, { minOutboundLinks = 10, minInboundLinks = 5, limit = 10 } = {}) {
    const hubs = [];
    const authorities = [];

    for (const node of this.linkAnalyzer.nodes.keys()) {
      const outboundCount = this.linkAnalyzer.getOutboundLinks(node).length;
      const inboundCount = this.linkAnalyzer.getInboundLinks(node).length;
      const importanceScore = importance.get(node) || 0;

      if (outboundCount >= minOutboundLinks) {
        hubs.push({ url: node, outboundLinks: outboundCount, importance: importanceScore });
      }

      if (inboundCount >= minInboundLinks) {
        authorities.push({ url: node, inboundLinks: inboundCount, importance: importanceScore });
      }
    }

    return {
      hubs: hubs.sort((a, b) => b.outboundLinks - a.outboundLinks).slice(0, limit),
      authorities: authorities.sort((a, b) => b.inboundLinks - a.inboundLinks).slice(0, limit)
    };
  }

  /**
   * Split a link-metadata key on '|' into parsed source and target URLs.
   *
   * @param {string} linkKey
   * @returns {{fromUrl: URL, toUrl: URL}|null} Null when either part is not a valid URL.
   */
  _parseLinkKey(linkKey) {
    const [from, to] = linkKey.split('|');
    try {
      return { fromUrl: new URL(from), toUrl: new URL(to) };
    } catch {
      return null; // Skip malformed URLs
    }
  }

  /**
   * Analyze link patterns
   *
   * NOTE(review): `internal`/`external` is decided by the SOURCE page's
   * origin relative to the start URL, not the target's — confirm intent.
   *
   * @returns {{linkDistribution: Object, topPathPatterns: Object[], topAnchorTexts: Object[]}}
   */
  analyzeLinkPatterns() {
    const distribution = { internal: 0, external: 0, sameDomain: 0, crossDomain: 0 };
    const pathCounts = new Map();
    const anchorCounts = new Map();
    const bump = (counts, key) => counts.set(key, (counts.get(key) || 0) + 1);

    for (const [linkKey, linkData] of this.linkAnalyzer.linkMetadata) {
      const parsed = this._parseLinkKey(linkKey);
      if (!parsed) continue;
      const { fromUrl, toUrl } = parsed;

      if (fromUrl.origin === this.baseUrl.origin) {
        distribution.internal++;
      } else {
        distribution.external++;
      }

      if (fromUrl.hostname === toUrl.hostname) {
        distribution.sameDomain++;
      } else {
        distribution.crossDomain++;
      }

      bump(pathCounts, this.getPathPattern(toUrl.pathname));

      const rawAnchorText = linkData?.anchorText;
      if (typeof rawAnchorText === 'string') {
        const anchorText = rawAnchorText.toLowerCase().trim();
        if (anchorText.length > 0) {
          bump(anchorCounts, anchorText);
        }
      }
    }

    const topEntries = (counts, limit) => Array.from(counts.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, limit);

    return {
      linkDistribution: distribution,
      topPathPatterns: topEntries(pathCounts, 10).map(([pattern, count]) => ({ pattern, count })),
      topAnchorTexts: topEntries(anchorCounts, 15).map(([text, count]) => ({ text, count }))
    };
  }

  /**
   * Analyze domain-level linking
   *
   * @returns {{totalDomains: number, topDomains: Object[], domainConnectivity: Object}}
   *   `topDomains` holds the 20 hostnames with the most inbound + outbound links.
   */
  analyzeDomainLinking() {
    const domainStats = new Map();
    const statsFor = (domain) => {
      if (!domainStats.has(domain)) {
        domainStats.set(domain, { outbound: 0, inbound: 0, internal: 0, external: 0 });
      }
      return domainStats.get(domain);
    };

    for (const [linkKey] of this.linkAnalyzer.linkMetadata) {
      const parsed = this._parseLinkKey(linkKey);
      if (!parsed) continue;

      const fromDomain = parsed.fromUrl.hostname;
      const toDomain = parsed.toUrl.hostname;
      const fromStats = statsFor(fromDomain);
      const toStats = statsFor(toDomain);

      fromStats.outbound++;
      toStats.inbound++;

      if (fromDomain === toDomain) {
        fromStats.internal++;
      } else {
        fromStats.external++;
      }
    }

    const topDomains = Array.from(domainStats.entries())
      .map(([domain, stats]) => ({ domain, ...stats }))
      .sort((a, b) => (b.outbound + b.inbound) - (a.outbound + a.inbound))
      .slice(0, 20);

    return {
      totalDomains: domainStats.size,
      topDomains,
      domainConnectivity: this.calculateDomainConnectivity(domainStats)
    };
  }

  /**
   * Calculate domain connectivity metrics
   *
   * Counts unordered pairs of distinct hostnames joined by at least one link
   * and relates that count to the number of possible pairs.
   *
   * @param {Map<string, Object>} domainStats - Only the number of keys is used.
   * @returns {{density: number, averageConnections: number, uniqueConnections: number, maxPossibleConnections: number}}
   */
  calculateDomainConnectivity(domainStats) {
    const totalDomains = domainStats.size;

    if (totalDomains <= 1) {
      return { density: 0, averageConnections: 0, uniqueConnections: 0, maxPossibleConnections: 0 };
    }

    const connections = new Set();

    for (const [linkKey] of this.linkAnalyzer.linkMetadata) {
      const parsed = this._parseLinkKey(linkKey);
      if (!parsed) continue;

      const fromDomain = parsed.fromUrl.hostname;
      const toDomain = parsed.toUrl.hostname;

      if (fromDomain !== toDomain) {
        // '|' cannot occur in a hostname, so distinct pairs never collide
        // (a '-' separator would merge "a-b"+"c" with "a"+"b-c").
        const connectionKey = fromDomain < toDomain
          ? `${fromDomain}|${toDomain}`
          : `${toDomain}|${fromDomain}`;
        connections.add(connectionKey);
      }
    }

    const uniqueConnections = connections.size;
    const maxPossibleConnections = (totalDomains * (totalDomains - 1)) / 2;
    const density = maxPossibleConnections > 0 ? uniqueConnections / maxPossibleConnections : 0;
    const averageConnections = uniqueConnections / totalDomains;

    return { density, averageConnections, uniqueConnections, maxPossibleConnections };
  }

  /**
   * Get path pattern for analysis
   *
   * @param {string} pathname - URL pathname.
   * @returns {string} '/' for an empty path, '/seg/' for one segment,
   *   '/seg1/seg2/...' for two or more.
   */
  getPathPattern(pathname) {
    const [first, second] = pathname.split('/').filter((segment) => segment);

    if (first === undefined) return '/';
    if (second === undefined) return `/${first}/`;

    // Return first two segments as pattern
    return `/${first}/${second}/...`;
  }

  /**
   * Get link analyzer instance
   * @returns {LinkAnalyzer|null} Null when link analysis is disabled.
   */
  getLinkAnalyzer() {
    return this.linkAnalyzer;
  }

  /**
   * Export link graph
   *
   * @param {string} [format='json'] - Passed through to LinkAnalyzer.exportGraph.
   * @param {Object} [options={}] - Passed through to LinkAnalyzer.exportGraph.
   * @returns {*} Whatever LinkAnalyzer.exportGraph returns.
   * @throws {Error} When link analysis is not enabled.
   */
  exportLinkGraph(format = 'json', options = {}) {
    if (!this.enableLinkAnalysis || !this.linkAnalyzer) {
      throw new Error('Link analysis is not enabled');
    }

    return this.linkAnalyzer.exportGraph(format, options);
  }
}
844
+
845
+ export default BFSCrawler;