crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,972 @@
1
+ import { URL } from 'url';
2
+ import { normalizeUrl } from '../../utils/urlNormalizer.js';
3
+
4
/**
 * LinkAnalyzer - Comprehensive link analysis system with graph builder
 *
 * Features:
 * - Directed graph data structure for link relationships
 * - Parent-child relationship tracking
 * - Link importance calculation (simplified PageRank)
 * - Circular reference detection and handling
 * - Path analysis (shortest paths)
 * - Graph export capabilities (JSON, DOT, CSV, adjacency matrix)
 *
 * All graph traversals are iterative so that long link chains cannot
 * overflow the call stack.
 */
export class LinkAnalyzer {
  /**
   * @param {Object} [options]
   * @param {number} [options.dampingFactor=0.85] - PageRank damping factor
   * @param {number} [options.maxIterations=100] - Max PageRank iterations
   * @param {number} [options.convergenceThreshold=0.0001] - PageRank convergence threshold
   * @param {number} [options.defaultImportance=1.0] - Importance given to a freshly created node
   * @param {boolean} [options.enableCaching=true] - Enable calculation caching
   * @param {number} [options.maxCacheSize=10000] - Max cache entries
   */
  constructor(options = {}) {
    const {
      dampingFactor = 0.85,
      maxIterations = 100,
      convergenceThreshold = 0.0001,
      defaultImportance = 1.0,
      enableCaching = true,
      maxCacheSize = 10000
    } = options;

    // Graph data structures
    this.nodes = new Map(); // url -> node data
    this.outboundLinks = new Map(); // url -> Set of outbound URLs
    this.inboundLinks = new Map(); // url -> Set of inbound URLs
    this.linkMetadata = new Map(); // `from|to` -> link metadata

    // Analysis results cache
    this.cache = enableCaching ? new Map() : null;
    this.maxCacheSize = maxCacheSize;

    // PageRank parameters
    this.dampingFactor = dampingFactor;
    this.maxIterations = maxIterations;
    this.convergenceThreshold = convergenceThreshold;
    this.defaultImportance = defaultImportance;

    // Performance tracking
    this.stats = {
      nodesCount: 0,
      linksCount: 0,
      lastAnalysisTime: null,
      totalAnalyses: 0,
      cacheHits: 0,
      cacheMisses: 0
    };

    // Reserved maps; nothing in this class reads them, they are only reset by clear()
    this.cycleCache = new Map();
    this.pathCache = new Map();
  }

  /**
   * Add a directed link to the graph.
   *
   * The link is rejected (returns false) when either URL fails to normalize,
   * when both normalize to the same URL, or when either endpoint cannot be
   * registered as a node. Rejecting the last case keeps the adjacency maps
   * and `nodes` consistent with each other.
   *
   * @param {string} from - Source URL
   * @param {string} to - Target URL
   * @param {Object} metadata - Link metadata (anchor text, context, etc.)
   * @returns {boolean} true when the link was recorded
   */
  addLink(from, to, metadata = {}) {
    const normalizedFrom = normalizeUrl(from);
    const normalizedTo = normalizeUrl(to);

    if (!normalizedFrom || !normalizedTo || normalizedFrom === normalizedTo) {
      return false;
    }

    // Both endpoints must exist as nodes before an edge may reference them
    if (!this.ensureNode(normalizedFrom) || !this.ensureNode(normalizedTo)) {
      return false;
    }

    // Add outbound link
    if (!this.outboundLinks.has(normalizedFrom)) {
      this.outboundLinks.set(normalizedFrom, new Set());
    }
    this.outboundLinks.get(normalizedFrom).add(normalizedTo);

    // Add inbound link
    if (!this.inboundLinks.has(normalizedTo)) {
      this.inboundLinks.set(normalizedTo, new Set());
    }
    this.inboundLinks.get(normalizedTo).add(normalizedFrom);

    // Store link metadata; firstSeen/lastSeen/count always win over caller-supplied values
    const linkKey = `${normalizedFrom}|${normalizedTo}`;
    const existingMetadata = this.linkMetadata.get(linkKey) || {};
    const now = new Date().toISOString();
    this.linkMetadata.set(linkKey, {
      ...existingMetadata,
      ...metadata,
      firstSeen: existingMetadata.firstSeen || now,
      lastSeen: now,
      count: (existingMetadata.count || 0) + 1
    });

    // Update statistics
    this.stats.linksCount = this.linkMetadata.size;

    // Clear caches that depend on graph structure
    this.clearStructuralCaches();

    return true;
  }

  /**
   * Ensure a node exists in the graph, creating it when missing.
   *
   * @param {string} url - URL to ensure exists
   * @returns {boolean} false when the normalized URL cannot be parsed by `URL`
   */
  ensureNode(url) {
    const normalizedUrl = normalizeUrl(url);
    if (this.nodes.has(normalizedUrl)) {
      return true;
    }

    let urlObj;
    try {
      urlObj = new URL(normalizedUrl);
    } catch {
      // Unparseable URLs are reported to the caller instead of throwing
      return false;
    }

    this.nodes.set(normalizedUrl, {
      url: normalizedUrl,
      domain: urlObj.hostname,
      path: urlObj.pathname,
      importance: this.defaultImportance,
      depth: 0,
      discovered: new Date().toISOString(),
      metadata: {}
    });
    this.stats.nodesCount = this.nodes.size;

    // A new node changes the node count used by cached analyses
    this.clearStructuralCaches();
    return true;
  }

  /**
   * Get all inbound links for a URL
   * @param {string} url - Target URL
   * @returns {Array} Array of source URLs
   */
  getInboundLinks(url) {
    const sources = this.inboundLinks.get(normalizeUrl(url));
    return sources ? Array.from(sources) : [];
  }

  /**
   * Get all outbound links for a URL
   * @param {string} url - Source URL
   * @returns {Array} Array of target URLs
   */
  getOutboundLinks(url) {
    const targets = this.outboundLinks.get(normalizeUrl(url));
    return targets ? Array.from(targets) : [];
  }

  /**
   * Calculate link importance using a simplified PageRank algorithm.
   *
   * Nodes without outbound links do not redistribute their rank, so the
   * scores are not guaranteed to sum to 1. Each node's `importance` field is
   * overwritten with its score as a side effect.
   *
   * @param {Object} options - Optional overrides: dampingFactor, maxIterations, convergenceThreshold
   * @returns {Map} Map of URL to importance score
   */
  calculateImportance(options = {}) {
    const cacheKey = 'importance_' + JSON.stringify(options);
    if (this.cache && this.cache.has(cacheKey)) {
      this.stats.cacheHits++;
      return this.cache.get(cacheKey);
    }

    const startTime = Date.now();
    const {
      dampingFactor = this.dampingFactor,
      maxIterations = this.maxIterations,
      convergenceThreshold = this.convergenceThreshold
    } = options;

    const nodes = Array.from(this.nodes.keys());
    const nodeCount = nodes.length;

    if (nodeCount === 0) {
      return new Map();
    }

    // Initialize PageRank values
    let pageRank = new Map();
    let newPageRank = new Map();
    const initialValue = 1.0 / nodeCount;

    for (const node of nodes) {
      pageRank.set(node, initialValue);
      newPageRank.set(node, initialValue);
    }

    const baseRank = (1 - dampingFactor) / nodeCount;
    let iteration = 0;
    let hasConverged = false;

    while (iteration < maxIterations && !hasConverged) {
      hasConverged = true;

      for (const node of nodes) {
        let sum = 0;
        // Read the adjacency sets directly: graph keys are already normalized
        const sources = this.inboundLinks.get(node);

        if (sources) {
          for (const source of sources) {
            const outboundCount = this.outboundLinks.get(source)?.size ?? 0;
            if (outboundCount > 0) {
              sum += (pageRank.get(source) ?? 0) / outboundCount;
            }
          }
        }

        const newValue = baseRank + dampingFactor * sum;
        newPageRank.set(node, newValue);

        // Check convergence
        if (Math.abs(newValue - pageRank.get(node)) > convergenceThreshold) {
          hasConverged = false;
        }
      }

      // Swap maps for next iteration
      [pageRank, newPageRank] = [newPageRank, pageRank];
      iteration++;
    }

    // Update node importance scores
    for (const [url, score] of pageRank) {
      const node = this.nodes.get(url);
      if (node) {
        node.importance = score;
      }
    }

    // Cache results
    if (this.cache) {
      this.setCacheEntry(cacheKey, pageRank);
      this.stats.cacheMisses++;
    }

    this.stats.lastAnalysisTime = Date.now() - startTime;
    this.stats.totalAnalyses++;

    return pageRank;
  }

  /**
   * Detect circular reference chains in the graph.
   *
   * Uses a single depth-first pass with an explicit stack, so it reports the
   * cycles closed by back edges of that traversal rather than every simple
   * cycle in the graph.
   *
   * @param {Object} options - Detection options
   * @param {number} [options.maxCycleLength=10] - Largest cycle to report, measured in links
   * @param {boolean} [options.includeMetadata=false] - Attach per-link metadata to each cycle
   * @returns {Array} Array of cycle objects ({ nodes, length, strength[, metadata] })
   */
  detectCycles(options = {}) {
    const cacheKey = 'cycles_' + JSON.stringify(options);
    if (this.cache && this.cache.has(cacheKey)) {
      this.stats.cacheHits++;
      return this.cache.get(cacheKey);
    }

    const {
      maxCycleLength = 10,
      includeMetadata = false
    } = options;

    const cycles = [];
    const visited = new Set();
    const path = [];
    const pathIndex = new Map(); // node -> its position in `path` while it is on the DFS path

    // Put a node on the DFS path and return its traversal frame
    const enter = (node) => {
      visited.add(node);
      pathIndex.set(node, path.length);
      path.push(node);
      const targets = this.outboundLinks.get(node);
      return { node, pending: targets ? targets.values() : [].values() };
    };

    // Start DFS from each unvisited node
    for (const root of this.nodes.keys()) {
      if (visited.has(root)) {
        continue;
      }

      const frames = [enter(root)];
      while (frames.length > 0) {
        const frame = frames[frames.length - 1];
        const step = frame.pending.next();

        if (step.done) {
          // All neighbours handled: leave the node
          frames.pop();
          path.pop();
          pathIndex.delete(frame.node);
          continue;
        }

        const neighbor = step.value;
        if (pathIndex.has(neighbor)) {
          // Back edge onto the current path: found a cycle
          const cycle = path.slice(pathIndex.get(neighbor));
          cycle.push(neighbor); // Complete the cycle

          const linkCount = cycle.length - 1; // Don't count repeated node
          if (linkCount <= maxCycleLength) {
            const cycleObj = {
              nodes: cycle,
              length: linkCount,
              strength: this.calculateCycleStrength(cycle)
            };

            if (includeMetadata) {
              cycleObj.metadata = this.getCycleMetadata(cycle);
            }

            cycles.push(cycleObj);
          }
        } else if (!visited.has(neighbor)) {
          frames.push(enter(neighbor));
        }
      }
    }

    // Remove duplicate cycles
    const uniqueCycles = this.deduplicateCycles(cycles);

    // Cache results
    if (this.cache) {
      this.setCacheEntry(cacheKey, uniqueCycles);
      this.stats.cacheMisses++;
    }

    return uniqueCycles;
  }

  /**
   * Find relationship path between two URLs
   * @param {string} url1 - Starting URL
   * @param {string} url2 - Target URL
   * @param {Object} options - Path finding options
   * @param {number} [options.maxDepth=10] - Maximum number of links in the path
   * @param {boolean} [options.bidirectional=true] - Also search from url2 to url1
   * @param {boolean} [options.includeMetadata=false] - Attach per-link metadata
   * @returns {Object|null} `{ path, direction[, metadata] }` or null if no path exists.
   *   `path` always runs from url1 to url2; `direction` is 'reverse' when the
   *   underlying links point from url2 towards url1.
   */
  getRelationshipPath(url1, url2, options = {}) {
    const normalizedUrl1 = normalizeUrl(url1);
    const normalizedUrl2 = normalizeUrl(url2);

    if (!this.nodes.has(normalizedUrl1) || !this.nodes.has(normalizedUrl2)) {
      return null;
    }

    // JSON-encode the parts so URLs containing '_' cannot produce colliding keys
    const cacheKey = `path_${JSON.stringify([normalizedUrl1, normalizedUrl2, options])}`;
    if (this.cache && this.cache.has(cacheKey)) {
      this.stats.cacheHits++;
      return this.cache.get(cacheKey);
    }

    const {
      maxDepth = 10,
      bidirectional = true,
      includeMetadata = false
    } = options;

    let result = null;
    const forward = this.findShortestPath(normalizedUrl1, normalizedUrl2, maxDepth);

    if (bidirectional) {
      // Try both directions and return the shortest path (forward wins ties)
      const backward = this.findShortestPath(normalizedUrl2, normalizedUrl1, maxDepth);

      if (forward && (!backward || forward.length <= backward.length)) {
        result = { path: forward, direction: 'forward' };
      } else if (backward) {
        result = { path: backward.reverse(), direction: 'reverse' };
      }
    } else if (forward) {
      result = { path: forward, direction: 'forward' };
    }

    if (result && includeMetadata) {
      result.metadata = this.getPathMetadata(result.path, result.direction);
    }

    // Cache results (null is cached too, so misses are not recomputed)
    if (this.cache) {
      this.setCacheEntry(cacheKey, result);
      this.stats.cacheMisses++;
    }

    return result;
  }

  /**
   * Find shortest path between two nodes using BFS
   * @param {string} start - Start URL
   * @param {string} end - End URL
   * @param {number} maxDepth - Maximum number of links in the returned path
   * @returns {Array|null} Path array or null
   */
  findShortestPath(start, end, maxDepth) {
    if (start === end) {
      return [start];
    }

    // Store one parent pointer per node instead of copying the path per queue entry
    const parents = new Map([[start, null]]);
    const depths = new Map([[start, 0]]);
    const queue = [start];

    for (let head = 0; head < queue.length; head++) {
      const current = queue[head];
      const depth = depths.get(current);

      // A path of depth + 1 nodes may not be extended past maxDepth
      if (depth + 1 > maxDepth) {
        continue;
      }

      for (const neighbor of this.getOutboundLinks(current)) {
        if (neighbor === end) {
          const path = [end];
          for (let at = current; at !== null; at = parents.get(at)) {
            path.push(at);
          }
          return path.reverse();
        }

        if (!parents.has(neighbor)) {
          parents.set(neighbor, current);
          depths.set(neighbor, depth + 1);
          queue.push(neighbor);
        }
      }
    }

    return null;
  }

  /**
   * Export graph in various formats.
   *
   * Importance values are the ones currently stored on the nodes, i.e. the
   * default importance until calculateImportance (or getStatistics) has run.
   *
   * @param {string} format - Export format ('json', 'dot', 'csv', 'adjacency')
   * @param {Object} options - Export options (includeMetadata, includeImportance, minImportance)
   * @returns {string|Object} Exported data
   * @throws {Error} When the format is not supported
   */
  exportGraph(format = 'json', options = {}) {
    const {
      includeMetadata = true,
      includeImportance = true,
      minImportance = 0
    } = options;

    switch (String(format).toLowerCase()) {
      case 'json':
        return this.exportJSON(includeMetadata, includeImportance, minImportance);
      case 'dot':
        return this.exportDOT(includeMetadata, includeImportance, minImportance);
      case 'csv':
        return this.exportCSV(includeMetadata, includeImportance, minImportance);
      case 'adjacency':
        return this.exportAdjacencyMatrix();
      default:
        throw new Error(`Unsupported export format: ${format}`);
    }
  }

  /**
   * Split a `from|to` link key back into its two URLs.
   *
   * A URL may itself contain '|', so a plain split is ambiguous. When the key
   * holds more than one '|', the split point is the first one whose two
   * halves form a link recorded in the adjacency sets.
   *
   * @param {string} linkKey - Key of `linkMetadata`
   * @returns {Array} `[from, to]`
   */
  splitLinkKey(linkKey) {
    const first = linkKey.indexOf('|');
    if (first === -1) {
      return [linkKey, undefined];
    }

    if (first !== linkKey.lastIndexOf('|')) {
      for (let at = first; at !== -1; at = linkKey.indexOf('|', at + 1)) {
        const from = linkKey.slice(0, at);
        const to = linkKey.slice(at + 1);
        if (this.outboundLinks.get(from)?.has(to)) {
          return [from, to];
        }
      }
    }

    return [linkKey.slice(0, first), linkKey.slice(first + 1)];
  }

  /**
   * Tell whether a link should be dropped because one of its endpoints is
   * below the importance threshold.
   */
  isLinkBelowImportance(from, to, minImportance) {
    const fromNode = this.nodes.get(from);
    const toNode = this.nodes.get(to);
    return Boolean(
      (fromNode && fromNode.importance < minImportance) ||
      (toNode && toNode.importance < minImportance)
    );
  }

  /**
   * Export graph as JSON
   * @returns {Object} `{ nodes, links, statistics, exportedAt }`
   */
  exportJSON(includeMetadata, includeImportance, minImportance) {
    const nodes = [];
    const links = [];

    // Export nodes
    for (const [url, nodeData] of this.nodes) {
      if (includeImportance && nodeData.importance < minImportance) {
        continue;
      }

      const node = {
        id: url,
        url: url,
        domain: nodeData.domain,
        path: nodeData.path
      };

      if (includeImportance) {
        node.importance = nodeData.importance;
      }

      if (includeMetadata) {
        node.metadata = nodeData.metadata;
        node.discovered = nodeData.discovered;
      }

      nodes.push(node);
    }

    // Export links
    for (const [linkKey, linkData] of this.linkMetadata) {
      const [from, to] = this.splitLinkKey(linkKey);

      if (includeImportance && this.isLinkBelowImportance(from, to, minImportance)) {
        continue;
      }

      const link = {
        source: from,
        target: to,
        count: linkData.count
      };

      if (includeMetadata) {
        link.metadata = {
          anchorText: linkData.anchorText,
          context: linkData.context,
          firstSeen: linkData.firstSeen,
          lastSeen: linkData.lastSeen
        };
      }

      links.push(link);
    }

    return {
      nodes,
      links,
      statistics: this.getStatistics(),
      exportedAt: new Date().toISOString()
    };
  }

  /**
   * Export graph in DOT format (Graphviz)
   * @returns {string} DOT source
   */
  exportDOT(includeMetadata, includeImportance, minImportance) {
    let dot = 'digraph LinkGraph {\n';
    dot += ' rankdir=LR;\n';
    dot += ' node [shape=ellipse];\n\n';

    // Add nodes
    for (const [url, nodeData] of this.nodes) {
      if (includeImportance && nodeData.importance < minImportance) {
        continue;
      }

      const nodeId = this.getDOTNodeId(url);
      const domain = nodeData.domain;
      const importance = includeImportance ? nodeData.importance.toFixed(3) : '';

      dot += ` ${nodeId} [label="${domain}${importance ? '\\n' + importance : ''}"];\n`;
    }

    dot += '\n';

    // Add edges
    for (const linkKey of this.linkMetadata.keys()) {
      const [from, to] = this.splitLinkKey(linkKey);

      if (includeImportance && this.isLinkBelowImportance(from, to, minImportance)) {
        continue;
      }

      dot += ` ${this.getDOTNodeId(from)} -> ${this.getDOTNodeId(to)};\n`;
    }

    dot += '}';
    return dot;
  }

  /**
   * Export graph as CSV (one row per link)
   * @returns {string} CSV text including a header row
   */
  exportCSV(includeMetadata, includeImportance, minImportance) {
    const headers = ['source', 'target', 'count'];
    if (includeImportance) {
      headers.push('source_importance', 'target_importance');
    }
    if (includeMetadata) {
      headers.push('anchor_text', 'first_seen', 'last_seen');
    }

    let csv = headers.join(',') + '\n';

    for (const [linkKey, linkData] of this.linkMetadata) {
      const [from, to] = this.splitLinkKey(linkKey);

      if (includeImportance && this.isLinkBelowImportance(from, to, minImportance)) {
        continue;
      }

      // URLs may legally contain commas and quotes, so they are escaped as well
      const row = [this.escapeCSV(from), this.escapeCSV(to), linkData.count];

      if (includeImportance) {
        const fromImportance = this.nodes.get(from)?.importance ?? 0;
        const toImportance = this.nodes.get(to)?.importance ?? 0;
        row.push(fromImportance.toFixed(4), toImportance.toFixed(4));
      }

      if (includeMetadata) {
        row.push(
          this.escapeCSV(linkData.anchorText || ''),
          linkData.firstSeen || '',
          linkData.lastSeen || ''
        );
      }

      csv += row.join(',') + '\n';
    }

    return csv;
  }

  /**
   * Export adjacency matrix
   * @returns {Object} `{ nodes, matrix, size }` where `matrix[i][j]` is 1 when
   *   `nodes[i]` links to `nodes[j]`; `nodes` is sorted
   */
  exportAdjacencyMatrix() {
    const nodes = Array.from(this.nodes.keys()).sort();
    const size = nodes.length;
    const matrix = Array.from({ length: size }, () => Array(size).fill(0));
    const nodeIndex = new Map(nodes.map((node, index) => [node, index]));

    for (const linkKey of this.linkMetadata.keys()) {
      const [from, to] = this.splitLinkKey(linkKey);
      const fromIndex = nodeIndex.get(from);
      const toIndex = nodeIndex.get(to);

      if (fromIndex !== undefined && toIndex !== undefined) {
        matrix[fromIndex][toIndex] = 1;
      }
    }

    return {
      nodes,
      matrix,
      size
    };
  }

  /**
   * Get comprehensive graph statistics.
   *
   * Runs calculateImportance and detectCycles (both cached), so node
   * importance values are refreshed as a side effect.
   *
   * @returns {Object} Counters from `this.stats` plus structural statistics
   */
  getStatistics() {
    const importance = this.calculateImportance();
    const cycles = this.detectCycles();
    const nodeTotal = this.nodes.size;
    const linkTotal = this.linkMetadata.size;

    const stats = {
      ...this.stats,
      nodes: nodeTotal,
      links: linkTotal,
      density: nodeTotal > 1 ? linkTotal / (nodeTotal * (nodeTotal - 1)) : 0,
      avgOutboundLinks: 0,
      avgInboundLinks: 0,
      maxOutboundLinks: 0,
      maxInboundLinks: 0,
      cycles: cycles.length,
      stronglyConnectedComponents: this.countStronglyConnectedComponents(),
      importanceDistribution: this.getImportanceDistribution(importance),
      domainDistribution: this.getDomainDistribution(),
      pathLengthDistribution: this.getPathLengthDistribution()
    };

    // Calculate link statistics
    let totalOutbound = 0;
    let totalInbound = 0;
    let maxOut = 0;
    let maxIn = 0;

    for (const url of this.nodes.keys()) {
      const outCount = this.outboundLinks.get(url)?.size ?? 0;
      const inCount = this.inboundLinks.get(url)?.size ?? 0;

      totalOutbound += outCount;
      totalInbound += inCount;
      maxOut = Math.max(maxOut, outCount);
      maxIn = Math.max(maxIn, inCount);
    }

    stats.avgOutboundLinks = nodeTotal > 0 ? totalOutbound / nodeTotal : 0;
    stats.avgInboundLinks = nodeTotal > 0 ? totalInbound / nodeTotal : 0;
    stats.maxOutboundLinks = maxOut;
    stats.maxInboundLinks = maxIn;

    return stats;
  }

  /**
   * Average link count over the links of a cycle.
   * @param {Array} cycle - Node sequence whose last entry repeats the first
   * @returns {number} Mean `count` of the links, or 0 when the sequence has no link
   */
  calculateCycleStrength(cycle) {
    const linkCount = cycle.length - 1;
    if (linkCount <= 0) {
      return 0;
    }

    let strength = 0;
    for (let i = 0; i < linkCount; i++) {
      const linkData = this.linkMetadata.get(`${cycle[i]}|${cycle[i + 1]}`);
      strength += linkData ? linkData.count : 1;
    }
    return strength / linkCount;
  }

  /**
   * Describe every consecutive pair of a node sequence.
   * @param {Array} sequence - Ordered URLs
   * @param {boolean} [reversed=false] - Look up each link as `next|current`
   *   instead of `current|next`
   * @returns {Array} One `{ from, to, anchorText, count }` entry per pair, in sequence order
   */
  describeHops(sequence, reversed = false) {
    const hops = [];
    for (let i = 0; i < sequence.length - 1; i++) {
      const from = sequence[i];
      const to = sequence[i + 1];
      const linkData = this.linkMetadata.get(reversed ? `${to}|${from}` : `${from}|${to}`);
      hops.push({
        from,
        to,
        anchorText: linkData?.anchorText,
        count: linkData?.count || 1
      });
    }
    return hops;
  }

  /**
   * Helper method to get cycle metadata
   * @param {Array} cycle - Node sequence whose last entry repeats the first
   * @returns {Array} Per-link `{ from, to, anchorText, count }`
   */
  getCycleMetadata(cycle) {
    return this.describeHops(cycle);
  }

  /**
   * Helper method to get path metadata
   * @param {Array} path - Ordered URLs
   * @param {string} [direction='forward'] - Pass 'reverse' when the underlying
   *   links point from each later entry to the earlier one
   * @returns {Array} Per-hop `{ from, to, anchorText, count }` in path order
   */
  getPathMetadata(path, direction = 'forward') {
    return this.describeHops(path, direction === 'reverse');
  }

  /**
   * Helper method to deduplicate cycles
   * @param {Array} cycles - Cycle objects with a `nodes` array
   * @returns {Array} First occurrence of every distinct cycle
   */
  deduplicateCycles(cycles) {
    const seen = new Set();
    return cycles.filter((cycle) => {
      const signature = this.normalizeCycle(cycle.nodes);
      if (seen.has(signature)) {
        return false;
      }
      seen.add(signature);
      return true;
    });
  }

  /**
   * Normalize cycle for deduplication
   * @param {Array} nodes - Node sequence whose last entry repeats the first
   * @returns {string} Signature that is identical for all rotations of the cycle
   */
  normalizeCycle(nodes) {
    // Find the lexicographically smallest node as starting point
    let minIndex = 0;
    for (let i = 1; i < nodes.length - 1; i++) {
      if (nodes[i] < nodes[minIndex]) {
        minIndex = i;
      }
    }

    // Rotate cycle to start with smallest node
    const normalized = [
      ...nodes.slice(minIndex, -1),
      ...nodes.slice(0, minIndex),
      nodes[minIndex]
    ];

    return normalized.join('->');
  }

  /**
   * Count strongly connected components using Tarjan's algorithm, written
   * with an explicit stack instead of recursion.
   * @returns {number} Number of strongly connected components
   */
  countStronglyConnectedComponents() {
    let index = 0;
    let componentCount = 0;
    const stack = [];
    const indices = new Map();
    const lowLinks = new Map();
    const onStack = new Set();

    // Assign discovery index and push the node; returns its traversal frame
    const open = (node) => {
      indices.set(node, index);
      lowLinks.set(node, index);
      index++;
      stack.push(node);
      onStack.add(node);
      const targets = this.outboundLinks.get(node);
      return { node, pending: targets ? targets.values() : [].values() };
    };

    for (const root of this.nodes.keys()) {
      if (indices.has(root)) {
        continue;
      }

      const frames = [open(root)];
      while (frames.length > 0) {
        const frame = frames[frames.length - 1];
        const step = frame.pending.next();

        if (!step.done) {
          const neighbor = step.value;
          if (!indices.has(neighbor)) {
            frames.push(open(neighbor));
          } else if (onStack.has(neighbor)) {
            lowLinks.set(frame.node, Math.min(lowLinks.get(frame.node), indices.get(neighbor)));
          }
          continue;
        }

        // Node finished: close its component if it is a root
        frames.pop();
        const { node } = frame;
        if (lowLinks.get(node) === indices.get(node)) {
          componentCount++;
          let member;
          do {
            member = stack.pop();
            onStack.delete(member);
          } while (member !== node);
        }

        // Propagate the low-link to the parent, as the recursive form does on return
        if (frames.length > 0) {
          const parent = frames[frames.length - 1].node;
          lowLinks.set(parent, Math.min(lowLinks.get(parent), lowLinks.get(node)));
        }
      }
    }

    return componentCount;
  }

  /**
   * Get importance distribution statistics
   * @param {Map} importanceMap - URL -> score
   * @returns {Object} `{ min, max, mean, median, stdDev }` (population standard deviation)
   */
  getImportanceDistribution(importanceMap) {
    const values = Array.from(importanceMap.values()).sort((a, b) => b - a);

    if (values.length === 0) {
      return { min: 0, max: 0, mean: 0, median: 0, stdDev: 0 };
    }

    const middle = Math.floor(values.length / 2);
    const min = values[values.length - 1];
    const max = values[0];
    const mean = values.reduce((sum, val) => sum + val, 0) / values.length;
    const median = values.length % 2 === 0 ?
      (values[middle - 1] + values[middle]) / 2 :
      values[middle];

    const variance = values.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / values.length;
    const stdDev = Math.sqrt(variance);

    return { min, max, mean, median, stdDev };
  }

  /**
   * Get domain distribution
   * @returns {Object} hostname -> number of nodes
   */
  getDomainDistribution() {
    const domains = new Map();
    for (const node of this.nodes.values()) {
      domains.set(node.domain, (domains.get(node.domain) || 0) + 1);
    }
    return Object.fromEntries(domains);
  }

  /**
   * Get path length distribution
   * @returns {Object} number of non-empty URL path segments -> number of nodes
   */
  getPathLengthDistribution() {
    const lengths = new Map();
    for (const node of this.nodes.values()) {
      const pathLength = node.path.split('/').filter((segment) => segment).length;
      lengths.set(pathLength, (lengths.get(pathLength) || 0) + 1);
    }
    return Object.fromEntries(lengths);
  }

  /**
   * Store a value in the result cache, evicting the oldest entry when a new
   * key would exceed `maxCacheSize`.
   */
  setCacheEntry(key, value) {
    if (!this.cache) return;

    // Overwriting an existing key does not grow the cache, so nothing is evicted
    if (!this.cache.has(key) && this.cache.size >= this.maxCacheSize) {
      const firstKey = this.cache.keys().next().value;
      this.cache.delete(firstKey);
    }

    this.cache.set(key, value);
  }

  /**
   * Drop every cached result that depends on the graph structure.
   */
  clearStructuralCaches() {
    if (!this.cache) return;

    for (const key of this.cache.keys()) {
      if (key.startsWith('cycles_') || key.startsWith('path_') || key.startsWith('importance_')) {
        this.cache.delete(key);
      }
    }
  }

  /**
   * Quote a URL for use as a DOT node identifier.
   * Backslashes are escaped before quotes so a trailing backslash cannot
   * swallow the closing quote.
   */
  getDOTNodeId(url) {
    return `"${url.replace(/\\/g, '\\\\').replace(/"/g, '\\"')}"`;
  }

  /**
   * Quote a CSV field when it contains a comma, quote or line break.
   * Non-string values are returned unchanged.
   */
  escapeCSV(value) {
    if (typeof value !== 'string') return value;
    if (/[",\r\n]/.test(value)) {
      return `"${value.replace(/"/g, '""')}"`;
    }
    return value;
  }

  /**
   * Clear all data
   */
  clear() {
    this.nodes.clear();
    this.outboundLinks.clear();
    this.inboundLinks.clear();
    this.linkMetadata.clear();
    this.cycleCache.clear();
    this.pathCache.clear();

    if (this.cache) {
      this.cache.clear();
    }

    this.stats = {
      nodesCount: 0,
      linksCount: 0,
      lastAnalysisTime: null,
      totalAnalyses: 0,
      cacheHits: 0,
      cacheMisses: 0
    };
  }

  /**
   * Merge another LinkAnalyzer into this one.
   *
   * Link counts are summed, `firstSeen` becomes the earliest and `lastSeen`
   * the latest of the two observations. Merging an analyzer into itself is a
   * no-op.
   *
   * @param {LinkAnalyzer} other - Graph to merge in
   * @returns {LinkAnalyzer} this
   * @throws {Error} When `other` is not a LinkAnalyzer
   */
  merge(other) {
    if (!(other instanceof LinkAnalyzer)) {
      throw new Error('Can only merge with another LinkAnalyzer instance');
    }
    if (other === this) {
      return this;
    }

    // Merge nodes
    for (const [url, nodeData] of other.nodes) {
      const existing = this.nodes.get(url);
      if (!existing) {
        this.nodes.set(url, { ...nodeData });
      } else if (new Date(nodeData.discovered) > new Date(existing.discovered)) {
        // Update with more recent data
        this.nodes.set(url, { ...existing, ...nodeData });
      }
    }

    // Merge links
    for (const [linkKey, linkData] of other.linkMetadata) {
      const [from, to] = other.splitLinkKey(linkKey);
      const mergedKey = `${normalizeUrl(from)}|${normalizeUrl(to)}`;
      const before = this.linkMetadata.get(mergedKey);

      if (!this.addLink(from, to, linkData)) {
        continue;
      }

      const merged = this.linkMetadata.get(mergedKey);
      if (!merged) {
        continue;
      }

      // addLink counts one observation stamped "now"; restore the real history.
      // Timestamps are ISO-8601 strings written by addLink, so they sort lexicographically.
      merged.count = (before?.count ?? 0) + (linkData.count ?? 1);
      const firstSeen = [before?.firstSeen, linkData.firstSeen].filter(Boolean).sort();
      if (firstSeen.length > 0) {
        merged.firstSeen = firstSeen[0];
      }
      const lastSeen = [before?.lastSeen, linkData.lastSeen].filter(Boolean).sort();
      if (lastSeen.length > 0) {
        merged.lastSeen = lastSeen[lastSeen.length - 1];
      }
    }

    this.stats.nodesCount = this.nodes.size;
    this.stats.linksCount = this.linkMetadata.size;

    // Nodes may have been added without any link, so invalidate explicitly
    this.clearStructuralCaches();

    return this;
  }
}
971
+
972
+ export default LinkAnalyzer;