crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,972 @@
|
|
|
1
|
+
import { URL } from 'url';
|
|
2
|
+
import { normalizeUrl } from '../../utils/urlNormalizer.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* LinkAnalyzer - Comprehensive link analysis system with graph builder
|
|
6
|
+
*
|
|
7
|
+
* Features:
|
|
8
|
+
* - Directed graph data structure for link relationships
|
|
9
|
+
* - Parent-child relationship tracking
|
|
10
|
+
* - Link importance calculation (simplified PageRank)
|
|
11
|
+
* - Circular reference detection and handling
|
|
12
|
+
* - Path analysis (shortest paths, common ancestors)
|
|
13
|
+
* - Graph export capabilities
|
|
14
|
+
* - Performance optimized for large link networks
|
|
15
|
+
*/
|
|
16
|
+
export class LinkAnalyzer {
|
|
17
|
+
constructor(options = {}) {
|
|
18
|
+
const {
|
|
19
|
+
dampingFactor = 0.85, // PageRank damping factor
|
|
20
|
+
maxIterations = 100, // Max PageRank iterations
|
|
21
|
+
convergenceThreshold = 0.0001, // PageRank convergence threshold
|
|
22
|
+
defaultImportance = 1.0, // Default node importance
|
|
23
|
+
enableCaching = true, // Enable calculation caching
|
|
24
|
+
maxCacheSize = 10000 // Max cache entries
|
|
25
|
+
} = options;
|
|
26
|
+
|
|
27
|
+
// Graph data structures
|
|
28
|
+
this.nodes = new Map(); // url -> node data
|
|
29
|
+
this.outboundLinks = new Map(); // url -> Set of outbound URLs
|
|
30
|
+
this.inboundLinks = new Map(); // url -> Set of inbound URLs
|
|
31
|
+
this.linkMetadata = new Map(); // `from|to` -> link metadata
|
|
32
|
+
|
|
33
|
+
// Analysis results cache
|
|
34
|
+
this.cache = enableCaching ? new Map() : null;
|
|
35
|
+
this.maxCacheSize = maxCacheSize;
|
|
36
|
+
|
|
37
|
+
// PageRank parameters
|
|
38
|
+
this.dampingFactor = dampingFactor;
|
|
39
|
+
this.maxIterations = maxIterations;
|
|
40
|
+
this.convergenceThreshold = convergenceThreshold;
|
|
41
|
+
this.defaultImportance = defaultImportance;
|
|
42
|
+
|
|
43
|
+
// Performance tracking
|
|
44
|
+
this.stats = {
|
|
45
|
+
nodesCount: 0,
|
|
46
|
+
linksCount: 0,
|
|
47
|
+
lastAnalysisTime: null,
|
|
48
|
+
totalAnalyses: 0,
|
|
49
|
+
cacheHits: 0,
|
|
50
|
+
cacheMisses: 0
|
|
51
|
+
};
|
|
52
|
+
|
|
53
|
+
// Cycle detection cache
|
|
54
|
+
this.cycleCache = new Map();
|
|
55
|
+
this.pathCache = new Map();
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/**
|
|
59
|
+
* Add a link to the graph
|
|
60
|
+
* @param {string} from - Source URL
|
|
61
|
+
* @param {string} to - Target URL
|
|
62
|
+
* @param {Object} metadata - Link metadata (anchor text, context, etc.)
|
|
63
|
+
*/
|
|
64
|
+
addLink(from, to, metadata = {}) {
|
|
65
|
+
const normalizedFrom = normalizeUrl(from);
|
|
66
|
+
const normalizedTo = normalizeUrl(to);
|
|
67
|
+
|
|
68
|
+
if (!normalizedFrom || !normalizedTo || normalizedFrom === normalizedTo) {
|
|
69
|
+
return false;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
// Initialize nodes if they don't exist
|
|
73
|
+
this.ensureNode(normalizedFrom);
|
|
74
|
+
this.ensureNode(normalizedTo);
|
|
75
|
+
|
|
76
|
+
// Add outbound link
|
|
77
|
+
if (!this.outboundLinks.has(normalizedFrom)) {
|
|
78
|
+
this.outboundLinks.set(normalizedFrom, new Set());
|
|
79
|
+
}
|
|
80
|
+
this.outboundLinks.get(normalizedFrom).add(normalizedTo);
|
|
81
|
+
|
|
82
|
+
// Add inbound link
|
|
83
|
+
if (!this.inboundLinks.has(normalizedTo)) {
|
|
84
|
+
this.inboundLinks.set(normalizedTo, new Set());
|
|
85
|
+
}
|
|
86
|
+
this.inboundLinks.get(normalizedTo).add(normalizedFrom);
|
|
87
|
+
|
|
88
|
+
// Store link metadata
|
|
89
|
+
const linkKey = `${normalizedFrom}|${normalizedTo}`;
|
|
90
|
+
const existingMetadata = this.linkMetadata.get(linkKey) || {};
|
|
91
|
+
this.linkMetadata.set(linkKey, {
|
|
92
|
+
...existingMetadata,
|
|
93
|
+
...metadata,
|
|
94
|
+
firstSeen: existingMetadata.firstSeen || new Date().toISOString(),
|
|
95
|
+
lastSeen: new Date().toISOString(),
|
|
96
|
+
count: (existingMetadata.count || 0) + 1
|
|
97
|
+
});
|
|
98
|
+
|
|
99
|
+
// Update statistics
|
|
100
|
+
this.stats.linksCount = this.linkMetadata.size;
|
|
101
|
+
|
|
102
|
+
// Clear caches that depend on graph structure
|
|
103
|
+
this.clearStructuralCaches();
|
|
104
|
+
|
|
105
|
+
return true;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/**
|
|
109
|
+
* Ensure a node exists in the graph
|
|
110
|
+
* @param {string} url - URL to ensure exists
|
|
111
|
+
*/
|
|
112
|
+
ensureNode(url) {
|
|
113
|
+
const normalizedUrl = normalizeUrl(url);
|
|
114
|
+
if (!this.nodes.has(normalizedUrl)) {
|
|
115
|
+
try {
|
|
116
|
+
const urlObj = new URL(normalizedUrl);
|
|
117
|
+
this.nodes.set(normalizedUrl, {
|
|
118
|
+
url: normalizedUrl,
|
|
119
|
+
domain: urlObj.hostname,
|
|
120
|
+
path: urlObj.pathname,
|
|
121
|
+
importance: this.defaultImportance,
|
|
122
|
+
depth: 0,
|
|
123
|
+
discovered: new Date().toISOString(),
|
|
124
|
+
metadata: {}
|
|
125
|
+
});
|
|
126
|
+
this.stats.nodesCount = this.nodes.size;
|
|
127
|
+
} catch (error) {
|
|
128
|
+
return false;
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
return true;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
/**
|
|
135
|
+
* Get all inbound links for a URL
|
|
136
|
+
* @param {string} url - Target URL
|
|
137
|
+
* @returns {Array} Array of source URLs
|
|
138
|
+
*/
|
|
139
|
+
getInboundLinks(url) {
|
|
140
|
+
const normalizedUrl = normalizeUrl(url);
|
|
141
|
+
const inbound = this.inboundLinks.get(normalizedUrl);
|
|
142
|
+
return inbound ? Array.from(inbound) : [];
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
/**
|
|
146
|
+
* Get all outbound links for a URL
|
|
147
|
+
* @param {string} url - Source URL
|
|
148
|
+
* @returns {Array} Array of target URLs
|
|
149
|
+
*/
|
|
150
|
+
getOutboundLinks(url) {
|
|
151
|
+
const normalizedUrl = normalizeUrl(url);
|
|
152
|
+
const outbound = this.outboundLinks.get(normalizedUrl);
|
|
153
|
+
return outbound ? Array.from(outbound) : [];
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* Calculate link importance using simplified PageRank algorithm
|
|
158
|
+
* @param {Object} options - Calculation options
|
|
159
|
+
* @returns {Map} Map of URL to importance score
|
|
160
|
+
*/
|
|
161
|
+
calculateImportance(options = {}) {
|
|
162
|
+
const cacheKey = 'importance_' + JSON.stringify(options);
|
|
163
|
+
if (this.cache && this.cache.has(cacheKey)) {
|
|
164
|
+
this.stats.cacheHits++;
|
|
165
|
+
return this.cache.get(cacheKey);
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
const startTime = Date.now();
|
|
169
|
+
const {
|
|
170
|
+
dampingFactor = this.dampingFactor,
|
|
171
|
+
maxIterations = this.maxIterations,
|
|
172
|
+
convergenceThreshold = this.convergenceThreshold
|
|
173
|
+
} = options;
|
|
174
|
+
|
|
175
|
+
const nodes = Array.from(this.nodes.keys());
|
|
176
|
+
const nodeCount = nodes.length;
|
|
177
|
+
|
|
178
|
+
if (nodeCount === 0) {
|
|
179
|
+
return new Map();
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
// Initialize PageRank values
|
|
183
|
+
let pageRank = new Map();
|
|
184
|
+
let newPageRank = new Map();
|
|
185
|
+
const initialValue = 1.0 / nodeCount;
|
|
186
|
+
|
|
187
|
+
for (const node of nodes) {
|
|
188
|
+
pageRank.set(node, initialValue);
|
|
189
|
+
newPageRank.set(node, initialValue);
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
let iteration = 0;
|
|
193
|
+
let hasConverged = false;
|
|
194
|
+
|
|
195
|
+
while (iteration < maxIterations && !hasConverged) {
|
|
196
|
+
hasConverged = true;
|
|
197
|
+
|
|
198
|
+
for (const node of nodes) {
|
|
199
|
+
let sum = 0;
|
|
200
|
+
const inboundNodes = this.getInboundLinks(node);
|
|
201
|
+
|
|
202
|
+
for (const inboundNode of inboundNodes) {
|
|
203
|
+
const outboundCount = this.getOutboundLinks(inboundNode).length;
|
|
204
|
+
if (outboundCount > 0) {
|
|
205
|
+
sum += pageRank.get(inboundNode) / outboundCount;
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
const newValue = (1 - dampingFactor) / nodeCount + dampingFactor * sum;
|
|
210
|
+
newPageRank.set(node, newValue);
|
|
211
|
+
|
|
212
|
+
// Check convergence
|
|
213
|
+
if (Math.abs(newValue - pageRank.get(node)) > convergenceThreshold) {
|
|
214
|
+
hasConverged = false;
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
// Swap maps for next iteration
|
|
219
|
+
[pageRank, newPageRank] = [newPageRank, pageRank];
|
|
220
|
+
iteration++;
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
// Update node importance scores
|
|
224
|
+
for (const [url, score] of pageRank) {
|
|
225
|
+
const node = this.nodes.get(url);
|
|
226
|
+
if (node) {
|
|
227
|
+
node.importance = score;
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
// Cache results
|
|
232
|
+
if (this.cache) {
|
|
233
|
+
this.setCacheEntry(cacheKey, pageRank);
|
|
234
|
+
this.stats.cacheMisses++;
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
this.stats.lastAnalysisTime = Date.now() - startTime;
|
|
238
|
+
this.stats.totalAnalyses++;
|
|
239
|
+
|
|
240
|
+
return pageRank;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
/**
|
|
244
|
+
* Detect circular reference chains in the graph
|
|
245
|
+
* @param {Object} options - Detection options
|
|
246
|
+
* @returns {Array} Array of cycle objects
|
|
247
|
+
*/
|
|
248
|
+
detectCycles(options = {}) {
|
|
249
|
+
const cacheKey = 'cycles_' + JSON.stringify(options);
|
|
250
|
+
if (this.cache && this.cache.has(cacheKey)) {
|
|
251
|
+
this.stats.cacheHits++;
|
|
252
|
+
return this.cache.get(cacheKey);
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
const {
|
|
256
|
+
maxCycleLength = 10,
|
|
257
|
+
includeMetadata = false
|
|
258
|
+
} = options;
|
|
259
|
+
|
|
260
|
+
const cycles = [];
|
|
261
|
+
const visited = new Set();
|
|
262
|
+
const recursionStack = new Set();
|
|
263
|
+
const path = [];
|
|
264
|
+
|
|
265
|
+
const dfs = (node) => {
|
|
266
|
+
if (recursionStack.has(node)) {
|
|
267
|
+
// Found a cycle
|
|
268
|
+
const cycleStart = path.indexOf(node);
|
|
269
|
+
if (cycleStart >= 0) {
|
|
270
|
+
const cycle = path.slice(cycleStart);
|
|
271
|
+
cycle.push(node); // Complete the cycle
|
|
272
|
+
|
|
273
|
+
if (cycle.length <= maxCycleLength) {
|
|
274
|
+
const cycleObj = {
|
|
275
|
+
nodes: cycle,
|
|
276
|
+
length: cycle.length - 1, // Don't count repeated node
|
|
277
|
+
strength: this.calculateCycleStrength(cycle)
|
|
278
|
+
};
|
|
279
|
+
|
|
280
|
+
if (includeMetadata) {
|
|
281
|
+
cycleObj.metadata = this.getCycleMetadata(cycle);
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
cycles.push(cycleObj);
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
return;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
if (visited.has(node)) {
|
|
291
|
+
return;
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
visited.add(node);
|
|
295
|
+
recursionStack.add(node);
|
|
296
|
+
path.push(node);
|
|
297
|
+
|
|
298
|
+
const outbound = this.getOutboundLinks(node);
|
|
299
|
+
for (const neighbor of outbound) {
|
|
300
|
+
dfs(neighbor);
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
recursionStack.delete(node);
|
|
304
|
+
path.pop();
|
|
305
|
+
};
|
|
306
|
+
|
|
307
|
+
// Start DFS from each unvisited node
|
|
308
|
+
for (const node of this.nodes.keys()) {
|
|
309
|
+
if (!visited.has(node)) {
|
|
310
|
+
dfs(node);
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
// Remove duplicate cycles
|
|
315
|
+
const uniqueCycles = this.deduplicateCycles(cycles);
|
|
316
|
+
|
|
317
|
+
// Cache results
|
|
318
|
+
if (this.cache) {
|
|
319
|
+
this.setCacheEntry(cacheKey, uniqueCycles);
|
|
320
|
+
this.stats.cacheMisses++;
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
return uniqueCycles;
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
/**
|
|
327
|
+
* Find relationship path between two URLs
|
|
328
|
+
* @param {string} url1 - Starting URL
|
|
329
|
+
* @param {string} url2 - Target URL
|
|
330
|
+
* @param {Object} options - Path finding options
|
|
331
|
+
* @returns {Object|null} Path object or null if no path exists
|
|
332
|
+
*/
|
|
333
|
+
getRelationshipPath(url1, url2, options = {}) {
|
|
334
|
+
const normalizedUrl1 = normalizeUrl(url1);
|
|
335
|
+
const normalizedUrl2 = normalizeUrl(url2);
|
|
336
|
+
|
|
337
|
+
if (!this.nodes.has(normalizedUrl1) || !this.nodes.has(normalizedUrl2)) {
|
|
338
|
+
return null;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
const cacheKey = `path_${normalizedUrl1}_${normalizedUrl2}_${JSON.stringify(options)}`;
|
|
342
|
+
if (this.cache && this.cache.has(cacheKey)) {
|
|
343
|
+
this.stats.cacheHits++;
|
|
344
|
+
return this.cache.get(cacheKey);
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
const {
|
|
348
|
+
maxDepth = 10,
|
|
349
|
+
bidirectional = true,
|
|
350
|
+
includeMetadata = false
|
|
351
|
+
} = options;
|
|
352
|
+
|
|
353
|
+
let result = null;
|
|
354
|
+
|
|
355
|
+
if (bidirectional) {
|
|
356
|
+
// Try both directions and return the shortest path
|
|
357
|
+
const path1to2 = this.findShortestPath(normalizedUrl1, normalizedUrl2, maxDepth);
|
|
358
|
+
const path2to1 = this.findShortestPath(normalizedUrl2, normalizedUrl1, maxDepth);
|
|
359
|
+
|
|
360
|
+
if (path1to2 && path2to1) {
|
|
361
|
+
result = path1to2.length <= path2to1.length ?
|
|
362
|
+
{ path: path1to2, direction: 'forward' } :
|
|
363
|
+
{ path: path2to1.reverse(), direction: 'reverse' };
|
|
364
|
+
} else if (path1to2) {
|
|
365
|
+
result = { path: path1to2, direction: 'forward' };
|
|
366
|
+
} else if (path2to1) {
|
|
367
|
+
result = { path: path2to1.reverse(), direction: 'reverse' };
|
|
368
|
+
}
|
|
369
|
+
} else {
|
|
370
|
+
const path = this.findShortestPath(normalizedUrl1, normalizedUrl2, maxDepth);
|
|
371
|
+
if (path) {
|
|
372
|
+
result = { path, direction: 'forward' };
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
if (result && includeMetadata) {
|
|
377
|
+
result.metadata = this.getPathMetadata(result.path);
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
// Cache results
|
|
381
|
+
if (this.cache) {
|
|
382
|
+
this.setCacheEntry(cacheKey, result);
|
|
383
|
+
this.stats.cacheMisses++;
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
return result;
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
/**
|
|
390
|
+
* Find shortest path between two nodes using BFS
|
|
391
|
+
* @param {string} start - Start URL
|
|
392
|
+
* @param {string} end - End URL
|
|
393
|
+
* @param {number} maxDepth - Maximum search depth
|
|
394
|
+
* @returns {Array|null} Path array or null
|
|
395
|
+
*/
|
|
396
|
+
findShortestPath(start, end, maxDepth) {
|
|
397
|
+
if (start === end) {
|
|
398
|
+
return [start];
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
const queue = [[start]];
|
|
402
|
+
const visited = new Set([start]);
|
|
403
|
+
|
|
404
|
+
while (queue.length > 0) {
|
|
405
|
+
const path = queue.shift();
|
|
406
|
+
const current = path[path.length - 1];
|
|
407
|
+
|
|
408
|
+
if (path.length > maxDepth) {
|
|
409
|
+
continue;
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
const neighbors = this.getOutboundLinks(current);
|
|
413
|
+
for (const neighbor of neighbors) {
|
|
414
|
+
if (neighbor === end) {
|
|
415
|
+
return [...path, neighbor];
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
if (!visited.has(neighbor)) {
|
|
419
|
+
visited.add(neighbor);
|
|
420
|
+
queue.push([...path, neighbor]);
|
|
421
|
+
}
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
return null;
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
/**
|
|
429
|
+
* Export graph in various formats
|
|
430
|
+
* @param {string} format - Export format ('json', 'dot', 'csv', 'adjacency')
|
|
431
|
+
* @param {Object} options - Export options
|
|
432
|
+
* @returns {string|Object} Exported data
|
|
433
|
+
*/
|
|
434
|
+
exportGraph(format = 'json', options = {}) {
|
|
435
|
+
const {
|
|
436
|
+
includeMetadata = true,
|
|
437
|
+
includeImportance = true,
|
|
438
|
+
minImportance = 0
|
|
439
|
+
} = options;
|
|
440
|
+
|
|
441
|
+
switch (format.toLowerCase()) {
|
|
442
|
+
case 'json':
|
|
443
|
+
return this.exportJSON(includeMetadata, includeImportance, minImportance);
|
|
444
|
+
case 'dot':
|
|
445
|
+
return this.exportDOT(includeMetadata, includeImportance, minImportance);
|
|
446
|
+
case 'csv':
|
|
447
|
+
return this.exportCSV(includeMetadata, includeImportance, minImportance);
|
|
448
|
+
case 'adjacency':
|
|
449
|
+
return this.exportAdjacencyMatrix();
|
|
450
|
+
default:
|
|
451
|
+
throw new Error(`Unsupported export format: ${format}`);
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
/**
|
|
456
|
+
* Export graph as JSON
|
|
457
|
+
*/
|
|
458
|
+
exportJSON(includeMetadata, includeImportance, minImportance) {
|
|
459
|
+
const nodes = [];
|
|
460
|
+
const links = [];
|
|
461
|
+
|
|
462
|
+
// Export nodes
|
|
463
|
+
for (const [url, nodeData] of this.nodes) {
|
|
464
|
+
if (includeImportance && nodeData.importance < minImportance) {
|
|
465
|
+
continue;
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
const node = {
|
|
469
|
+
id: url,
|
|
470
|
+
url: url,
|
|
471
|
+
domain: nodeData.domain,
|
|
472
|
+
path: nodeData.path
|
|
473
|
+
};
|
|
474
|
+
|
|
475
|
+
if (includeImportance) {
|
|
476
|
+
node.importance = nodeData.importance;
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
if (includeMetadata) {
|
|
480
|
+
node.metadata = nodeData.metadata;
|
|
481
|
+
node.discovered = nodeData.discovered;
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
nodes.push(node);
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
// Export links
|
|
488
|
+
for (const [linkKey, linkData] of this.linkMetadata) {
|
|
489
|
+
const [from, to] = linkKey.split('|');
|
|
490
|
+
|
|
491
|
+
if (includeImportance) {
|
|
492
|
+
const fromNode = this.nodes.get(from);
|
|
493
|
+
const toNode = this.nodes.get(to);
|
|
494
|
+
if ((fromNode && fromNode.importance < minImportance) ||
|
|
495
|
+
(toNode && toNode.importance < minImportance)) {
|
|
496
|
+
continue;
|
|
497
|
+
}
|
|
498
|
+
}
|
|
499
|
+
|
|
500
|
+
const link = {
|
|
501
|
+
source: from,
|
|
502
|
+
target: to,
|
|
503
|
+
count: linkData.count
|
|
504
|
+
};
|
|
505
|
+
|
|
506
|
+
if (includeMetadata) {
|
|
507
|
+
link.metadata = {
|
|
508
|
+
anchorText: linkData.anchorText,
|
|
509
|
+
context: linkData.context,
|
|
510
|
+
firstSeen: linkData.firstSeen,
|
|
511
|
+
lastSeen: linkData.lastSeen
|
|
512
|
+
};
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
links.push(link);
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
return {
|
|
519
|
+
nodes,
|
|
520
|
+
links,
|
|
521
|
+
statistics: this.getStatistics(),
|
|
522
|
+
exportedAt: new Date().toISOString()
|
|
523
|
+
};
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
/**
|
|
527
|
+
* Export graph in DOT format (Graphviz)
|
|
528
|
+
*/
|
|
529
|
+
exportDOT(includeMetadata, includeImportance, minImportance) {
|
|
530
|
+
let dot = 'digraph LinkGraph {\n';
|
|
531
|
+
dot += ' rankdir=LR;\n';
|
|
532
|
+
dot += ' node [shape=ellipse];\n\n';
|
|
533
|
+
|
|
534
|
+
// Add nodes
|
|
535
|
+
for (const [url, nodeData] of this.nodes) {
|
|
536
|
+
if (includeImportance && nodeData.importance < minImportance) {
|
|
537
|
+
continue;
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
const nodeId = this.getDOTNodeId(url);
|
|
541
|
+
const domain = nodeData.domain;
|
|
542
|
+
const importance = includeImportance ? nodeData.importance.toFixed(3) : '';
|
|
543
|
+
|
|
544
|
+
dot += ` ${nodeId} [label="${domain}${importance ? '\\n' + importance : ''}"];\n`;
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
dot += '\n';
|
|
548
|
+
|
|
549
|
+
// Add edges
|
|
550
|
+
for (const [linkKey] of this.linkMetadata) {
|
|
551
|
+
const [from, to] = linkKey.split('|');
|
|
552
|
+
|
|
553
|
+
if (includeImportance) {
|
|
554
|
+
const fromNode = this.nodes.get(from);
|
|
555
|
+
const toNode = this.nodes.get(to);
|
|
556
|
+
if ((fromNode && fromNode.importance < minImportance) ||
|
|
557
|
+
(toNode && toNode.importance < minImportance)) {
|
|
558
|
+
continue;
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
const fromId = this.getDOTNodeId(from);
|
|
563
|
+
const toId = this.getDOTNodeId(to);
|
|
564
|
+
dot += ` ${fromId} -> ${toId};\n`;
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
dot += '}';
|
|
568
|
+
return dot;
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
/**
|
|
572
|
+
* Export graph as CSV
|
|
573
|
+
*/
|
|
574
|
+
exportCSV(includeMetadata, includeImportance, minImportance) {
|
|
575
|
+
const headers = ['source', 'target', 'count'];
|
|
576
|
+
if (includeImportance) {
|
|
577
|
+
headers.push('source_importance', 'target_importance');
|
|
578
|
+
}
|
|
579
|
+
if (includeMetadata) {
|
|
580
|
+
headers.push('anchor_text', 'first_seen', 'last_seen');
|
|
581
|
+
}
|
|
582
|
+
|
|
583
|
+
let csv = headers.join(',') + '\n';
|
|
584
|
+
|
|
585
|
+
for (const [linkKey, linkData] of this.linkMetadata) {
|
|
586
|
+
const [from, to] = linkKey.split('|');
|
|
587
|
+
|
|
588
|
+
if (includeImportance) {
|
|
589
|
+
const fromNode = this.nodes.get(from);
|
|
590
|
+
const toNode = this.nodes.get(to);
|
|
591
|
+
if ((fromNode && fromNode.importance < minImportance) ||
|
|
592
|
+
(toNode && toNode.importance < minImportance)) {
|
|
593
|
+
continue;
|
|
594
|
+
}
|
|
595
|
+
}
|
|
596
|
+
|
|
597
|
+
const row = [from, to, linkData.count];
|
|
598
|
+
|
|
599
|
+
if (includeImportance) {
|
|
600
|
+
const fromImportance = this.nodes.get(from)?.importance || 0;
|
|
601
|
+
const toImportance = this.nodes.get(to)?.importance || 0;
|
|
602
|
+
row.push(fromImportance.toFixed(4), toImportance.toFixed(4));
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
if (includeMetadata) {
|
|
606
|
+
row.push(
|
|
607
|
+
this.escapeCSV(linkData.anchorText || ''),
|
|
608
|
+
linkData.firstSeen || '',
|
|
609
|
+
linkData.lastSeen || ''
|
|
610
|
+
);
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
csv += row.join(',') + '\n';
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
return csv;
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
/**
|
|
620
|
+
* Export adjacency matrix
|
|
621
|
+
*/
|
|
622
|
+
exportAdjacencyMatrix() {
|
|
623
|
+
const nodes = Array.from(this.nodes.keys()).sort();
|
|
624
|
+
const size = nodes.length;
|
|
625
|
+
const matrix = Array(size).fill(null).map(() => Array(size).fill(0));
|
|
626
|
+
|
|
627
|
+
const nodeIndex = new Map();
|
|
628
|
+
nodes.forEach((node, index) => {
|
|
629
|
+
nodeIndex.set(node, index);
|
|
630
|
+
});
|
|
631
|
+
|
|
632
|
+
for (const [linkKey] of this.linkMetadata) {
|
|
633
|
+
const [from, to] = linkKey.split('|');
|
|
634
|
+
const fromIndex = nodeIndex.get(from);
|
|
635
|
+
const toIndex = nodeIndex.get(to);
|
|
636
|
+
|
|
637
|
+
if (fromIndex !== undefined && toIndex !== undefined) {
|
|
638
|
+
matrix[fromIndex][toIndex] = 1;
|
|
639
|
+
}
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
return {
|
|
643
|
+
nodes,
|
|
644
|
+
matrix,
|
|
645
|
+
size
|
|
646
|
+
};
|
|
647
|
+
}
|
|
648
|
+
|
|
649
|
+
/**
|
|
650
|
+
* Get comprehensive graph statistics
|
|
651
|
+
*/
|
|
652
|
+
getStatistics() {
|
|
653
|
+
const importance = this.calculateImportance();
|
|
654
|
+
const cycles = this.detectCycles();
|
|
655
|
+
|
|
656
|
+
const stats = {
|
|
657
|
+
...this.stats,
|
|
658
|
+
nodes: this.nodes.size,
|
|
659
|
+
links: this.linkMetadata.size,
|
|
660
|
+
density: this.nodes.size > 1 ?
|
|
661
|
+
(this.linkMetadata.size / (this.nodes.size * (this.nodes.size - 1))) : 0,
|
|
662
|
+
avgOutboundLinks: 0,
|
|
663
|
+
avgInboundLinks: 0,
|
|
664
|
+
maxOutboundLinks: 0,
|
|
665
|
+
maxInboundLinks: 0,
|
|
666
|
+
cycles: cycles.length,
|
|
667
|
+
stronglyConnectedComponents: this.countStronglyConnectedComponents(),
|
|
668
|
+
importanceDistribution: this.getImportanceDistribution(importance),
|
|
669
|
+
domainDistribution: this.getDomainDistribution(),
|
|
670
|
+
pathLengthDistribution: this.getPathLengthDistribution()
|
|
671
|
+
};
|
|
672
|
+
|
|
673
|
+
// Calculate link statistics
|
|
674
|
+
let totalOutbound = 0;
|
|
675
|
+
let totalInbound = 0;
|
|
676
|
+
let maxOut = 0;
|
|
677
|
+
let maxIn = 0;
|
|
678
|
+
|
|
679
|
+
for (const node of this.nodes.keys()) {
|
|
680
|
+
const outCount = this.getOutboundLinks(node).length;
|
|
681
|
+
const inCount = this.getInboundLinks(node).length;
|
|
682
|
+
|
|
683
|
+
totalOutbound += outCount;
|
|
684
|
+
totalInbound += inCount;
|
|
685
|
+
maxOut = Math.max(maxOut, outCount);
|
|
686
|
+
maxIn = Math.max(maxIn, inCount);
|
|
687
|
+
}
|
|
688
|
+
|
|
689
|
+
stats.avgOutboundLinks = this.nodes.size > 0 ? totalOutbound / this.nodes.size : 0;
|
|
690
|
+
stats.avgInboundLinks = this.nodes.size > 0 ? totalInbound / this.nodes.size : 0;
|
|
691
|
+
stats.maxOutboundLinks = maxOut;
|
|
692
|
+
stats.maxInboundLinks = maxIn;
|
|
693
|
+
|
|
694
|
+
return stats;
|
|
695
|
+
}
|
|
696
|
+
|
|
697
|
+
/**
|
|
698
|
+
* Helper method to calculate cycle strength
|
|
699
|
+
*/
|
|
700
|
+
calculateCycleStrength(cycle) {
|
|
701
|
+
let strength = 0;
|
|
702
|
+
for (let i = 0; i < cycle.length - 1; i++) {
|
|
703
|
+
const linkKey = `${cycle[i]}|${cycle[i + 1]}`;
|
|
704
|
+
const linkData = this.linkMetadata.get(linkKey);
|
|
705
|
+
strength += linkData ? linkData.count : 1;
|
|
706
|
+
}
|
|
707
|
+
return strength / (cycle.length - 1);
|
|
708
|
+
}
|
|
709
|
+
|
|
710
|
+
/**
|
|
711
|
+
* Helper method to get cycle metadata
|
|
712
|
+
*/
|
|
713
|
+
getCycleMetadata(cycle) {
|
|
714
|
+
const metadata = [];
|
|
715
|
+
for (let i = 0; i < cycle.length - 1; i++) {
|
|
716
|
+
const linkKey = `${cycle[i]}|${cycle[i + 1]}`;
|
|
717
|
+
const linkData = this.linkMetadata.get(linkKey);
|
|
718
|
+
metadata.push({
|
|
719
|
+
from: cycle[i],
|
|
720
|
+
to: cycle[i + 1],
|
|
721
|
+
anchorText: linkData?.anchorText,
|
|
722
|
+
count: linkData?.count || 1
|
|
723
|
+
});
|
|
724
|
+
}
|
|
725
|
+
return metadata;
|
|
726
|
+
}
|
|
727
|
+
|
|
728
|
+
/**
|
|
729
|
+
* Helper method to get path metadata
|
|
730
|
+
*/
|
|
731
|
+
getPathMetadata(path) {
|
|
732
|
+
const metadata = [];
|
|
733
|
+
for (let i = 0; i < path.length - 1; i++) {
|
|
734
|
+
const linkKey = `${path[i]}|${path[i + 1]}`;
|
|
735
|
+
const linkData = this.linkMetadata.get(linkKey);
|
|
736
|
+
metadata.push({
|
|
737
|
+
from: path[i],
|
|
738
|
+
to: path[i + 1],
|
|
739
|
+
anchorText: linkData?.anchorText,
|
|
740
|
+
count: linkData?.count || 1
|
|
741
|
+
});
|
|
742
|
+
}
|
|
743
|
+
return metadata;
|
|
744
|
+
}
|
|
745
|
+
|
|
746
|
+
/**
|
|
747
|
+
* Helper method to deduplicate cycles
|
|
748
|
+
*/
|
|
749
|
+
deduplicateCycles(cycles) {
|
|
750
|
+
const seen = new Set();
|
|
751
|
+
return cycles.filter(cycle => {
|
|
752
|
+
const normalized = this.normalizeCycle(cycle.nodes);
|
|
753
|
+
if (seen.has(normalized)) {
|
|
754
|
+
return false;
|
|
755
|
+
}
|
|
756
|
+
seen.add(normalized);
|
|
757
|
+
return true;
|
|
758
|
+
});
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
/**
|
|
762
|
+
* Normalize cycle for deduplication
|
|
763
|
+
*/
|
|
764
|
+
normalizeCycle(nodes) {
|
|
765
|
+
// Find the lexicographically smallest node as starting point
|
|
766
|
+
let minIndex = 0;
|
|
767
|
+
for (let i = 1; i < nodes.length - 1; i++) {
|
|
768
|
+
if (nodes[i] < nodes[minIndex]) {
|
|
769
|
+
minIndex = i;
|
|
770
|
+
}
|
|
771
|
+
}
|
|
772
|
+
|
|
773
|
+
// Rotate cycle to start with smallest node
|
|
774
|
+
const normalized = [
|
|
775
|
+
...nodes.slice(minIndex, -1),
|
|
776
|
+
...nodes.slice(0, minIndex),
|
|
777
|
+
nodes[minIndex]
|
|
778
|
+
];
|
|
779
|
+
|
|
780
|
+
return normalized.join('->');
|
|
781
|
+
}
|
|
782
|
+
|
|
783
|
+
/**
|
|
784
|
+
* Count strongly connected components using Tarjan's algorithm
|
|
785
|
+
*/
|
|
786
|
+
countStronglyConnectedComponents() {
|
|
787
|
+
let index = 0;
|
|
788
|
+
let componentCount = 0;
|
|
789
|
+
const stack = [];
|
|
790
|
+
const indices = new Map();
|
|
791
|
+
const lowLinks = new Map();
|
|
792
|
+
const onStack = new Set();
|
|
793
|
+
|
|
794
|
+
const strongConnect = (node) => {
|
|
795
|
+
indices.set(node, index);
|
|
796
|
+
lowLinks.set(node, index);
|
|
797
|
+
index++;
|
|
798
|
+
stack.push(node);
|
|
799
|
+
onStack.add(node);
|
|
800
|
+
|
|
801
|
+
const neighbors = this.getOutboundLinks(node);
|
|
802
|
+
for (const neighbor of neighbors) {
|
|
803
|
+
if (!indices.has(neighbor)) {
|
|
804
|
+
strongConnect(neighbor);
|
|
805
|
+
lowLinks.set(node, Math.min(lowLinks.get(node), lowLinks.get(neighbor)));
|
|
806
|
+
} else if (onStack.has(neighbor)) {
|
|
807
|
+
lowLinks.set(node, Math.min(lowLinks.get(node), indices.get(neighbor)));
|
|
808
|
+
}
|
|
809
|
+
}
|
|
810
|
+
|
|
811
|
+
if (lowLinks.get(node) === indices.get(node)) {
|
|
812
|
+
componentCount++;
|
|
813
|
+
let component;
|
|
814
|
+
do {
|
|
815
|
+
component = stack.pop();
|
|
816
|
+
onStack.delete(component);
|
|
817
|
+
} while (component !== node);
|
|
818
|
+
}
|
|
819
|
+
};
|
|
820
|
+
|
|
821
|
+
for (const node of this.nodes.keys()) {
|
|
822
|
+
if (!indices.has(node)) {
|
|
823
|
+
strongConnect(node);
|
|
824
|
+
}
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
return componentCount;
|
|
828
|
+
}
|
|
829
|
+
|
|
830
|
+
/**
|
|
831
|
+
* Get importance distribution statistics
|
|
832
|
+
*/
|
|
833
|
+
getImportanceDistribution(importanceMap) {
|
|
834
|
+
const values = Array.from(importanceMap.values()).sort((a, b) => b - a);
|
|
835
|
+
|
|
836
|
+
if (values.length === 0) {
|
|
837
|
+
return { min: 0, max: 0, mean: 0, median: 0, stdDev: 0 };
|
|
838
|
+
}
|
|
839
|
+
|
|
840
|
+
const min = values[values.length - 1];
|
|
841
|
+
const max = values[0];
|
|
842
|
+
const mean = values.reduce((sum, val) => sum + val, 0) / values.length;
|
|
843
|
+
const median = values.length % 2 === 0 ?
|
|
844
|
+
(values[Math.floor(values.length / 2) - 1] + values[Math.floor(values.length / 2)]) / 2 :
|
|
845
|
+
values[Math.floor(values.length / 2)];
|
|
846
|
+
|
|
847
|
+
const variance = values.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / values.length;
|
|
848
|
+
const stdDev = Math.sqrt(variance);
|
|
849
|
+
|
|
850
|
+
return { min, max, mean, median, stdDev };
|
|
851
|
+
}
|
|
852
|
+
|
|
853
|
+
/**
|
|
854
|
+
* Get domain distribution
|
|
855
|
+
*/
|
|
856
|
+
getDomainDistribution() {
|
|
857
|
+
const domains = new Map();
|
|
858
|
+
for (const node of this.nodes.values()) {
|
|
859
|
+
domains.set(node.domain, (domains.get(node.domain) || 0) + 1);
|
|
860
|
+
}
|
|
861
|
+
return Object.fromEntries(domains);
|
|
862
|
+
}
|
|
863
|
+
|
|
864
|
+
/**
|
|
865
|
+
* Get path length distribution
|
|
866
|
+
*/
|
|
867
|
+
getPathLengthDistribution() {
|
|
868
|
+
const lengths = new Map();
|
|
869
|
+
for (const node of this.nodes.values()) {
|
|
870
|
+
const pathLength = node.path.split('/').filter(s => s).length;
|
|
871
|
+
lengths.set(pathLength, (lengths.get(pathLength) || 0) + 1);
|
|
872
|
+
}
|
|
873
|
+
return Object.fromEntries(lengths);
|
|
874
|
+
}
|
|
875
|
+
|
|
876
|
+
/**
|
|
877
|
+
* Helper methods for caching
|
|
878
|
+
*/
|
|
879
|
+
setCacheEntry(key, value) {
|
|
880
|
+
if (!this.cache) return;
|
|
881
|
+
|
|
882
|
+
if (this.cache.size >= this.maxCacheSize) {
|
|
883
|
+
const firstKey = this.cache.keys().next().value;
|
|
884
|
+
this.cache.delete(firstKey);
|
|
885
|
+
}
|
|
886
|
+
|
|
887
|
+
this.cache.set(key, value);
|
|
888
|
+
}
|
|
889
|
+
|
|
890
|
+
clearStructuralCaches() {
|
|
891
|
+
if (!this.cache) return;
|
|
892
|
+
|
|
893
|
+
for (const key of this.cache.keys()) {
|
|
894
|
+
if (key.startsWith('cycles_') || key.startsWith('path_') || key.startsWith('importance_')) {
|
|
895
|
+
this.cache.delete(key);
|
|
896
|
+
}
|
|
897
|
+
}
|
|
898
|
+
}
|
|
899
|
+
|
|
900
|
+
/**
|
|
901
|
+
* Helper methods for export formats
|
|
902
|
+
*/
|
|
903
|
+
getDOTNodeId(url) {
|
|
904
|
+
return `"${url.replace(/"/g, '\\"')}"`;
|
|
905
|
+
}
|
|
906
|
+
|
|
907
|
+
escapeCSV(value) {
|
|
908
|
+
if (typeof value !== 'string') return value;
|
|
909
|
+
if (value.includes(',') || value.includes('"') || value.includes('\n')) {
|
|
910
|
+
return `"${value.replace(/"/g, '""')}"`;
|
|
911
|
+
}
|
|
912
|
+
return value;
|
|
913
|
+
}
|
|
914
|
+
|
|
915
|
+
/**
|
|
916
|
+
* Clear all data
|
|
917
|
+
*/
|
|
918
|
+
clear() {
|
|
919
|
+
this.nodes.clear();
|
|
920
|
+
this.outboundLinks.clear();
|
|
921
|
+
this.inboundLinks.clear();
|
|
922
|
+
this.linkMetadata.clear();
|
|
923
|
+
|
|
924
|
+
if (this.cache) {
|
|
925
|
+
this.cache.clear();
|
|
926
|
+
}
|
|
927
|
+
|
|
928
|
+
this.stats = {
|
|
929
|
+
nodesCount: 0,
|
|
930
|
+
linksCount: 0,
|
|
931
|
+
lastAnalysisTime: null,
|
|
932
|
+
totalAnalyses: 0,
|
|
933
|
+
cacheHits: 0,
|
|
934
|
+
cacheMisses: 0
|
|
935
|
+
};
|
|
936
|
+
}
|
|
937
|
+
|
|
938
|
+
/**
|
|
939
|
+
* Merge another LinkAnalyzer into this one
|
|
940
|
+
*/
|
|
941
|
+
merge(other) {
|
|
942
|
+
if (!(other instanceof LinkAnalyzer)) {
|
|
943
|
+
throw new Error('Can only merge with another LinkAnalyzer instance');
|
|
944
|
+
}
|
|
945
|
+
|
|
946
|
+
// Merge nodes
|
|
947
|
+
for (const [url, nodeData] of other.nodes) {
|
|
948
|
+
if (!this.nodes.has(url)) {
|
|
949
|
+
this.nodes.set(url, { ...nodeData });
|
|
950
|
+
} else {
|
|
951
|
+
// Update with more recent data
|
|
952
|
+
const existing = this.nodes.get(url);
|
|
953
|
+
if (new Date(nodeData.discovered) > new Date(existing.discovered)) {
|
|
954
|
+
this.nodes.set(url, { ...existing, ...nodeData });
|
|
955
|
+
}
|
|
956
|
+
}
|
|
957
|
+
}
|
|
958
|
+
|
|
959
|
+
// Merge links
|
|
960
|
+
for (const [linkKey, linkData] of other.linkMetadata) {
|
|
961
|
+
const [from, to] = linkKey.split('|');
|
|
962
|
+
this.addLink(from, to, linkData);
|
|
963
|
+
}
|
|
964
|
+
|
|
965
|
+
this.stats.nodesCount = this.nodes.size;
|
|
966
|
+
this.stats.linksCount = this.linkMetadata.size;
|
|
967
|
+
|
|
968
|
+
return this;
|
|
969
|
+
}
|
|
970
|
+
}
|
|
971
|
+
|
|
972
|
+
export default LinkAnalyzer;
|