crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,676 @@
|
|
|
1
|
+
import { CacheManager } from '../../../core/cache/CacheManager.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Advanced search result deduplication system using multiple similarity algorithms.
 *
 * Results are compared pairwise on three signals: the normalized URL, the
 * title, and a SimHash fingerprint of the result text. Two results are grouped
 * when any single signal, or the weighted combination of all three, reaches
 * its configured threshold. Each group is then collapsed into one primary
 * result annotated with a `deduplicationInfo` object.
 */
export class ResultDeduplicator {
  /**
   * @param {Object} [options] - Overrides for the defaults below. The nested
   *   groups (`thresholds`, `strategies`, `urlNormalization`,
   *   `contentSimilarity`, `mergeStrategy`) are merged key by key, so a partial
   *   group such as `{ thresholds: { url: 0.9 } }` keeps the other defaults.
   */
  constructor(options = {}) {
    const defaults = {
      // Similarity thresholds (each in the range 0..1)
      thresholds: {
        url: 0.8,       // URL similarity threshold
        title: 0.75,    // Title similarity threshold
        content: 0.7,   // Content similarity threshold
        combined: 0.6   // Combined similarity threshold for final decision
      },

      // Deduplication strategies.
      // No method in this class reads these flags; they are only echoed by getStats().
      strategies: {
        urlNormalization: true,
        titleFuzzy: true,
        contentSimhash: true,
        domainClustering: true
      },

      // URL normalization options
      urlNormalization: {
        removeProtocol: true,       // Remove http/https difference
        removeWww: true,            // Remove www prefix
        removeTrailingSlash: true,  // Remove trailing slashes
        removeDefaultPorts: true,   // Remove default ports (80, 443)
        sortQueryParams: true,      // Sort query parameters
        removeEmptyParams: true,    // Remove empty query parameters
        lowercaseDomain: true       // Convert domain to lowercase
      },

      // Content similarity options
      contentSimilarity: {
        minLength: 10,        // Minimum content length to compare
        ngramSize: 3,         // Not read by any method in this class
        simhashBits: 64,      // SimHash fingerprint length in bits
        hammingThreshold: 8   // Not read by any method in this class
      },

      // Merge strategy
      mergeStrategy: {
        preserveBestRank: true,   // Keep the best ranking result as primary
        combineMetadata: true,    // Combine metadata from duplicates
        preferHttps: true,        // Prefer HTTPS URLs when merging
        preferShorterUrl: true    // Prefer shorter, cleaner URLs
      },

      // Performance options
      cacheEnabled: true,
      cacheTTL: 3600000 // 1 hour
    };

    this.options = mergeOptionGroups(defaults, options);

    // Initialize cache for deduplication computation
    this.cache = this.options.cacheEnabled ?
      new CacheManager({ ttl: this.options.cacheTTL }) : null;

    // Statistics tracking
    this.stats = createEmptyStats();
  }

  /**
   * Deduplicate search results using multiple similarity algorithms.
   *
   * The input objects are not modified; every returned result is a copy that
   * additionally carries `originalIndex`, `normalizedUrl`, `contentHash`,
   * `titleTokens` and `deduplicationInfo`.
   *
   * @param {Array} results - Array of search results (`link` or `url`, `title`, `snippet`, ...)
   * @param {Object} [options] - Per-call overrides, merged like the constructor options
   * @returns {Promise<Array>} Deduplicated results. Inputs that are falsy or have
   *   at most one element are returned unchanged. If deduplication throws, the
   *   original results are returned with `deduplicationInfo.error` set.
   */
  async deduplicateResults(results, options = {}) {
    if (!results || results.length <= 1) {
      return results;
    }

    const dedupeOptions = mergeOptionGroups(this.options, options);
    this.stats.totalProcessed += results.length;

    // Generate cache key for deduplication computation
    const cacheKey = this.cache ? this.cache.generateKey('deduplication', {
      resultsHash: this.hashResults(results),
      options: dedupeOptions
    }) : null;

    // Check cache
    if (this.cache) {
      const cached = await this.cache.get(cacheKey);
      if (cached) {
        return cached.results;
      }
    }

    try {
      // Step 1: Pre-compute everything the pairwise comparison needs
      const normalizedResults = results.map((result, index) => ({
        ...result,
        originalIndex: index,
        normalizedUrl: this.normalizeUrl(result.link || result.url, dedupeOptions.urlNormalization),
        contentHash: this.computeContentHash(result, dedupeOptions.contentSimilarity),
        titleTokens: this.tokenizeTitle(result.title || ''),
        deduplicationInfo: {
          originalUrl: result.link || result.url,
          duplicateOf: null,
          duplicateReasons: [],
          merged: false
        }
      }));

      // Step 2: Find duplicate groups
      const duplicateGroups = this.findDuplicateGroups(normalizedResults, dedupeOptions);

      // Step 3: Merge duplicates within each group
      const mergedResults = this.mergeDuplicateGroups(duplicateGroups, dedupeOptions.mergeStrategy);

      // Step 4: Add deduplication metadata. mergeDuplicateGroups() returns one
      // result per group in group order, so the index identifies the group.
      const finalResults = mergedResults.map((result, groupIndex) => ({
        ...result,
        deduplicationInfo: {
          ...result.deduplicationInfo,
          totalDuplicatesFound: duplicateGroups[groupIndex].length - 1
        }
      }));

      // Update statistics
      this.stats.duplicatesFound += results.length - finalResults.length;

      // Cache the results
      if (this.cache) {
        await this.cache.set(cacheKey, {
          results: finalResults,
          stats: this.getDeduplicationStats(results.length, finalResults.length)
        });
      }

      return finalResults;
    } catch (error) {
      console.error('Deduplication failed:', error);
      // Return original results with error info
      return results.map(result => ({
        ...result,
        deduplicationInfo: {
          error: error.message,
          originalUrl: result.link || result.url,
          duplicateOf: null,
          duplicateReasons: [],
          merged: false,
          totalDuplicatesFound: 0
        }
      }));
    }
  }

  /**
   * Find groups of duplicate results.
   *
   * Grouping is greedy: each not-yet-grouped result seeds a group and every
   * later ungrouped result that duplicates the seed joins it. Members are
   * compared with the seed only, not with each other.
   *
   * @param {Array} results - Prepared results (see deduplicateResults step 1)
   * @param {Object} options - Effective deduplication options
   * @returns {Array<Array>} Groups in seed order; every result is in exactly one group
   */
  findDuplicateGroups(results, options) {
    const groups = [];
    const processed = new Set();

    for (let i = 0; i < results.length; i++) {
      if (processed.has(i)) continue;

      const seed = results[i];
      const currentGroup = [seed];
      processed.add(i);

      for (let j = i + 1; j < results.length; j++) {
        if (processed.has(j)) continue;

        const candidate = results[j];
        // Computed once and shared by the decision and the explanation below,
        // since the edit-distance comparisons are the expensive part.
        const similarities = this.computeSimilarities(seed, candidate, options);

        if (!this.areDuplicates(seed, candidate, options, similarities)) continue;

        currentGroup.push(candidate);
        processed.add(j);

        // Track duplicate reasons
        if (!candidate.deduplicationInfo) {
          candidate.deduplicationInfo = {};
        }
        candidate.deduplicationInfo.duplicateOf = seed.originalIndex;
        candidate.deduplicationInfo.duplicateReasons = this.getDuplicateReasons(
          seed, candidate, options, similarities
        );
      }

      groups.push(currentGroup);
    }

    return groups;
  }

  /**
   * Check if two results are duplicates.
   *
   * Signals are checked in the order URL, title, content, combined; the first
   * one that reaches its threshold decides, and the matching per-signal
   * counter in `this.stats` is incremented.
   *
   * @param {Object} result1
   * @param {Object} result2
   * @param {Object} options - Effective options (reads `options.thresholds`)
   * @param {Object} [similarities] - Pre-computed `computeSimilarities()` output
   * @returns {boolean}
   */
  areDuplicates(result1, result2, options, similarities) {
    const scores = similarities ?? this.computeSimilarities(result1, result2, options);
    const { thresholds } = options;

    // URL-based duplicate detection
    if (scores.url >= thresholds.url) {
      this.stats.urlDuplicates++;
      return true;
    }

    // Title-based duplicate detection
    if (scores.title >= thresholds.title) {
      this.stats.titleDuplicates++;
      return true;
    }

    // Content-based duplicate detection
    if (scores.content >= thresholds.content) {
      this.stats.contentDuplicates++;
      return true;
    }

    // Combined similarity score
    return this.computeCombinedSimilarity(scores) >= thresholds.combined;
  }

  /**
   * Compute similarity scores between two results.
   *
   * @returns {{url: number, title: number, content: number}} Scores in 0..1
   */
  computeSimilarities(result1, result2, options) {
    return {
      url: this.computeUrlSimilarity(result1, result2),
      title: this.computeTitleSimilarity(result1, result2),
      content: this.computeContentSimilarity(result1, result2, options.contentSimilarity)
    };
  }

  /**
   * Compute URL similarity from the pre-computed `normalizedUrl` fields.
   *
   * @returns {number} 1.0 for identical URLs, 0 when either URL is missing,
   *   otherwise `1 - editDistance / longerLength`
   */
  computeUrlSimilarity(result1, result2) {
    const url1 = result1.normalizedUrl;
    const url2 = result2.normalizedUrl;

    // Two results without a URL are not evidence of duplication (mirrors the
    // empty-title rule in computeTitleSimilarity).
    if (!url1 || !url2) return 0;
    if (url1 === url2) return 1.0;

    // NOTE(review): an edit-distance ratio scores URLs that differ only by a
    // short id (".../article/123" vs ".../article/124") close to 1.0, so the
    // URL threshold also groups distinct pages with near-identical paths.
    // Confirm that is intended before tightening or loosening it.
    return 1 - (this.editDistance(url1, url2) / Math.max(url1.length, url2.length));
  }

  /**
   * Compute title similarity using fuzzy matching.
   *
   * @returns {number} 0 when either title is empty, 1.0 when the lowercased
   *   titles are equal, otherwise 0.7 * Jaccard(tokens) + 0.3 * edit similarity
   */
  computeTitleSimilarity(result1, result2) {
    const title1 = (result1.title || '').toLowerCase().trim();
    const title2 = (result2.title || '').toLowerCase().trim();

    if (!title1 || !title2) return 0;
    if (title1 === title2) return 1.0;

    // Compute Jaccard similarity on title tokens
    const tokens1 = new Set(result1.titleTokens ?? this.tokenizeTitle(title1));
    const tokens2 = new Set(result2.titleTokens ?? this.tokenizeTitle(title2));

    const intersection = new Set([...tokens1].filter(x => tokens2.has(x)));
    const union = new Set([...tokens1, ...tokens2]);

    // Titles made only of punctuation or one-character words yield no tokens;
    // without this guard 0 / 0 would produce NaN.
    const jaccardSimilarity = union.size === 0 ? 0 : intersection.size / union.size;

    // Also compute edit distance similarity
    const editSimilarity = 1 - (this.editDistance(title1, title2) / Math.max(title1.length, title2.length));

    // Return weighted combination
    return (jaccardSimilarity * 0.7) + (editSimilarity * 0.3);
  }

  /**
   * Compute content similarity using SimHash.
   *
   * @param {Object} result1
   * @param {Object} result2
   * @param {Object} contentOptions - Reads `minLength` and `simhashBits`
   * @returns {number} 0 when either text is shorter than `minLength`, otherwise
   *   `1 - hammingDistance / simhashBits`, floored at 0
   */
  computeContentSimilarity(result1, result2, contentOptions) {
    const content1 = this.extractContent(result1);
    const content2 = this.extractContent(result2);

    if (content1.length < contentOptions.minLength || content2.length < contentOptions.minLength) {
      return 0;
    }

    // Use pre-computed content hashes when the results carry them
    const hash1 = result1.contentHash ?? this.computeContentHash(result1, contentOptions);
    const hash2 = result2.contentHash ?? this.computeContentHash(result2, contentOptions);

    // Compute Hamming distance between hashes
    const hammingDistance = this.hammingDistance(hash1, hash2);

    // Convert to similarity score
    const maxDistance = contentOptions.simhashBits;
    const similarity = 1 - (hammingDistance / maxDistance);

    return Math.max(0, similarity);
  }

  /**
   * Compute combined similarity score.
   *
   * @param {{url: number, title: number, content: number}} similarities
   * @returns {number} 0.4 * url + 0.35 * title + 0.25 * content
   */
  computeCombinedSimilarity(similarities) {
    // Weighted combination of similarity scores
    return (
      similarities.url * 0.4 +
      similarities.title * 0.35 +
      similarities.content * 0.25
    );
  }

  /**
   * Get reasons why two results are considered duplicates.
   *
   * Unlike areDuplicates(), every signal that reaches its threshold is listed,
   * not only the first one.
   *
   * @param {Object} result1
   * @param {Object} result2
   * @param {Object} options - Effective options (reads `options.thresholds`)
   * @param {Object} [similarities] - Pre-computed `computeSimilarities()` output
   * @returns {string[]} Human-readable reasons, e.g. "URL similarity: 100.0%"
   */
  getDuplicateReasons(result1, result2, options, similarities) {
    const scores = similarities ?? this.computeSimilarities(result1, result2, options);
    const { thresholds } = options;
    const reasons = [];

    if (scores.url >= thresholds.url) {
      reasons.push(`URL similarity: ${(scores.url * 100).toFixed(1)}%`);
    }

    if (scores.title >= thresholds.title) {
      reasons.push(`Title similarity: ${(scores.title * 100).toFixed(1)}%`);
    }

    if (scores.content >= thresholds.content) {
      reasons.push(`Content similarity: ${(scores.content * 100).toFixed(1)}%`);
    }

    const combined = this.computeCombinedSimilarity(scores);
    if (combined >= thresholds.combined) {
      reasons.push(`Combined similarity: ${(combined * 100).toFixed(1)}%`);
    }

    return reasons;
  }

  /**
   * Merge duplicate groups, keeping the best result as primary.
   *
   * The chosen primary object of each multi-member group is annotated in
   * place; `this.stats.merged` grows by the number of absorbed duplicates.
   *
   * @param {Array<Array>} groups - Output of findDuplicateGroups()
   * @param {Object} mergeStrategy - See the `mergeStrategy` option group
   * @returns {Array} One result per group, in group order
   */
  mergeDuplicateGroups(groups, mergeStrategy) {
    return groups.map(group => {
      if (group.length === 1) {
        return group[0]; // No duplicates to merge
      }

      // Find the best result in the group to keep as primary
      const primaryResult = this.selectPrimaryResult(group, mergeStrategy);
      const duplicates = group.filter(r => r !== primaryResult);

      // Merge metadata from duplicates
      if (mergeStrategy.combineMetadata) {
        this.mergeMetadata(primaryResult, duplicates);
      }

      // Update deduplication info
      if (!primaryResult.deduplicationInfo) {
        primaryResult.deduplicationInfo = {};
      }
      primaryResult.deduplicationInfo.merged = true;
      primaryResult.deduplicationInfo.mergedDuplicates = duplicates.length;
      // The original URL is recorded under deduplicationInfo, not on the
      // result itself; fall back to the raw fields for bare results.
      primaryResult.deduplicationInfo.duplicateUrls = duplicates.map(
        d => d.deduplicationInfo?.originalUrl ?? d.link ?? d.url
      );

      this.stats.merged += duplicates.length;

      return primaryResult;
    });
  }

  /**
   * Select the primary result from a duplicate group.
   *
   * Tie-break order: higher `finalScore` (or `rankingDetails.finalScore`),
   * then HTTPS over non-HTTPS, then shorter URL, then earlier original
   * position. The group array itself is not reordered.
   *
   * @param {Array} group - Non-empty duplicate group
   * @param {Object} mergeStrategy - See the `mergeStrategy` option group
   * @returns {Object} The group member to keep
   */
  selectPrimaryResult(group, mergeStrategy) {
    // Results may carry their address as `link` or `url`
    const addressOf = result => result.link || result.url || '';

    // Sort a copy so the caller's group keeps its order
    return [...group].sort((a, b) => {
      // 1. Prefer results with better ranking (if available)
      if (mergeStrategy.preserveBestRank) {
        const rankA = a.finalScore || a.rankingDetails?.finalScore || 0;
        const rankB = b.finalScore || b.rankingDetails?.finalScore || 0;
        if (rankA !== rankB) return rankB - rankA;
      }

      // 2. Prefer HTTPS URLs
      if (mergeStrategy.preferHttps) {
        const httpsA = addressOf(a).startsWith('https://') ? 1 : 0;
        const httpsB = addressOf(b).startsWith('https://') ? 1 : 0;
        if (httpsA !== httpsB) return httpsB - httpsA;
      }

      // 3. Prefer shorter URLs (often more canonical)
      if (mergeStrategy.preferShorterUrl) {
        const lengthA = addressOf(a).length;
        const lengthB = addressOf(b).length;
        if (lengthA !== lengthB) return lengthA - lengthB;
      }

      // 4. Prefer original order
      return (a.originalIndex ?? 0) - (b.originalIndex ?? 0);
    })[0];
  }

  /**
   * Merge metadata from duplicate results.
   *
   * Stores the distinct snippets, titles and URLs of the duplicates that
   * differ from the primary's own values under
   * `primaryResult.deduplicationInfo.alternate*`.
   *
   * @param {Object} primaryResult - Mutated in place
   * @param {Array} duplicates - The other members of the group
   */
  mergeMetadata(primaryResult, duplicates) {
    // Sets keep insertion order, so the primary's own value stays first
    const allSnippets = new Set([primaryResult.snippet || '']);
    const allTitles = new Set([primaryResult.title || '']);
    const allUrls = new Set([primaryResult.link || primaryResult.url || '']);

    for (const duplicate of duplicates) {
      const duplicateUrl = duplicate.link || duplicate.url;
      if (duplicate.snippet) allSnippets.add(duplicate.snippet);
      if (duplicate.title) allTitles.add(duplicate.title);
      if (duplicateUrl) allUrls.add(duplicateUrl);
    }

    // Store additional information
    if (!primaryResult.deduplicationInfo) {
      primaryResult.deduplicationInfo = {};
    }

    // Drop the first entry: it is the primary's own value
    primaryResult.deduplicationInfo.alternateSnippets = [...allSnippets].slice(1);
    primaryResult.deduplicationInfo.alternateTitles = [...allTitles].slice(1);
    primaryResult.deduplicationInfo.alternateUrls = [...allUrls].slice(1);
  }

  /**
   * Normalize URL for comparison.
   *
   * @param {string} url - Absolute URL
   * @param {Object} [normalizationOptions] - See the `urlNormalization` option
   *   group; defaults to this instance's configuration
   * @returns {string} Normalized URL, '' for a falsy input. Input that the URL
   *   parser rejects is lowercased and stripped of a leading
   *   `http(s)://` and `www.` instead.
   */
  normalizeUrl(url, normalizationOptions) {
    if (!url) return '';

    const rules = normalizationOptions ?? this.options.urlNormalization;

    try {
      // Parse URL
      const urlObj = new URL(url);

      // Remove www prefix
      if (rules.removeWww) {
        urlObj.hostname = urlObj.hostname.replace(/^www\./, '');
      }

      // Convert domain to lowercase
      if (rules.lowercaseDomain) {
        urlObj.hostname = urlObj.hostname.toLowerCase();
      }

      // Remove default ports
      if (rules.removeDefaultPorts) {
        if ((urlObj.protocol === 'http:' && urlObj.port === '80') ||
            (urlObj.protocol === 'https:' && urlObj.port === '443')) {
          urlObj.port = '';
        }
      }

      // Remove trailing slash (the root path stays "/")
      if (rules.removeTrailingSlash) {
        urlObj.pathname = urlObj.pathname.replace(/\/$/, '') || '/';
      }

      // Sort and clean query parameters
      if (rules.sortQueryParams || rules.removeEmptyParams) {
        let entries = Array.from(new URLSearchParams(urlObj.search).entries());

        if (rules.removeEmptyParams) {
          entries = entries.filter(([, value]) => value && value.trim());
        }

        if (rules.sortQueryParams) {
          // Array#sort is stable, so repeated names keep their value order
          entries.sort(([nameA], [nameB]) => (nameA < nameB ? -1 : nameA > nameB ? 1 : 0));
        }

        // append() rather than set(): "?tag=a&tag=b" must keep both values
        const rebuilt = new URLSearchParams();
        for (const [name, value] of entries) {
          rebuilt.append(name, value);
        }

        urlObj.search = rebuilt.toString();
      }

      const serialized = urlObj.toString();

      // Strip the scheme last, from the serialized form, so that http and
      // https variants of the same page normalize to the same string.
      return rules.removeProtocol ? serialized.replace(/^https?:\/\//, '') : serialized;
    } catch (error) {
      // If URL parsing fails, return cleaned original
      return String(url).toLowerCase().replace(/^https?:\/\/(www\.)?/, '');
    }
  }

  /**
   * Compute content hash using SimHash algorithm.
   *
   * @param {Object} result
   * @param {Object} contentOptions - Reads `minLength` and `simhashBits`
   * @returns {string} Binary fingerprint string, or '0' when the extracted
   *   text is shorter than `minLength`
   */
  computeContentHash(result, contentOptions) {
    const content = this.extractContent(result);

    if (content.length < contentOptions.minLength) {
      return '0'; // Default hash for short content
    }

    return this.simHash(content, contentOptions.simhashBits);
  }

  /**
   * Extract content from result for comparison.
   *
   * Joins title, snippet, htmlSnippet and displayLink, lowercases the text,
   * replaces everything that is not a letter, digit, underscore or whitespace
   * with a space, and collapses whitespace runs.
   *
   * @param {Object} result
   * @returns {string}
   */
  extractContent(result) {
    const parts = [
      result.title || '',
      result.snippet || '',
      result.htmlSnippet || '',
      result.displayLink || ''
    ];

    // \p{L}/\p{N} instead of \w so non-Latin text is not erased
    return parts.join(' ').toLowerCase().replace(/[^\p{L}\p{N}_\s]/gu, ' ').replace(/\s+/g, ' ').trim();
  }

  /**
   * Tokenize title for similarity comparison.
   *
   * @param {string} title
   * @returns {string[]} Lowercased words longer than one character
   */
  tokenizeTitle(title) {
    return title
      .toLowerCase()
      // \p{L}/\p{N} instead of \w so non-Latin titles still produce tokens
      .replace(/[^\p{L}\p{N}_\s]/gu, ' ')
      .split(/\s+/)
      .filter(token => token.length > 1);
  }

  /**
   * SimHash implementation for content similarity.
   *
   * Every whitespace-separated token votes +1 or -1 on each bit position
   * according to its hash; positions with a positive total become '1'.
   *
   * The first 32 positions come from stringHash(). JavaScript shift counts are
   * taken modulo 32, so a single 32-bit hash cannot supply further positions;
   * each additional 32-position word uses an independently seeded hash of the
   * token instead.
   *
   * @param {string} text
   * @param {number} [bits=64] - Fingerprint length
   * @returns {string} String of `bits` characters, each '0' or '1'
   */
  simHash(text, bits = 64) {
    const tokens = text.split(/\s+/);
    const votes = new Array(bits).fill(0);
    const wordCount = Math.ceil(bits / 32);

    for (const token of tokens) {
      for (let word = 0; word < wordCount; word++) {
        const hash = word === 0 ? this.stringHash(token) : seededTokenHash(token, word);
        const offset = word * 32;
        const width = Math.min(32, bits - offset);

        for (let bit = 0; bit < width; bit++) {
          votes[offset + bit] += ((hash >>> bit) & 1) ? 1 : -1;
        }
      }
    }

    // Convert to binary string
    return votes.map(vote => vote > 0 ? '1' : '0').join('');
  }

  /**
   * String hash function.
   *
   * @param {string} str
   * @returns {number} Signed 32-bit integer: h = h * 31 + charCode, wrapped
   */
  stringHash(str) {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      // (hash << 5) - hash === hash * 31; "| 0" wraps to a signed 32-bit integer
      hash = (((hash << 5) - hash) + str.charCodeAt(i)) | 0;
    }
    return hash;
  }

  /**
   * Compute Hamming distance between two binary strings.
   *
   * @param {string} hash1
   * @param {string} hash2
   * @returns {number} Number of differing positions; when the lengths differ,
   *   the longer length (i.e. maximally distant)
   */
  hammingDistance(hash1, hash2) {
    if (hash1.length !== hash2.length) return Math.max(hash1.length, hash2.length);

    let distance = 0;
    for (let i = 0; i < hash1.length; i++) {
      if (hash1[i] !== hash2[i]) distance++;
    }
    return distance;
  }

  /**
   * Compute edit distance (Levenshtein) between two strings.
   *
   * Only the previous and the current row of the distance table are kept.
   *
   * @param {string} str1
   * @param {string} str2
   * @returns {number} Minimum number of single-character insertions,
   *   deletions and substitutions turning one string into the other
   */
  editDistance(str1, str2) {
    if (str1 === str2) return 0;
    if (str1.length === 0) return str2.length;
    if (str2.length === 0) return str1.length;

    let previousRow = Array.from({ length: str1.length + 1 }, (_, j) => j);

    for (let i = 1; i <= str2.length; i++) {
      const currentRow = [i];

      for (let j = 1; j <= str1.length; j++) {
        if (str2.charAt(i - 1) === str1.charAt(j - 1)) {
          currentRow[j] = previousRow[j - 1];
        } else {
          currentRow[j] = Math.min(
            previousRow[j - 1] + 1, // substitution
            currentRow[j - 1] + 1,  // insertion
            previousRow[j] + 1      // deletion
          );
        }
      }

      previousRow = currentRow;
    }

    return previousRow[str1.length];
  }

  /**
   * Generate hash of results for caching.
   *
   * Covers URL, title and snippet of every result, so result lists that
   * differ only in their snippets do not share a cache entry.
   *
   * @param {Array} results
   * @returns {string} Decimal string of a 32-bit hash
   */
  hashResults(results) {
    const key = results
      .map(r => `${r.link || r.url || ''}\n${r.title || ''}\n${r.snippet || ''}`)
      .join('|');
    return this.stringHash(key).toString();
  }

  /**
   * Get deduplication statistics for one run.
   *
   * @param {number} originalCount
   * @param {number} finalCount
   * @returns {{originalCount: number, finalCount: number, duplicatesRemoved: number, deduplicationRate: string}}
   */
  getDeduplicationStats(originalCount, finalCount) {
    const duplicatesRemoved = originalCount - finalCount;
    // Avoid "NaN%" for an empty input
    const rate = originalCount > 0 ? (duplicatesRemoved / originalCount) * 100 : 0;

    return {
      originalCount,
      finalCount,
      duplicatesRemoved,
      deduplicationRate: rate.toFixed(1) + '%'
    };
  }

  /**
   * Get comprehensive statistics.
   *
   * @returns {Object} Running counters, cache statistics (null when caching is
   *   disabled) and the active configuration groups
   */
  getStats() {
    return {
      ...this.stats,
      cacheStats: this.cache ? this.cache.getStats() : null,
      configuration: {
        thresholds: this.options.thresholds,
        strategies: this.options.strategies,
        urlNormalization: this.options.urlNormalization,
        contentSimilarity: this.options.contentSimilarity,
        mergeStrategy: this.options.mergeStrategy
      }
    };
  }

  /**
   * Update similarity thresholds dynamically.
   *
   * @param {Object} newThresholds - Partial `thresholds` group; omitted keys keep their value
   */
  updateThresholds(newThresholds) {
    this.options.thresholds = { ...this.options.thresholds, ...newThresholds };
  }

  /**
   * Reset statistics.
   */
  resetStats() {
    this.stats = createEmptyStats();
  }
}

/**
 * Build a zeroed statistics record.
 */
function createEmptyStats() {
  return {
    totalProcessed: 0,
    duplicatesFound: 0,
    urlDuplicates: 0,
    titleDuplicates: 0,
    contentDuplicates: 0,
    merged: 0
  };
}

/**
 * Overlay `overrides` on `base`. Top-level keys are replaced; the known nested
 * option groups are merged key by key so partial groups keep their defaults.
 */
function mergeOptionGroups(base, overrides) {
  const nestedGroups = ['thresholds', 'strategies', 'urlNormalization', 'contentSimilarity', 'mergeStrategy'];
  const isPlainObject = value =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  const merged = { ...base, ...overrides };

  for (const group of nestedGroups) {
    const baseGroup = base[group];
    const overrideGroup = overrides?.[group];
    if (isPlainObject(baseGroup) && isPlainObject(overrideGroup)) {
      merged[group] = { ...baseGroup, ...overrideGroup };
    }
  }

  return merged;
}

/**
 * Seeded 32-bit token hash (FNV-1a style loop plus a final bit mixer) that
 * supplies SimHash bit positions beyond the first 32.
 */
function seededTokenHash(token, seed) {
  let hash = (0x811c9dc5 ^ Math.imul(seed, 0x9e3779b1)) | 0;

  for (let i = 0; i < token.length; i++) {
    hash ^= token.charCodeAt(i);
    hash = Math.imul(hash, 0x01000193);
  }

  // Final avalanche so every output bit depends on every input bit
  hash ^= hash >>> 16;
  hash = Math.imul(hash, 0x85ebca6b);
  hash ^= hash >>> 13;
  hash = Math.imul(hash, 0xc2b2ae35);
  hash ^= hash >>> 16;

  return hash | 0;
}
|
|
675
|
+
|
|
676
|
+
// Default export of the same class that is also exported by name above.
export default ResultDeduplicator;
|