crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,676 @@
1
+ import { CacheManager } from '../../../core/cache/CacheManager.js';
2
+
3
/**
 * Advanced search result deduplication system using multiple similarity algorithms.
 *
 * Results are compared pairwise on three signals — normalized URL, title and a
 * SimHash fingerprint of the textual content — and grouped when any single
 * signal, or their weighted combination, reaches its threshold. Each group is
 * then collapsed into one "primary" result that carries metadata about the
 * duplicates it absorbed.
 *
 * NOTE(review): `options.strategies` is kept for configuration/reporting but is
 * not consulted by any method of this class; every strategy is always applied.
 */
export class ResultDeduplicator {
  /**
   * Build a fresh copy of the default configuration.
   * A new object is returned on every call so instances never share nested state.
   * @returns {Object} Default options
   */
  static defaultOptions() {
    return {
      // Similarity thresholds
      thresholds: {
        url: 0.8, // URL similarity threshold
        title: 0.75, // Title similarity threshold
        content: 0.7, // Content similarity threshold
        combined: 0.6 // Combined similarity threshold for final decision
      },

      // Deduplication strategies
      strategies: {
        urlNormalization: true, // Normalize URLs for comparison
        titleFuzzy: true, // Use fuzzy title matching
        contentSimhash: true, // Use SimHash for content comparison
        domainClustering: true // Cluster results by domain
      },

      // URL normalization options
      urlNormalization: {
        removeProtocol: true, // Remove http/https difference
        removeWww: true, // Remove www prefix
        removeTrailingSlash: true, // Remove trailing slashes
        removeDefaultPorts: true, // Remove default ports (80, 443)
        sortQueryParams: true, // Sort query parameters
        removeEmptyParams: true, // Remove empty query parameters
        lowercaseDomain: true // Convert domain to lowercase
      },

      // Content similarity options
      contentSimilarity: {
        minLength: 10, // Minimum content length to compare
        ngramSize: 3, // N-gram size for comparison
        simhashBits: 64, // SimHash bit size
        hammingThreshold: 8 // Hamming distance threshold for SimHash
      },

      // Merge strategy
      mergeStrategy: {
        preserveBestRank: true, // Keep the best ranking result as primary
        combineMetadata: true, // Combine metadata from duplicates
        preferHttps: true, // Prefer HTTPS URLs when merging
        preferShorterUrl: true // Prefer shorter, cleaner URLs
      },

      // Performance options
      cacheEnabled: true,
      cacheTTL: 3600000 // 1 hour
    };
  }

  /**
   * Build the zeroed statistics record used by the constructor and resetStats().
   * @returns {Object} Fresh statistics counters
   */
  static createEmptyStats() {
    return {
      totalProcessed: 0,
      duplicatesFound: 0,
      urlDuplicates: 0,
      titleDuplicates: 0,
      contentDuplicates: 0,
      merged: 0
    };
  }

  /**
   * @param {Object} [options] - Overrides for the defaults. Nested groups
   *   (thresholds, strategies, urlNormalization, contentSimilarity,
   *   mergeStrategy) may be given partially; missing keys keep their defaults.
   */
  constructor(options = {}) {
    this.options = this.mergeOptions(ResultDeduplicator.defaultOptions(), options);

    // Initialize cache for deduplication computation
    this.cache = this.options.cacheEnabled
      ? new CacheManager({ ttl: this.options.cacheTTL })
      : null;

    // Statistics tracking
    this.stats = ResultDeduplicator.createEmptyStats();
  }

  /**
   * Merge option overrides onto a base configuration, one level deep.
   * A plain spread would replace a whole nested group when the caller only
   * wanted to change one key of it (e.g. `{ thresholds: { url: 0.9 } }`).
   * @param {Object} base - Complete configuration to start from
   * @param {Object} [overrides] - Partial configuration
   * @returns {Object} New merged configuration (inputs are not mutated)
   */
  mergeOptions(base, overrides = {}) {
    const nestedGroups = [
      'thresholds',
      'strategies',
      'urlNormalization',
      'contentSimilarity',
      'mergeStrategy'
    ];
    const merged = { ...base, ...overrides };

    for (const group of nestedGroups) {
      const override = overrides?.[group];
      merged[group] = override && typeof override === 'object'
        ? { ...base[group], ...override }
        : base[group];
    }

    return merged;
  }

  /**
   * Deduplicate search results using multiple similarity algorithms
   * @param {Array} results - Array of search results
   * @param {Object} options - Deduplication options (merged over instance options)
   * @returns {Promise<Array>} Deduplicated results. On internal failure the
   *   original results are returned, each annotated with the error message.
   */
  async deduplicateResults(results, options = {}) {
    if (!results || results.length <= 1) {
      return results;
    }

    const dedupeOptions = this.mergeOptions(this.options, options);
    this.stats.totalProcessed += results.length;

    // Generate cache key for deduplication computation
    const cacheKey = this.cache
      ? this.cache.generateKey('deduplication', {
        resultsHash: this.hashResults(results),
        options: dedupeOptions
      })
      : null;

    // Check cache; a failing cache is treated as a miss rather than an error
    if (this.cache) {
      try {
        const cached = await this.cache.get(cacheKey);
        if (cached) {
          return cached.results;
        }
      } catch (cacheError) {
        console.warn('Deduplication cache read failed:', cacheError);
      }
    }

    try {
      // Step 1: Normalize URLs and precompute comparison data for all results
      const normalizedResults = results.map((result, index) => ({
        ...result,
        originalIndex: index,
        normalizedUrl: this.normalizeUrl(result.link || result.url, dedupeOptions.urlNormalization),
        contentHash: this.computeContentHash(result, dedupeOptions.contentSimilarity),
        titleTokens: this.tokenizeTitle(result.title || ''),
        deduplicationInfo: {
          originalUrl: result.link || result.url,
          duplicateOf: null,
          duplicateReasons: [],
          merged: false
        }
      }));

      // Step 2: Find duplicate groups
      const duplicateGroups = this.findDuplicateGroups(normalizedResults, dedupeOptions);

      // Step 3: Merge duplicates within each group
      const mergedResults = this.mergeDuplicateGroups(duplicateGroups, dedupeOptions.mergeStrategy);

      // Step 4: Add deduplication metadata. mergedResults[i] is the survivor
      // of duplicateGroups[i], so the group size is available by position.
      const finalResults = mergedResults.map((result, groupIndex) => ({
        ...result,
        deduplicationInfo: {
          ...result.deduplicationInfo,
          totalDuplicatesFound: duplicateGroups[groupIndex].length - 1
        }
      }));

      // Update statistics
      this.stats.duplicatesFound += results.length - finalResults.length;

      // Cache the results; a failed write must not discard the computed answer
      if (this.cache) {
        try {
          await this.cache.set(cacheKey, {
            results: finalResults,
            stats: this.getDeduplicationStats(results.length, finalResults.length)
          });
        } catch (cacheError) {
          console.warn('Deduplication cache write failed:', cacheError);
        }
      }

      return finalResults;
    } catch (error) {
      console.error('Deduplication failed:', error);
      // Return original results with error info
      return results.map(result => ({
        ...result,
        deduplicationInfo: {
          error: error.message,
          originalUrl: result.link || result.url,
          duplicateOf: null,
          duplicateReasons: [],
          merged: false,
          totalDuplicatesFound: 0
        }
      }));
    }
  }

  /**
   * Find groups of duplicate results.
   * Each not-yet-grouped result seeds a group and absorbs every later result
   * that matches the seed; matching is against the seed only, not transitive.
   * @param {Array} results - Prepared results (see deduplicateResults step 1)
   * @param {Object} options - Full deduplication options
   * @returns {Array<Array>} Groups in order of their seed result
   */
  findDuplicateGroups(results, options) {
    const groups = [];
    const processed = new Set();

    for (let i = 0; i < results.length; i++) {
      if (processed.has(i)) continue;

      const currentGroup = [results[i]];
      processed.add(i);

      // Find all duplicates of current result
      for (let j = i + 1; j < results.length; j++) {
        if (processed.has(j)) continue;

        if (this.areDuplicates(results[i], results[j], options)) {
          currentGroup.push(results[j]);
          processed.add(j);

          // Track duplicate reasons
          results[j].deduplicationInfo.duplicateOf = results[i].originalIndex;
          results[j].deduplicationInfo.duplicateReasons = this.getDuplicateReasons(
            results[i], results[j], options
          );
        }
      }

      groups.push(currentGroup);
    }

    return groups;
  }

  /**
   * Check if two results are duplicates.
   * Side effect: increments the per-signal counter in this.stats for the first
   * signal that matches (the combined score has no counter).
   * @param {Object} result1 - Prepared result
   * @param {Object} result2 - Prepared result
   * @param {Object} options - Full deduplication options
   * @returns {boolean} True when any threshold is reached
   */
  areDuplicates(result1, result2, options) {
    const similarities = this.computeSimilarities(result1, result2, options);

    // URL-based duplicate detection
    if (similarities.url >= options.thresholds.url) {
      this.stats.urlDuplicates++;
      return true;
    }

    // Title-based duplicate detection
    if (similarities.title >= options.thresholds.title) {
      this.stats.titleDuplicates++;
      return true;
    }

    // Content-based duplicate detection
    if (similarities.content >= options.thresholds.content) {
      this.stats.contentDuplicates++;
      return true;
    }

    // Combined similarity score
    return this.computeCombinedSimilarity(similarities) >= options.thresholds.combined;
  }

  /**
   * Compute similarity scores between two results
   * @returns {{url: number, title: number, content: number}} Scores in [0, 1]
   */
  computeSimilarities(result1, result2, options) {
    return {
      url: this.computeUrlSimilarity(result1, result2),
      title: this.computeTitleSimilarity(result1, result2),
      content: this.computeContentSimilarity(result1, result2, options.contentSimilarity)
    };
  }

  /**
   * Compute URL similarity from the precomputed `normalizedUrl` fields.
   *
   * NOTE(review): similarity is edit-distance based, so URLs that differ in only
   * a few characters (e.g. ".../page1" vs ".../page2") score above the default
   * 0.8 threshold — confirm this is intended.
   * @returns {number} 1.0 for identical URLs, 0 when either URL is missing
   */
  computeUrlSimilarity(result1, result2) {
    const url1 = result1.normalizedUrl || '';
    const url2 = result2.normalizedUrl || '';

    // A missing URL carries no evidence; two missing URLs must not match.
    if (!url1 || !url2) return 0;
    if (url1 === url2) return 1.0;

    // Use edit distance for URL comparison
    return 1 - (this.editDistance(url1, url2) / Math.max(url1.length, url2.length));
  }

  /**
   * Compute title similarity using fuzzy matching:
   * 70% Jaccard overlap of title tokens + 30% normalized edit distance.
   * @returns {number} Score in [0, 1]; 0 when either title is empty
   */
  computeTitleSimilarity(result1, result2) {
    const title1 = (result1.title || '').toLowerCase().trim();
    const title2 = (result2.title || '').toLowerCase().trim();

    if (!title1 || !title2) return 0;
    if (title1 === title2) return 1.0;

    // Compute Jaccard similarity on title tokens (tokenize on demand when the
    // result was not prepared by deduplicateResults)
    const tokens1 = new Set(result1.titleTokens ?? this.tokenizeTitle(title1));
    const tokens2 = new Set(result2.titleTokens ?? this.tokenizeTitle(title2));

    const sharedCount = [...tokens1].filter(token => tokens2.has(token)).length;
    const unionSize = new Set([...tokens1, ...tokens2]).size;

    // Titles made only of punctuation/single characters yield no tokens;
    // guard the 0/0 case so NaN never reaches the threshold comparisons.
    const jaccardSimilarity = unionSize === 0 ? 0 : sharedCount / unionSize;

    // Also compute edit distance similarity
    const editSimilarity = 1 - (this.editDistance(title1, title2) / Math.max(title1.length, title2.length));

    // Return weighted combination
    return (jaccardSimilarity * 0.7) + (editSimilarity * 0.3);
  }

  /**
   * Compute content similarity using SimHash fingerprints.
   * @param {Object} result1 - Result, ideally with a precomputed `contentHash`
   * @param {Object} result2 - Result, ideally with a precomputed `contentHash`
   * @param {Object} [contentOptions] - `minLength` and `simhashBits` are read
   * @returns {number} Score in [0, 1]; 0 when either content is too short
   */
  computeContentSimilarity(result1, result2, contentOptions = this.options.contentSimilarity) {
    const content1 = this.extractContent(result1);
    const content2 = this.extractContent(result2);

    if (content1.length < contentOptions.minLength || content2.length < contentOptions.minLength) {
      return 0;
    }

    // Use pre-computed content hashes when available
    const hash1 = result1.contentHash ?? this.simHash(content1, contentOptions.simhashBits);
    const hash2 = result2.contentHash ?? this.simHash(content2, contentOptions.simhashBits);

    // Convert Hamming distance to a similarity score
    const distance = this.hammingDistance(hash1, hash2);
    return Math.max(0, 1 - (distance / contentOptions.simhashBits));
  }

  /**
   * Compute combined similarity score (URL 40%, title 35%, content 25%)
   */
  computeCombinedSimilarity(similarities) {
    return (
      similarities.url * 0.4 +
      similarities.title * 0.35 +
      similarities.content * 0.25
    );
  }

  /**
   * Get reasons why two results are considered duplicates
   * @returns {string[]} One human-readable entry per threshold that was reached
   */
  getDuplicateReasons(result1, result2, options) {
    const similarities = this.computeSimilarities(result1, result2, options);
    const combined = this.computeCombinedSimilarity(similarities);
    const percent = value => (value * 100).toFixed(1);
    const reasons = [];

    if (similarities.url >= options.thresholds.url) {
      reasons.push(`URL similarity: ${percent(similarities.url)}%`);
    }

    if (similarities.title >= options.thresholds.title) {
      reasons.push(`Title similarity: ${percent(similarities.title)}%`);
    }

    if (similarities.content >= options.thresholds.content) {
      reasons.push(`Content similarity: ${percent(similarities.content)}%`);
    }

    if (combined >= options.thresholds.combined) {
      reasons.push(`Combined similarity: ${percent(combined)}%`);
    }

    return reasons;
  }

  /**
   * Merge duplicate groups, keeping the best result as primary.
   * The returned array is positionally aligned with `groups`.
   * @param {Array<Array>} groups - Output of findDuplicateGroups
   * @param {Object} [mergeStrategy] - Merge preferences
   * @returns {Array} One surviving result per group
   */
  mergeDuplicateGroups(groups, mergeStrategy = this.options.mergeStrategy) {
    return groups.map(group => {
      if (group.length === 1) {
        return group[0]; // No duplicates to merge
      }

      // Find the best result in the group to keep as primary
      const primaryResult = this.selectPrimaryResult(group, mergeStrategy);
      const duplicates = group.filter(candidate => candidate !== primaryResult);

      // Merge metadata from duplicates
      if (mergeStrategy.combineMetadata) {
        this.mergeMetadata(primaryResult, duplicates);
      }

      if (!primaryResult.deduplicationInfo) {
        primaryResult.deduplicationInfo = {};
      }

      // Update deduplication info. The original URL is stored under
      // deduplicationInfo; fall back to the raw fields for unprepared results.
      primaryResult.deduplicationInfo.merged = true;
      primaryResult.deduplicationInfo.mergedDuplicates = duplicates.length;
      primaryResult.deduplicationInfo.duplicateUrls = duplicates.map(
        duplicate => duplicate.deduplicationInfo?.originalUrl ?? duplicate.link ?? duplicate.url
      );

      this.stats.merged += duplicates.length;

      return primaryResult;
    });
  }

  /**
   * Select the primary result from a duplicate group.
   * Order of preference: higher ranking score, HTTPS, shorter URL, original order.
   * The group array itself is left unmodified.
   */
  selectPrimaryResult(group, mergeStrategy = this.options.mergeStrategy) {
    const urlOf = result => result.link || result.url || '';
    const scoreOf = result => result.finalScore || result.rankingDetails?.finalScore || 0;

    const ranked = [...group].sort((a, b) => {
      // 1. Prefer results with better ranking (if available)
      if (mergeStrategy.preserveBestRank) {
        const rankA = scoreOf(a);
        const rankB = scoreOf(b);
        if (rankA !== rankB) return rankB - rankA;
      }

      // 2. Prefer HTTPS URLs
      if (mergeStrategy.preferHttps) {
        const httpsA = urlOf(a).startsWith('https://') ? 1 : 0;
        const httpsB = urlOf(b).startsWith('https://') ? 1 : 0;
        if (httpsA !== httpsB) return httpsB - httpsA;
      }

      // 3. Prefer shorter URLs (often more canonical)
      if (mergeStrategy.preferShorterUrl) {
        const lengthA = urlOf(a).length;
        const lengthB = urlOf(b).length;
        if (lengthA !== lengthB) return lengthA - lengthB;
      }

      // 4. Prefer original order
      return a.originalIndex - b.originalIndex;
    });

    return ranked[0];
  }

  /**
   * Merge metadata from duplicate results into the primary result.
   * Stores the distinct snippets, titles and URLs of the duplicates under
   * `primaryResult.deduplicationInfo.alternate*` (mutates primaryResult).
   */
  mergeMetadata(primaryResult, duplicates) {
    const urlOf = result => result.link || result.url;

    // First entry of each list is the primary's own value
    const allSnippets = [primaryResult.snippet || ''];
    const allTitles = [primaryResult.title || ''];
    const allUrls = [urlOf(primaryResult) || ''];

    for (const duplicate of duplicates) {
      const duplicateUrl = urlOf(duplicate);

      if (duplicate.snippet && !allSnippets.includes(duplicate.snippet)) {
        allSnippets.push(duplicate.snippet);
      }
      if (duplicate.title && !allTitles.includes(duplicate.title)) {
        allTitles.push(duplicate.title);
      }
      if (duplicateUrl && !allUrls.includes(duplicateUrl)) {
        allUrls.push(duplicateUrl);
      }
    }

    // Store additional information
    if (!primaryResult.deduplicationInfo) {
      primaryResult.deduplicationInfo = {};
    }

    primaryResult.deduplicationInfo.alternateSnippets = allSnippets.slice(1);
    primaryResult.deduplicationInfo.alternateTitles = allTitles.slice(1);
    primaryResult.deduplicationInfo.alternateUrls = allUrls.slice(1);
  }

  /**
   * Normalize URL for comparison
   * @param {string} url - URL to normalize
   * @param {Object} [normalizationOptions] - Flags from `options.urlNormalization`
   * @returns {string} Normalized URL, or '' when no URL is given. Unparseable
   *   input falls back to a lowercased string without protocol/www prefix.
   */
  normalizeUrl(url, normalizationOptions = this.options.urlNormalization) {
    if (!url) return '';

    const flags = normalizationOptions || {};

    try {
      const urlObj = new URL(url);

      // Remove www prefix
      if (flags.removeWww) {
        urlObj.hostname = urlObj.hostname.replace(/^www\./, '');
      }

      // Convert domain to lowercase
      if (flags.lowercaseDomain) {
        urlObj.hostname = urlObj.hostname.toLowerCase();
      }

      // Remove default ports
      if (flags.removeDefaultPorts) {
        if ((urlObj.protocol === 'http:' && urlObj.port === '80') ||
            (urlObj.protocol === 'https:' && urlObj.port === '443')) {
          urlObj.port = '';
        }
      }

      // Remove trailing slash
      if (flags.removeTrailingSlash) {
        urlObj.pathname = urlObj.pathname.replace(/\/$/, '') || '/';
      }

      // Sort and clean query parameters. Entries (not keys) are used so that
      // repeated parameters keep all of their values.
      if (flags.sortQueryParams || flags.removeEmptyParams) {
        let entries = Array.from(urlObj.searchParams.entries());

        if (flags.removeEmptyParams) {
          entries = entries.filter(([, value]) => value && value.trim());
        }
        if (flags.sortQueryParams) {
          // Stable sort: values of a repeated key keep their relative order
          entries.sort(([nameA], [nameB]) => (nameA < nameB ? -1 : nameA > nameB ? 1 : 0));
        }

        urlObj.search = new URLSearchParams(entries).toString();
      }

      // Strip the protocol last, from the serialized URL, so that it actually
      // affects the returned value.
      const serialized = urlObj.toString();
      return flags.removeProtocol ? serialized.replace(/^https?:\/\//, '') : serialized;
    } catch (error) {
      // If URL parsing fails, return cleaned original
      return String(url).toLowerCase().replace(/^https?:\/\/(www\.)?/, '');
    }
  }

  /**
   * Compute content hash using SimHash algorithm
   * @returns {string} Binary fingerprint string, or '0' for content shorter
   *   than `contentOptions.minLength`
   */
  computeContentHash(result, contentOptions = this.options.contentSimilarity) {
    const content = this.extractContent(result);

    if (content.length < contentOptions.minLength) {
      return '0'; // Default hash for short content
    }

    return this.simHash(content, contentOptions.simhashBits);
  }

  /**
   * Extract content from result for comparison: title, snippet, htmlSnippet and
   * displayLink joined, lowercased, with punctuation and repeated whitespace
   * collapsed to single spaces.
   */
  extractContent(result) {
    const parts = [
      result.title || '',
      result.snippet || '',
      result.htmlSnippet || '',
      result.displayLink || ''
    ];

    return parts.join(' ').toLowerCase().replace(/[^\w\s]/g, ' ').replace(/\s+/g, ' ').trim();
  }

  /**
   * Tokenize title for similarity comparison.
   * Tokens of a single character are dropped.
   * @returns {string[]} Lowercase word tokens
   */
  tokenizeTitle(title) {
    return String(title ?? '')
      .toLowerCase()
      .replace(/[^\w\s]/g, ' ')
      .split(/\s+/)
      .filter(token => token.length > 1);
  }

  /**
   * SimHash implementation for content similarity.
   *
   * JavaScript shift counts wrap modulo 32, so a single 32-bit token hash can
   * only supply 32 distinct bits. Each 32-bit word of the fingerprint is
   * therefore filled from a separately seeded hash of the token.
   * @param {string} text - Whitespace-separated text
   * @param {number} [bits=64] - Fingerprint length in bits
   * @returns {string} String of '0'/'1' characters of length `bits`
   */
  simHash(text, bits = 64) {
    const tokens = text.split(/\s+/).filter(Boolean);
    const weights = new Array(bits).fill(0);
    const wordCount = Math.ceil(bits / 32);

    for (const token of tokens) {
      for (let word = 0; word < wordCount; word++) {
        const hash = this.seededHash(token, word);
        const base = word * 32;
        const width = Math.min(32, bits - base);

        for (let offset = 0; offset < width; offset++) {
          weights[base + offset] += ((hash >>> offset) & 1) === 1 ? 1 : -1;
        }
      }
    }

    // Convert to binary string
    return weights.map(weight => (weight > 0 ? '1' : '0')).join('');
  }

  /**
   * 32-bit FNV-1a style hash with a seed mixed into the offset basis.
   * Different seeds always give different hashes for the same string.
   * @param {string} str - Input string
   * @param {number} [seed=0] - Integer seed
   * @returns {number} Unsigned 32-bit integer
   */
  seededHash(str, seed = 0) {
    let hash = (0x811c9dc5 ^ Math.imul(seed, 0x9e3779b9)) >>> 0;

    for (let i = 0; i < str.length; i++) {
      hash ^= str.charCodeAt(i);
      hash = Math.imul(hash, 0x01000193);
    }

    return hash >>> 0;
  }

  /**
   * String hash function (hash * 31 + charCode, truncated to a signed 32-bit integer)
   */
  stringHash(str) {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      hash = ((hash << 5) - hash) + str.charCodeAt(i);
      hash |= 0; // Convert to 32-bit integer
    }
    return hash;
  }

  /**
   * Compute Hamming distance between two binary strings.
   * Strings of different length are treated as maximally distant.
   */
  hammingDistance(hash1, hash2) {
    if (hash1.length !== hash2.length) return Math.max(hash1.length, hash2.length);

    let distance = 0;
    for (let i = 0; i < hash1.length; i++) {
      if (hash1[i] !== hash2[i]) distance++;
    }
    return distance;
  }

  /**
   * Compute edit distance (Levenshtein) between two strings.
   * Only two rows of the dynamic-programming table are kept at a time.
   */
  editDistance(str1, str2) {
    if (str1 === str2) return 0;
    if (str1.length === 0) return str2.length;
    if (str2.length === 0) return str1.length;

    let previousRow = Array.from({ length: str1.length + 1 }, (_, column) => column);

    for (let i = 1; i <= str2.length; i++) {
      const currentRow = [i];

      for (let j = 1; j <= str1.length; j++) {
        const substitutionCost = str2.charAt(i - 1) === str1.charAt(j - 1) ? 0 : 1;
        currentRow[j] = Math.min(
          previousRow[j - 1] + substitutionCost, // match / substitution
          currentRow[j - 1] + 1, // insertion
          previousRow[j] + 1 // deletion
        );
      }

      previousRow = currentRow;
    }

    return previousRow[str1.length];
  }

  /**
   * Generate hash of results for caching.
   * Only each result's URL and title contribute to the hash.
   */
  hashResults(results) {
    const key = results.map(r => (r.link || r.url) + (r.title || '')).join('|');
    return this.stringHash(key).toString();
  }

  /**
   * Get deduplication statistics
   * @param {number} originalCount - Number of results before deduplication
   * @param {number} finalCount - Number of results after deduplication
   * @returns {Object} Counts plus `deduplicationRate` as a percentage string
   */
  getDeduplicationStats(originalCount, finalCount) {
    const removed = originalCount - finalCount;
    // Avoid "NaN%" when there was nothing to deduplicate
    const rate = originalCount > 0 ? (removed / originalCount) * 100 : 0;

    return {
      originalCount,
      finalCount,
      duplicatesRemoved: removed,
      deduplicationRate: rate.toFixed(1) + '%'
    };
  }

  /**
   * Get comprehensive statistics
   */
  getStats() {
    return {
      ...this.stats,
      cacheStats: this.cache ? this.cache.getStats() : null,
      configuration: {
        thresholds: this.options.thresholds,
        strategies: this.options.strategies,
        urlNormalization: this.options.urlNormalization,
        contentSimilarity: this.options.contentSimilarity,
        mergeStrategy: this.options.mergeStrategy
      }
    };
  }

  /**
   * Update similarity thresholds dynamically
   */
  updateThresholds(newThresholds) {
    this.options.thresholds = { ...this.options.thresholds, ...newThresholds };
  }

  /**
   * Reset statistics
   */
  resetStats() {
    this.stats = ResultDeduplicator.createEmptyStats();
  }
}
675
+
676
+ export default ResultDeduplicator;