crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,497 @@
1
+ import { CacheManager } from '../../../core/cache/CacheManager.js';
2
+
3
/**
 * Advanced search result ranking system with multiple scoring algorithms.
 *
 * Every result receives four component scores in the range [0, 1]
 * (BM25 keyword relevance, term-vector cosine similarity, URL/domain
 * authority and content freshness) which are combined into a weighted
 * `finalScore` used for sorting.
 */
export class ResultRanker {
  /**
   * Merge option overrides onto a base configuration.
   *
   * The nested sections (`weights`, `bm25`, `authority`, `freshness`) are merged
   * key by key so that a partial override such as `{ weights: { bm25: 0.5 } }`
   * keeps the remaining defaults instead of silently dropping them (which would
   * produce NaN scores further down the pipeline).
   *
   * @param {Object} base - Complete base configuration
   * @param {Object} [overrides] - Partial overrides
   * @returns {Object} New merged configuration (inputs are not mutated)
   */
  static mergeOptions(base, overrides) {
    const extra = overrides ?? {};
    const merged = { ...base, ...extra };

    for (const section of ['weights', 'bm25', 'authority', 'freshness']) {
      const override = extra[section];
      merged[section] = override && typeof override === 'object'
        ? { ...base[section], ...override }
        : base[section];
    }

    return merged;
  }

  /**
   * @param {Object} [options] - Overrides for the default configuration
   * @param {Object} [options.weights] - Weight per score component
   * @param {Object} [options.bm25] - BM25 parameters (`k1`, `b`)
   * @param {Object} [options.authority] - Authority parameters (`domainBoosts`, `httpsBoost`, `pathDepthPenalty`)
   * @param {Object} [options.freshness] - Freshness parameters (`maxAgeMonths`, `decayRate`)
   * @param {boolean} [options.cacheEnabled=true] - Whether ranked results are cached
   * @param {number} [options.cacheTTL=3600000] - Cache TTL in milliseconds
   */
  constructor(options = {}) {
    const defaults = {
      // Ranking weight configuration
      weights: {
        bm25: 0.4,       // BM25 keyword relevance
        semantic: 0.3,   // Semantic similarity
        authority: 0.2,  // URL/domain authority
        freshness: 0.1   // Content freshness
      },

      // BM25 parameters
      bm25: {
        k1: 1.5,         // Term frequency saturation parameter
        b: 0.75          // Length normalization parameter
      },

      // Authority scoring parameters
      authority: {
        domainBoosts: {  // Domain authority boosts
          'wikipedia.org': 1.0,
          'github.com': 0.9,
          'stackoverflow.com': 0.9,
          'mozilla.org': 0.8,
          'w3.org': 0.8
        },
        httpsBoost: 0.1,        // HTTPS boost
        pathDepthPenalty: 0.02  // Penalty per path segment
      },

      // Freshness parameters
      freshness: {
        maxAgeMonths: 24,  // Content older than this gets 0 freshness score
        decayRate: 0.1     // Exponential decay rate per month
      },

      // Performance options
      cacheEnabled: true,
      cacheTTL: 3600000    // 1 hour
    };

    this.options = ResultRanker.mergeOptions(defaults, options);

    // Initialize cache for score computation
    this.cache = this.options.cacheEnabled
      ? new CacheManager({ ttl: this.options.cacheTTL })
      : null;

    // Precompute domain authority scores
    this.domainAuthorityMap = new Map();
    this.initializeDomainAuthority();
  }

  /**
   * Rank search results using the combined scoring algorithm.
   *
   * If scoring throws, the original order is returned with simple positional
   * fallback scores and the error message in `rankingDetails.error`.
   * Cache failures are logged and never prevent ranking.
   *
   * @param {Array} results - Array of search results
   * @param {string} query - Original search query
   * @param {Object} [options] - Per-call overrides of the ranking options
   * @returns {Promise<Array>} Ranked results with scores
   */
  async rankResults(results, query, options = {}) {
    if (!Array.isArray(results) || results.length === 0) {
      return [];
    }

    const rankingOptions = ResultRanker.mergeOptions(this.options, options);

    // Cache lookup is best effort: a broken cache must not break ranking.
    let cacheKey = null;
    if (this.cache) {
      try {
        cacheKey = this.cache.generateKey('ranking', {
          query,
          resultsHash: this.hashResults(results),
          options: rankingOptions
        });

        const cached = await this.cache.get(cacheKey);
        if (cached) {
          return cached;
        }
      } catch (cacheError) {
        console.warn('Ranking cache lookup failed:', cacheError);
        cacheKey = null;
      }
    }

    try {
      // Corpus-wide statistics are computed once rather than once per result.
      const corpusStats = { avgDocLength: this.computeAverageDocLength(results) };

      // Compute individual scores for each result
      const scoredResults = await Promise.all(
        results.map(async (result, index) => ({
          ...result,
          originalIndex: index,
          scores: await this.computeScores(result, query, results, rankingOptions, corpusStats)
        }))
      );

      // Compute final combined scores
      const rankedResults = scoredResults.map(result => ({
        ...result,
        finalScore: this.computeFinalScore(result.scores, rankingOptions.weights),
        rankingDetails: {
          scores: result.scores,
          weights: rankingOptions.weights,
          originalIndex: result.originalIndex
        }
      }));

      // Sort by final score (descending)
      rankedResults.sort((a, b) => b.finalScore - a.finalScore);

      // Add ranking positions; a positive rankChange means the result moved up
      rankedResults.forEach((result, index) => {
        result.rankingDetails.newRank = index + 1;
        result.rankingDetails.rankChange = result.originalIndex - index;
      });

      // Cache the results (best effort)
      if (this.cache && cacheKey !== null) {
        try {
          await this.cache.set(cacheKey, rankedResults);
        } catch (cacheError) {
          console.warn('Ranking cache write failed:', cacheError);
        }
      }

      return rankedResults;
    } catch (error) {
      console.error('Ranking failed:', error);
      // Return original results with default scores
      return results.map((result, index) => ({
        ...result,
        originalIndex: index,
        // Simple positional fallback, clamped so long lists never go negative
        finalScore: Math.max(0, 1.0 - (index * 0.1)),
        rankingDetails: {
          error: error.message,
          originalIndex: index,
          newRank: index + 1,
          rankChange: 0
        }
      }));
    }
  }

  /**
   * Compute individual scoring components for a result.
   *
   * @param {Object} result - Search result being scored
   * @param {string} query - Original search query
   * @param {Array} allResults - Every result of the current ranking run
   * @param {Object} options - Ranking options (`bm25`, `authority`, `freshness` sections)
   * @param {Object} [corpusStats] - Optional precomputed corpus statistics
   * @param {number} [corpusStats.avgDocLength] - Average token count per result
   * @returns {Promise<Object>} `{ bm25, semantic, authority, freshness }`
   */
  async computeScores(result, query, allResults, options, corpusStats = {}) {
    const scores = {};

    // BM25 Score
    scores.bm25 = this.computeBM25Score(
      result,
      query,
      allResults,
      options.bm25,
      corpusStats.avgDocLength
    );

    // Semantic Similarity Score
    scores.semantic = this.computeSemanticScore(result, query);

    // Authority Score
    scores.authority = this.computeAuthorityScore(result, options.authority);

    // Freshness Score
    scores.freshness = this.computeFreshnessScore(result, options.freshness);

    return scores;
  }

  /**
   * Build the lower-cased text (title, snippet, htmlSnippet) used for scoring.
   *
   * @param {Object} result - Search result
   * @returns {string} Concatenated, lower-cased text
   */
  getResultText(result) {
    return [
      result?.title || '',
      result?.snippet || '',
      result?.htmlSnippet || ''
    ].join(' ').toLowerCase();
  }

  /**
   * Average number of tokens per result across a result set.
   *
   * @param {Array} results - Result set
   * @returns {number} Average token count, or 0 for an empty set
   */
  computeAverageDocLength(results) {
    if (!Array.isArray(results) || results.length === 0) {
      return 0;
    }

    const totalTokens = results.reduce(
      (sum, entry) => sum + this.tokenize(this.getResultText(entry)).length,
      0
    );
    return totalTokens / results.length;
  }

  /**
   * BM25 algorithm implementation for keyword relevance.
   *
   * The document frequency is a deliberate simplification (a constant estimate
   * derived from the corpus size), so the IDF factor is identical for all terms.
   *
   * @param {Object} result - Search result being scored
   * @param {string} query - Original search query
   * @param {Array} allResults - Every result of the current ranking run
   * @param {Object} bm25Options - `{ k1, b }`; missing keys fall back to the instance options
   * @param {number} [avgDocLength] - Precomputed average document length; computed on demand when omitted
   * @returns {number} Score in [0, 1]; 0 when the query yields no usable terms
   */
  computeBM25Score(result, query, allResults, bm25Options, avgDocLength) {
    const { k1, b } = { ...this.options.bm25, ...bm25Options };

    // Tokenize query and content
    const queryTerms = this.tokenize(query);
    if (queryTerms.length === 0) {
      // Avoids 0 / 0 = NaN for empty or single-character queries
      return 0;
    }

    const contentTerms = this.tokenize(this.getResultText(result));
    const contentLength = contentTerms.length;
    if (contentLength === 0) {
      return 0;
    }

    const corpus = Array.isArray(allResults) && allResults.length > 0 ? allResults : [result];
    const docCount = corpus.length;

    // Average document length across all results
    let averageLength = avgDocLength;
    if (!(Number.isFinite(averageLength) && averageLength > 0)) {
      averageLength = this.computeAverageDocLength(corpus);
    }
    if (!(averageLength > 0)) {
      averageLength = contentLength;
    }

    // Document frequency (simplified - assume term appears in some docs)
    const df = Math.min(docCount * 0.1, 1); // Conservative estimate
    const idf = Math.log((docCount - df + 0.5) / (df + 0.5));

    // Length normalisation term of the BM25 denominator (loop invariant)
    const lengthNorm = k1 * (1 - b + b * (contentLength / averageLength));

    const termFreqs = this.getTermFrequencies(contentTerms);

    let score = 0;
    for (const term of queryTerms) {
      const tf = termFreqs[term] || 0;
      if (tf > 0) {
        // BM25 formula
        score += idf * ((tf * (k1 + 1)) / (tf + lengthNorm));
      }
    }

    return Math.max(0, Math.min(1, score / queryTerms.length));
  }

  /**
   * Semantic similarity scoring using cosine similarity of normalised
   * term-frequency vectors, with a bonus for an exact phrase match.
   *
   * @param {Object} result - Search result being scored
   * @param {string} query - Original search query
   * @returns {number} Score in [0, 1]
   */
  computeSemanticScore(result, query) {
    const normalizedQuery = String(query ?? '').toLowerCase();
    const content = this.getResultText(result);

    // Simple word embedding approximation using term vectors
    const queryVector = this.createTermVector(this.tokenize(normalizedQuery));
    const contentVector = this.createTermVector(this.tokenize(content));

    // Compute cosine similarity
    const similarity = this.cosineSimilarity(queryVector, contentVector);

    // Boost for exact phrase matches (an empty query never counts as a match)
    const phrase = normalizedQuery.trim();
    const phraseBoost = phrase.length > 0 && content.includes(phrase) ? 0.2 : 0;

    return Math.min(1, similarity + phraseBoost);
  }

  /**
   * URL and domain authority scoring.
   *
   * Reads the URL from `result.link`, falling back to `result.url`.
   * Note: the subdomain penalty counts hostname labels and therefore also
   * applies to multi-part public suffixes such as `example.co.uk`.
   *
   * @param {Object} result - Search result being scored
   * @param {Object} authorityOptions - `{ domainBoosts, httpsBoost, pathDepthPenalty }`;
   *   missing keys fall back to the instance options
   * @returns {number} Score in [0, 1]; 0.1 when the URL cannot be parsed
   */
  computeAuthorityScore(result, authorityOptions) {
    const settings = { ...this.options.authority, ...authorityOptions };
    let score = 0;

    try {
      const url = new URL(result.link || result.url);
      const domain = url.hostname.toLowerCase();

      // Honour per-call domain boosts instead of always using the map built
      // from the constructor options.
      const usesCustomBoosts = Boolean(settings.domainBoosts) &&
        settings.domainBoosts !== this.options.authority.domainBoosts;
      const authorityMap = usesCustomBoosts
        ? new Map(
          Object.entries(settings.domainBoosts)
            .map(([name, boost]) => [name.toLowerCase(), boost])
        )
        : this.domainAuthorityMap;

      // Domain authority boost
      score += this.getDomainAuthority(domain, authorityMap) * 0.6;

      // HTTPS boost
      if (url.protocol === 'https:') {
        score += settings.httpsBoost;
      }

      // Path depth penalty (shorter paths are generally more authoritative)
      const pathSegments = url.pathname.split('/').filter(segment => segment.length > 0);
      score -= Math.min(0.3, pathSegments.length * settings.pathDepthPenalty);

      // URL cleanliness bonus (no query params, clean structure)
      if (!url.search && pathSegments.length <= 3) {
        score += 0.1;
      }

      // Subdomain penalty (www is ok)
      const labels = domain.split('.');
      if (labels.length > 2 && labels[0] !== 'www') {
        score -= 0.1;
      }
    } catch (error) {
      // Invalid URL, give default low score
      score = 0.1;
    }

    return Math.max(0, Math.min(1, score));
  }

  /**
   * Content freshness scoring with exponential decay by age in months.
   *
   * @param {Object} result - Search result being scored
   * @param {Object} freshnessOptions - `{ maxAgeMonths, decayRate }`;
   *   missing keys fall back to the instance options
   * @returns {number} 0.5 for unknown or unparseable dates, 1 for future dates,
   *   0 beyond `maxAgeMonths`, otherwise `exp(-decayRate * ageInMonths)`
   */
  computeFreshnessScore(result, freshnessOptions) {
    const { maxAgeMonths, decayRate } = { ...this.options.freshness, ...freshnessOptions };

    // Extract date information from various sources
    const dateString = this.extractDate(result);
    if (!dateString) {
      return 0.5; // Neutral score for unknown dates
    }

    // `new Date(str)` never throws; an unparseable string yields an Invalid Date
    // whose timestamp is NaN, so it has to be detected explicitly.
    const publishedAt = new Date(dateString).getTime();
    if (Number.isNaN(publishedAt)) {
      return 0.5; // Invalid date, neutral score
    }

    const MS_PER_MONTH = 1000 * 60 * 60 * 24 * 30.44; // average month length
    const ageInMonths = (Date.now() - publishedAt) / MS_PER_MONTH;

    if (ageInMonths < 0) {
      return 1; // Future dates get max score
    }

    if (ageInMonths > maxAgeMonths) {
      return 0; // Very old content gets 0 score
    }

    // Exponential decay
    return Math.exp(-decayRate * ageInMonths);
  }

  /**
   * Combine individual scores into the final weighted score.
   * Components whose score or weight is missing or not finite contribute 0.
   *
   * @param {Object} scores - `{ bm25, semantic, authority, freshness }`
   * @param {Object} weights - Weight per component
   * @returns {number} Weighted sum
   */
  computeFinalScore(scores, weights) {
    return ['bm25', 'semantic', 'authority', 'freshness'].reduce((total, component) => {
      const score = Number(scores?.[component]);
      const weight = Number(weights?.[component]);
      return Number.isFinite(score) && Number.isFinite(weight)
        ? total + score * weight
        : total;
    }, 0);
  }

  /**
   * Initialize domain authority mapping from `options.authority.domainBoosts`.
   * Domain keys are lower-cased because lookups use lower-cased hostnames.
   */
  initializeDomainAuthority() {
    const boosts = this.options.authority?.domainBoosts ?? {};
    for (const [domain, boost] of Object.entries(boosts)) {
      this.domainAuthorityMap.set(domain.toLowerCase(), boost);
    }
  }

  /**
   * Get domain authority score.
   *
   * @param {string} domain - Hostname to score
   * @param {Map<string, number>} [authorityMap] - Boost table; defaults to the
   *   map built from the constructor options
   * @returns {number} Exact boost, 80% of the closest parent-domain boost, or a
   *   TLD-based default
   */
  getDomainAuthority(domain, authorityMap = this.domainAuthorityMap) {
    const host = String(domain ?? '').toLowerCase();

    // Check exact match
    if (authorityMap.has(host)) {
      return authorityMap.get(host);
    }

    // Check parent domains from most to least specific, down to two labels
    const labels = host.split('.');
    for (let start = 1; start <= labels.length - 2; start++) {
      const parentDomain = labels.slice(start).join('.');
      if (authorityMap.has(parentDomain)) {
        return authorityMap.get(parentDomain) * 0.8; // Subdomain penalty
      }
    }

    // Default score based on domain characteristics
    if (host.endsWith('.edu')) return 0.8; // Educational institutions
    if (host.endsWith('.gov')) return 0.9; // Government sites
    if (host.endsWith('.org')) return 0.6; // Organization sites
    if (host.endsWith('.com')) return 0.4; // Commercial sites

    return 0.3; // Base score
  }

  /**
   * Extract a date string from result metadata.
   *
   * @param {Object} result - Search result
   * @returns {string|null} First non-empty string among the known date fields, or null
   */
  extractDate(result) {
    // Some providers (e.g. raw Google Custom Search responses) deliver
    // `pagemap.metatags` as an array of objects; use its first entry then.
    const rawMetatags = result?.pagemap?.metatags;
    const metatags = Array.isArray(rawMetatags) ? rawMetatags[0] : rawMetatags;

    // Try various date sources; the original lookup order comes first so that
    // results which already resolved a date keep resolving the same one.
    const sources = [
      metatags?.publishedTime,
      metatags?.modifiedTime,
      result?.metadata?.lastModified,
      result?.pubDate,
      result?.publishedDate,
      metatags?.['article:published_time'],
      metatags?.['article:modified_time']
    ];

    const found = sources.find(source => typeof source === 'string' && source.length > 0);
    return found ?? null;
  }

  /**
   * Tokenize text into lower-cased terms of at least two characters.
   *
   * Uses Unicode property escapes so that letters, combining marks and digits
   * of any script survive; for ASCII input the result is the same as with `\w`.
   *
   * @param {string} text - Text to tokenize (null/undefined is treated as empty)
   * @returns {string[]} Terms in order of appearance
   */
  tokenize(text) {
    return String(text ?? '')
      .toLowerCase()
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
      .split(/\s+/)
      .filter(term => term.length > 1);
  }

  /**
   * Get term frequencies.
   *
   * The returned object has no prototype so that terms such as "constructor"
   * or "__proto__" are counted like any other term instead of colliding with
   * inherited `Object.prototype` members.
   *
   * @param {string[]} terms - Tokenized terms
   * @returns {Object<string, number>} Occurrence count per term
   */
  getTermFrequencies(terms) {
    const freqs = Object.create(null);
    for (const term of terms) {
      freqs[term] = (freqs[term] || 0) + 1;
    }
    return freqs;
  }

  /**
   * Create a term vector (frequency normalised by term count) for similarity
   * calculation.
   *
   * @param {string[]} terms - Tokenized terms
   * @returns {Object<string, number>} Prototype-less map of term to normalised frequency
   */
  createTermVector(terms) {
    const vector = Object.create(null);

    for (const [term, freq] of Object.entries(this.getTermFrequencies(terms))) {
      vector[term] = freq / terms.length; // Normalized frequency
    }

    return vector;
  }

  /**
   * Compute cosine similarity between two term vectors.
   *
   * @param {Object<string, number>} vectorA - First term vector
   * @param {Object<string, number>} vectorB - Second term vector
   * @returns {number} Cosine similarity, or 0 when either vector is empty
   */
  cosineSimilarity(vectorA, vectorB) {
    // Only own properties count, so plain-object vectors passed by callers are
    // not polluted by inherited members such as `constructor`.
    const component = (vector, term) => (
      Object.prototype.hasOwnProperty.call(vector, term) ? vector[term] : 0
    );

    const allTerms = new Set([...Object.keys(vectorA), ...Object.keys(vectorB)]);

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (const term of allTerms) {
      const a = component(vectorA, term);
      const b = component(vectorB, term);

      dotProduct += a * b;
      normA += a * a;
      normB += b * b;
    }

    const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
    return magnitude === 0 ? 0 : dotProduct / magnitude;
  }

  /**
   * Generate hash of results for caching.
   * Only the result URLs (`link` or `url`) contribute to the hash.
   *
   * @param {Array} results - Result set
   * @returns {string} Hash string
   */
  hashResults(results) {
    const key = results.map(r => r.link || r.url).join('|');
    return this.simpleHash(key);
  }

  /**
   * Simple 32-bit string hash (hash * 31 + charCode). Not cryptographic.
   *
   * @param {string} str - Input string
   * @returns {string} Signed 32-bit hash in decimal notation
   */
  simpleHash(str) {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      hash = ((hash << 5) - hash) + str.charCodeAt(i);
      hash |= 0; // Convert to 32-bit integer
    }
    return hash.toString();
  }

  /**
   * Get ranking statistics.
   *
   * @returns {Object} Cache statistics, number of domain authority entries and
   *   the active configuration (`domainBoosts` is reported as a count)
   */
  getStats() {
    return {
      cacheStats: this.cache ? this.cache.getStats() : null,
      domainAuthorityEntries: this.domainAuthorityMap.size,
      configuration: {
        weights: this.options.weights,
        bm25: this.options.bm25,
        authority: {
          ...this.options.authority,
          domainBoosts: Object.keys(this.options.authority.domainBoosts).length
        },
        freshness: this.options.freshness
      }
    };
  }

  /**
   * Update ranking weights dynamically and renormalise them to sum to 1.
   *
   * The new weights are validated before anything is stored, so a rejected
   * update leaves the current weights untouched.
   *
   * @param {Object} newWeights - Partial weight overrides
   * @throws {RangeError} If any weight is not a finite, non-negative number or
   *   if all weights are zero (normalisation would divide by zero)
   */
  updateWeights(newWeights) {
    const merged = { ...this.options.weights, ...newWeights };
    const values = Object.values(merged);

    if (values.some(w => typeof w !== 'number' || !Number.isFinite(w) || w < 0)) {
      throw new RangeError('Ranking weights must be finite, non-negative numbers');
    }

    const total = values.reduce((sum, w) => sum + w, 0);
    if (total <= 0) {
      throw new RangeError('At least one ranking weight must be greater than zero');
    }

    // Ensure weights sum to 1 (tolerance instead of exact float equality)
    if (Math.abs(total - 1) > 1e-12) {
      for (const key of Object.keys(merged)) {
        merged[key] /= total;
      }
    }

    this.options.weights = merged;
  }
}
496
+
497
+ export default ResultRanker;