crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
import { CacheManager } from '../../../core/cache/CacheManager.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Advanced search result ranking system with multiple scoring algorithms.
 *
 * Every result receives four component scores, each clamped to [0, 1]:
 * BM25 keyword relevance, a term-vector "semantic" similarity, URL/domain
 * authority and content freshness. The components are combined with
 * configurable weights into a single `finalScore` used for ordering.
 */
export class ResultRanker {
  /**
   * @param {Object} [options] - Overrides for the defaults below. The nested
   *   sections `weights`, `bm25`, `authority` and `freshness` are merged key by
   *   key with the defaults, so a partial section keeps the remaining defaults.
   * @param {Object} [options.weights] - Component weights (bm25, semantic, authority, freshness).
   * @param {Object} [options.bm25] - BM25 parameters `k1` and `b`.
   * @param {Object} [options.authority] - `domainBoosts`, `httpsBoost`, `pathDepthPenalty`.
   * @param {Object} [options.freshness] - `maxAgeMonths`, `decayRate`.
   * @param {boolean} [options.cacheEnabled=true] - Cache computed rankings.
   * @param {number} [options.cacheTTL=3600000] - Cache TTL in milliseconds (1 hour).
   */
  constructor(options = {}) {
    const defaults = {
      // Ranking weight configuration
      weights: {
        bm25: 0.4, // BM25 keyword relevance
        semantic: 0.3, // Semantic similarity
        authority: 0.2, // URL/domain authority
        freshness: 0.1 // Content freshness
      },

      // BM25 parameters
      bm25: {
        k1: 1.5, // Term frequency saturation parameter
        b: 0.75 // Length normalization parameter
      },

      // Authority scoring parameters
      authority: {
        domainBoosts: { // Domain authority boosts
          'wikipedia.org': 1.0,
          'github.com': 0.9,
          'stackoverflow.com': 0.9,
          'mozilla.org': 0.8,
          'w3.org': 0.8
        },
        httpsBoost: 0.1, // HTTPS boost
        pathDepthPenalty: 0.02 // Penalty per path segment
      },

      // Freshness parameters
      freshness: {
        maxAgeMonths: 24, // Content older than this gets 0 freshness score
        decayRate: 0.1 // Exponential decay rate per month
      },

      // Performance options
      cacheEnabled: true,
      cacheTTL: 3600000 // 1 hour
    };

    // A plain `{ ...defaults, ...options }` would replace whole sections, so a
    // caller passing only `{ weights: { bm25: 0.5 } }` would lose the other
    // weights and end up with NaN scores.
    this.options = ResultRanker.mergeOptions(defaults, options);

    // Initialize cache for score computation
    this.cache = this.options.cacheEnabled
      ? new CacheManager({ ttl: this.options.cacheTTL })
      : null;

    // Precompute domain authority scores
    this.domainAuthorityMap = new Map();
    this.initializeDomainAuthority();
  }

  /**
   * Merge two option objects, combining the nested scoring sections
   * (`weights`, `bm25`, `authority`, `freshness`) one level deep.
   * A section that is `null`/`undefined` in `overrides` falls back to `base`.
   * @param {Object} base - Complete option set.
   * @param {Object} [overrides] - Partial overrides.
   * @returns {Object} New merged option object; inputs are not mutated.
   */
  static mergeOptions(base, overrides) {
    const merged = { ...base, ...overrides };

    for (const section of ['weights', 'bm25', 'authority', 'freshness']) {
      const baseSection = base?.[section];
      const overrideSection = overrides?.[section];

      if (overrideSection == null) {
        if (baseSection !== undefined) {
          merged[section] = baseSection;
        }
      } else if (
        baseSection !== null &&
        typeof baseSection === 'object' &&
        typeof overrideSection === 'object'
      ) {
        merged[section] = { ...baseSection, ...overrideSection };
      }
    }

    return merged;
  }

  /**
   * Rank search results using combined scoring algorithm
   * @param {Array} results - Array of search results
   * @param {string} query - Original search query
   * @param {Object} options - Per-call ranking options, merged over the instance options
   * @returns {Promise<Array>} Results sorted by descending `finalScore`, each
   *   extended with `originalIndex`, `scores`, `finalScore` and `rankingDetails`.
   *   If scoring throws, the input order is returned with fallback scores and
   *   `rankingDetails.error` set.
   */
  async rankResults(results, query, options = {}) {
    if (!Array.isArray(results) || results.length === 0) {
      return [];
    }

    const rankingOptions = ResultRanker.mergeOptions(this.options, options);

    // Cache lookup. A failing cache is treated as a miss: it must never turn a
    // rankable request into an error.
    let cacheKey = null;
    if (this.cache) {
      try {
        cacheKey = this.cache.generateKey('ranking', {
          query,
          resultsHash: this.hashResults(results),
          options: rankingOptions
        });
        const cached = await this.cache.get(cacheKey);
        if (cached) {
          return cached;
        }
      } catch (cacheError) {
        console.warn('Ranking cache lookup failed:', cacheError);
      }
    }

    try {
      // Compute individual scores for each result
      const scoredResults = await Promise.all(
        results.map(async (result, index) => ({
          ...result,
          originalIndex: index,
          scores: await this.computeScores(result, query, results, rankingOptions)
        }))
      );

      // Compute final combined scores
      const rankedResults = scoredResults.map(result => ({
        ...result,
        finalScore: this.computeFinalScore(result.scores, rankingOptions.weights),
        rankingDetails: {
          scores: result.scores,
          weights: rankingOptions.weights,
          originalIndex: result.originalIndex
        }
      }));

      // Sort by final score (descending)
      rankedResults.sort((a, b) => b.finalScore - a.finalScore);

      // Add ranking positions; a positive rankChange means the result moved up
      rankedResults.forEach((result, index) => {
        result.rankingDetails.newRank = index + 1;
        result.rankingDetails.rankChange = result.originalIndex - index;
      });

      // Cache the results. A failed write only costs a future recomputation, so
      // it must not discard the ranking that was just computed.
      if (this.cache && cacheKey !== null) {
        try {
          await this.cache.set(cacheKey, rankedResults);
        } catch (cacheError) {
          console.warn('Ranking cache write failed:', cacheError);
        }
      }

      return rankedResults;
    } catch (error) {
      console.error('Ranking failed:', error);
      // Return original results with default scores
      return results.map((result, index) => ({
        ...result,
        originalIndex: index,
        // Simple fallback scoring, clamped so long lists never go negative
        finalScore: Math.max(0, 1.0 - (index * 0.1)),
        rankingDetails: {
          error: error.message,
          originalIndex: index,
          newRank: index + 1,
          rankChange: 0
        }
      }));
    }
  }

  /**
   * Compute individual scoring components for a result
   * @param {Object} result - Result being scored
   * @param {string} query - Search query
   * @param {Array} allResults - Full result list (the BM25 corpus)
   * @param {Object} options - Ranking options with `bm25`, `authority`, `freshness` sections
   * @returns {Promise<{bm25: number, semantic: number, authority: number, freshness: number}>}
   */
  async computeScores(result, query, allResults, options) {
    return {
      bm25: this.computeBM25Score(result, query, allResults, options.bm25),
      semantic: this.computeSemanticScore(result, query),
      authority: this.computeAuthorityScore(result, options.authority),
      freshness: this.computeFreshnessScore(result, options.freshness)
    };
  }

  /**
   * Build the lower-cased text of a result that is used for relevance scoring.
   * @param {Object} result - Search result
   * @returns {string} `title`, `snippet` and `htmlSnippet` joined by spaces
   */
  getResultText(result) {
    return [
      result?.title || '',
      result?.snippet || '',
      result?.htmlSnippet || ''
    ].join(' ').toLowerCase();
  }

  /**
   * BM25 algorithm implementation for keyword relevance
   * @param {Object} result - Result being scored
   * @param {string} query - Search query
   * @param {Array} allResults - Corpus used for average length and corpus size
   * @param {Object} bm25Options - `{ k1, b }`
   * @returns {number} Score in [0, 1]; 0 when the query has no usable terms
   */
  computeBM25Score(result, query, allResults, bm25Options) {
    const { k1 = 1.5, b = 0.75 } = bm25Options ?? {};

    const queryTerms = this.tokenize(query);
    if (queryTerms.length === 0) {
      // Without this guard the final division is 0 / 0 = NaN
      return 0;
    }

    const corpus = Array.isArray(allResults) && allResults.length > 0
      ? allResults
      : [result];

    const contentTerms = this.tokenize(this.getResultText(result));
    const contentLength = contentTerms.length;

    // Calculate average document length across all results
    const totalLength = corpus.reduce(
      (sum, doc) => sum + this.tokenize(this.getResultText(doc)).length,
      0
    );
    const avgDocLength = totalLength / corpus.length;
    const lengthRatio = avgDocLength > 0 ? contentLength / avgDocLength : 1;

    // Document frequency is not counted per term; it is a fixed conservative
    // estimate, so the IDF is the same for every matching term.
    const df = Math.min(corpus.length * 0.1, 1);
    const idf = Math.log((corpus.length - df + 0.5) / (df + 0.5));

    const termFreqs = this.getTermFrequencies(contentTerms);

    let score = 0;
    for (const term of queryTerms) {
      const tf = termFreqs[term] || 0;
      if (tf > 0) {
        // BM25 formula
        const numerator = tf * (k1 + 1);
        const denominator = tf + k1 * (1 - b + b * lengthRatio);
        score += idf * (numerator / denominator);
      }
    }

    return Math.max(0, Math.min(1, score / queryTerms.length));
  }

  /**
   * Semantic similarity scoring using cosine similarity of normalized term
   * frequency vectors, plus a 0.2 boost when the whole query occurs verbatim.
   * @param {Object} result - Result being scored
   * @param {string} query - Search query
   * @returns {number} Score in [0, 1]
   */
  computeSemanticScore(result, query) {
    const content = this.getResultText(result);

    // Simple word embedding approximation using term vectors
    const queryVector = this.createTermVector(this.tokenize(query));
    const contentVector = this.createTermVector(this.tokenize(content));

    const similarity = this.cosineSimilarity(queryVector, contentVector);

    // Boost for exact phrase matches. An empty phrase is skipped because
    // String.prototype.includes('') is always true.
    const phrase = typeof query === 'string' ? query.trim().toLowerCase() : '';
    const phraseBoost = phrase.length > 0 && content.includes(phrase) ? 0.2 : 0;

    return Math.max(0, Math.min(1, similarity + phraseBoost));
  }

  /**
   * URL and domain authority scoring
   * @param {Object} result - Result with a `link` (or `url`) property
   * @param {Object} authorityOptions - `{ httpsBoost, pathDepthPenalty }`
   * @returns {number} Score in [0, 1]; 0.1 when the URL cannot be parsed
   */
  computeAuthorityScore(result, authorityOptions) {
    const { httpsBoost = 0.1, pathDepthPenalty = 0.02 } = authorityOptions ?? {};
    let score = 0;

    try {
      // Accept both field names, matching hashResults()
      const url = new URL(result?.link || result?.url);
      const domain = url.hostname.toLowerCase();

      // Domain authority boost
      score += this.getDomainAuthority(domain) * 0.6;

      // HTTPS boost
      if (url.protocol === 'https:') {
        score += httpsBoost;
      }

      // Path depth penalty (shorter paths are generally more authoritative)
      const pathSegments = url.pathname.split('/').filter(s => s.length > 0);
      score -= Math.min(0.3, pathSegments.length * pathDepthPenalty);

      // URL cleanliness bonus (no query params, clean structure)
      if (!url.search && pathSegments.length <= 3) {
        score += 0.1;
      }

      // Subdomain penalty (www is ok)
      const subdomains = domain.split('.');
      if (subdomains.length > 2 && subdomains[0] !== 'www') {
        score -= 0.1;
      }
    } catch (error) {
      // Invalid URL, give default low score
      score = 0.1;
    }

    return Math.max(0, Math.min(1, score));
  }

  /**
   * Content freshness scoring
   * @param {Object} result - Result whose date is read via extractDate()
   * @param {Object} freshnessOptions - `{ maxAgeMonths, decayRate }`
   * @returns {number} 0.5 for unknown or unparseable dates, 1 for future dates,
   *   0 beyond `maxAgeMonths`, otherwise `exp(-decayRate * ageInMonths)`
   */
  computeFreshnessScore(result, freshnessOptions) {
    const { maxAgeMonths = 24, decayRate = 0.1 } = freshnessOptions ?? {};

    // Extract date information from various sources
    const dateString = this.extractDate(result);
    if (!dateString) {
      return 0.5; // Neutral score for unknown dates
    }

    // `new Date()` does not throw on bad input; it yields an Invalid Date whose
    // timestamp is NaN, so the check has to be explicit.
    const timestamp = new Date(dateString).getTime();
    if (Number.isNaN(timestamp)) {
      return 0.5; // Invalid date, neutral score
    }

    const ageInMonths = (Date.now() - timestamp) / (1000 * 60 * 60 * 24 * 30.44);

    if (ageInMonths < 0) {
      return 1; // Future dates get max score
    }

    if (ageInMonths > maxAgeMonths) {
      return 0; // Very old content gets 0 score
    }

    // Exponential decay
    return Math.exp(-decayRate * ageInMonths);
  }

  /**
   * Combine individual scores into final score
   * @param {Object} scores - `{ bm25, semantic, authority, freshness }`
   * @param {Object} weights - Weight per component
   * @returns {number} Weighted sum; a component whose score or weight is
   *   missing or non-finite contributes 0 instead of turning the sum into NaN
   */
  computeFinalScore(scores, weights) {
    const contribution = (component) => {
      const value = Number(scores?.[component]) * Number(weights?.[component]);
      return Number.isFinite(value) ? value : 0;
    };

    return (
      contribution('bm25') +
      contribution('semantic') +
      contribution('authority') +
      contribution('freshness')
    );
  }

  /**
   * Initialize domain authority mapping from `options.authority.domainBoosts`.
   * Keys are lower-cased because lookups use lower-cased hostnames.
   */
  initializeDomainAuthority() {
    this.domainAuthorityMap.clear();

    const boosts = this.options.authority?.domainBoosts ?? {};
    for (const [domain, boost] of Object.entries(boosts)) {
      this.domainAuthorityMap.set(domain.toLowerCase(), boost);
    }
  }

  /**
   * Get domain authority score
   * @param {string} domain - Hostname
   * @returns {number} Configured boost for an exact match (a leading `www.` is
   *   ignored), 80% of the boost for another subdomain of a configured
   *   two-label domain, otherwise a TLD-based default
   */
  getDomainAuthority(domain) {
    const host = String(domain ?? '').toLowerCase();
    // `www.` is treated as the site itself, consistent with the "www is ok"
    // rule applied in computeAuthorityScore().
    const bareHost = host.startsWith('www.') ? host.slice(4) : host;

    // Check exact match
    for (const candidate of [host, bareHost]) {
      if (this.domainAuthorityMap.has(candidate)) {
        return this.domainAuthorityMap.get(candidate);
      }
    }

    // Check for parent domain matches
    const labels = bareHost.split('.');
    if (labels.length > 2) {
      const parentDomain = labels.slice(-2).join('.');
      if (this.domainAuthorityMap.has(parentDomain)) {
        return this.domainAuthorityMap.get(parentDomain) * 0.8; // Subdomain penalty
      }
    }

    // Default score based on domain characteristics
    if (host.endsWith('.edu')) return 0.8; // Educational institutions
    if (host.endsWith('.gov')) return 0.9; // Government sites
    if (host.endsWith('.org')) return 0.6; // Organization sites
    if (host.endsWith('.com')) return 0.4; // Commercial sites
    return 0.3; // Base score
  }

  /**
   * Extract date from result metadata
   * @param {Object} result - Search result
   * @returns {string|null} First non-empty string among the known date fields
   */
  extractDate(result) {
    // Try various date sources, in priority order
    const sources = [
      result?.pagemap?.metatags?.publishedTime,
      result?.pagemap?.metatags?.modifiedTime,
      result?.metadata?.lastModified,
      result?.pubDate,
      result?.publishedDate
    ];

    const found = sources.find(source => typeof source === 'string' && source.length > 0);
    return found ?? null;
  }

  /**
   * Tokenize text into terms
   * @param {string} text - Input text; null/undefined is treated as empty
   * @returns {string[]} Lower-cased terms of at least two characters
   */
  tokenize(text) {
    return String(text ?? '')
      .toLowerCase()
      // Keep letters, combining marks, digits and underscore from any script.
      // The ASCII-only \w class would erase every non-Latin word.
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
      .split(/\s+/)
      .filter(term => term.length > 1);
  }

  /**
   * Get term frequencies
   * @param {string[]} terms - Tokenized terms
   * @returns {Object} Null-prototype map of term to occurrence count, so terms
   *   such as "constructor" cannot collide with inherited Object members
   */
  getTermFrequencies(terms) {
    const freqs = Object.create(null);
    for (const term of terms) {
      freqs[term] = (freqs[term] || 0) + 1;
    }
    return freqs;
  }

  /**
   * Create term vector for similarity calculation
   * @param {string[]} terms - Tokenized terms
   * @returns {Object} Null-prototype map of term to frequency divided by the
   *   total number of terms
   */
  createTermVector(terms) {
    const vector = Object.create(null);

    // Simple TF-IDF approximation
    for (const [term, freq] of Object.entries(this.getTermFrequencies(terms))) {
      vector[term] = freq / terms.length; // Normalized frequency
    }

    return vector;
  }

  /**
   * Compute cosine similarity between two term vectors
   * @param {Object} vectorA - Map of term to weight
   * @param {Object} vectorB - Map of term to weight
   * @returns {number} Cosine similarity, or 0 when either vector has zero magnitude
   */
  cosineSimilarity(vectorA, vectorB) {
    const hasOwn = Object.prototype.hasOwnProperty;
    // Only own, finite entries count; this keeps plain-object vectors from
    // picking up inherited members for terms like "constructor".
    const weightOf = (vector, term) => {
      if (vector == null || !hasOwn.call(vector, term)) {
        return 0;
      }
      const weight = Number(vector[term]);
      return Number.isFinite(weight) ? weight : 0;
    };

    const allTerms = new Set([
      ...Object.keys(vectorA ?? {}),
      ...Object.keys(vectorB ?? {})
    ]);

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (const term of allTerms) {
      const a = weightOf(vectorA, term);
      const b = weightOf(vectorB, term);

      dotProduct += a * b;
      normA += a * a;
      normB += b * b;
    }

    const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
    return magnitude === 0 ? 0 : dotProduct / magnitude;
  }

  /**
   * Generate hash of results for caching
   * @param {Array} results - Search results
   * @returns {string} Hash of the `link` (or `url`) values joined in order
   */
  hashResults(results) {
    const key = results.map(r => r?.link || r?.url).join('|');
    return this.simpleHash(key);
  }

  /**
   * Simple hash function
   * @param {string} str - Input string
   * @returns {string} Signed 32-bit hash rendered in decimal
   */
  simpleHash(str) {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      const char = str.charCodeAt(i);
      hash = ((hash << 5) - hash) + char;
      hash |= 0; // Convert to 32-bit integer
    }
    return hash.toString();
  }

  /**
   * Get ranking statistics
   * @returns {Object} Cache statistics (or null when caching is off), number of
   *   domain authority entries and the active configuration, with
   *   `domainBoosts` reported as a count
   */
  getStats() {
    return {
      cacheStats: this.cache ? this.cache.getStats() : null,
      domainAuthorityEntries: this.domainAuthorityMap.size,
      configuration: {
        weights: this.options.weights,
        bm25: this.options.bm25,
        authority: {
          ...this.options.authority,
          domainBoosts: Object.keys(this.options.authority?.domainBoosts ?? {}).length
        },
        freshness: this.options.freshness
      }
    };
  }

  /**
   * Update ranking weights dynamically
   * @param {Object} newWeights - Partial weight map merged over the current weights
   * @throws {RangeError} If a merged weight is not a finite number or the
   *   weights do not sum to a positive value; the current weights are kept
   */
  updateWeights(newWeights) {
    const merged = { ...this.options.weights, ...newWeights };

    for (const [name, weight] of Object.entries(merged)) {
      if (typeof weight !== 'number' || !Number.isFinite(weight)) {
        throw new RangeError(`Ranking weight "${name}" must be a finite number`);
      }
    }

    const total = Object.values(merged).reduce((sum, w) => sum + w, 0);
    if (!(total > 0)) {
      // Normalizing by a zero or negative total would yield NaN, Infinity or
      // sign-flipped weights.
      throw new RangeError('Ranking weights must sum to a positive value');
    }

    // Ensure weights sum to 1. A tolerance is used because sums such as
    // 0.4 + 0.3 + 0.2 + 0.1 are not exactly 1 in floating point.
    if (Math.abs(total - 1) > 1e-9) {
      for (const name of Object.keys(merged)) {
        merged[name] /= total;
      }
    }

    this.options.weights = merged;
  }
}
|
|
496
|
+
|
|
497
|
+
export default ResultRanker;
|