crawlforge-mcp-server 3.0.17 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -0
- package/README.md +1 -0
- package/package.json +6 -2
- package/server.js +192 -1277
- package/src/constants/config.js +2 -1
- package/src/core/ActionExecutor.js +2 -43
- package/src/core/AuthManager.js +230 -32
- package/src/core/BrowserContextPool.js +187 -0
- package/src/core/JobManager.js +7 -5
- package/src/core/LocalizationManager.js +14 -125
- package/src/core/ResearchOrchestrator.js +86 -5
- package/src/core/StealthBrowserManager.js +26 -18
- package/src/core/cache/CacheManager.js +4 -1
- package/src/core/crawlers/BFSCrawler.js +19 -5
- package/src/core/endpointGuard.js +37 -0
- package/src/observability/metrics.js +137 -0
- package/src/observability/tracing.js +74 -0
- package/src/server/auth/oauth.js +388 -0
- package/src/server/registerTool.js +41 -0
- package/src/server/schemas/common.js +29 -0
- package/src/server/transports/http.js +22 -0
- package/src/server/transports/stdio.js +16 -0
- package/src/server/transports/streamableHttp.js +226 -0
- package/src/server/withAuth.js +121 -0
- package/src/tools/advanced/BatchScrapeTool.js +12 -1086
- package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
- package/src/tools/advanced/batchScrape/index.js +328 -0
- package/src/tools/advanced/batchScrape/queue.js +91 -0
- package/src/tools/advanced/batchScrape/reporter.js +26 -0
- package/src/tools/advanced/batchScrape/schema.js +37 -0
- package/src/tools/advanced/batchScrape/worker.js +179 -0
- package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
- package/src/tools/basic/_fetch.js +35 -0
- package/src/tools/basic/extractLinks.js +74 -0
- package/src/tools/basic/extractMetadata.js +74 -0
- package/src/tools/basic/extractText.js +46 -0
- package/src/tools/basic/fetchUrl.js +44 -0
- package/src/tools/basic/scrapeStructured.js +58 -0
- package/src/tools/crawl/_sessionContext.js +234 -0
- package/src/tools/crawl/crawlDeep.js +55 -5
- package/src/tools/crawl/mapSite.js +23 -2
- package/src/tools/extract/_fetchAndParse.js +57 -0
- package/src/tools/extract/extractStructured.js +3 -19
- package/src/tools/extract/extractWithLlm.js +295 -0
- package/src/tools/research/deepResearch.js +33 -8
- package/src/tools/search/providers/searxng.js +126 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
- package/src/tools/search/ranking/ResultRanker.js +17 -10
- package/src/tools/search/ranking/SearchResultCache.js +52 -0
- package/src/tools/search/searchWeb.js +112 -6
- package/src/tools/tracking/trackChanges/differ.js +98 -0
- package/src/tools/tracking/trackChanges/index.js +432 -0
- package/src/tools/tracking/trackChanges/monitor.js +93 -0
- package/src/tools/tracking/trackChanges/notifier.js +105 -0
- package/src/tools/tracking/trackChanges/schema.js +127 -0
- package/src/tools/tracking/trackChanges.js +12 -1374
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SearchResultCache — unified cache layer for search ranking and deduplication.
|
|
3
|
+
*
|
|
4
|
+
* Both ResultRanker and ResultDeduplicator previously held separate CacheManager
|
|
5
|
+
* instances with identical TTL configuration. This module provides a single
|
|
6
|
+
* shared cache they can both use, halving the number of LRU cache instances
|
|
7
|
+
* created per SearchWebTool instantiation.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* const cache = new SearchResultCache({ ttl: 3600000 });
|
|
11
|
+
* // pass to ResultRanker and ResultDeduplicator via options.sharedCache
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { CacheManager } from '../../../core/cache/CacheManager.js';
|
|
15
|
+
|
|
16
|
+
export class SearchResultCache {
  /**
   * Build the shared cache wrapper.
   *
   * When `enabled` is falsy no CacheManager is created and every method on
   * this wrapper degrades to a no-op / null result.
   *
   * @param {Object} [options]
   * @param {number} [options.ttl=3600000] — cache TTL in milliseconds
   * @param {boolean} [options.enabled=true] — disable to skip caching
   */
  constructor(options = {}) {
    const ttl = options.ttl === undefined ? 3600000 : options.ttl;
    const enabled = options.enabled === undefined ? true : options.enabled;

    this.enabled = enabled;
    this._cache = null;
    if (enabled) {
      this._cache = new CacheManager({ ttl });
    }
  }

  /**
   * Look up a cached value.
   * @param {*} key — key previously produced for this cache
   * @returns {Promise<*>} whatever the underlying cache returns, or undefined
   *   when caching is disabled
   */
  async get(key) {
    if (this.enabled && this._cache) {
      return this._cache.get(key);
    }
    return undefined;
  }

  /**
   * Store a value under the given key (no-op when caching is disabled).
   * @param {*} key
   * @param {*} value
   * @returns {Promise<*>} the underlying cache's result, or undefined when disabled
   */
  async set(key, value) {
    if (this.enabled && this._cache) {
      return this._cache.set(key, value);
    }
    return undefined;
  }

  /**
   * Delegate key generation to the underlying cache.
   * @param {*} namespace
   * @param {*} descriptor
   * @returns {*} the generated key, or null when there is no underlying cache
   */
  generateKey(namespace, descriptor) {
    if (this._cache) {
      return this._cache.generateKey(namespace, descriptor);
    }
    return null;
  }

  /**
   * Expose the underlying cache statistics.
   * @returns {*} stats from the underlying cache, or null when there is none
   */
  getStats() {
    if (this._cache) {
      return this._cache.getStats();
    }
    return null;
  }
}
|
|
51
|
+
|
|
52
|
+
export default SearchResultCache;
|
|
@@ -4,11 +4,14 @@ import { CacheManager } from '../../core/cache/CacheManager.js';
|
|
|
4
4
|
import { QueryExpander } from './queryExpander.js';
|
|
5
5
|
import { ResultRanker } from './ranking/ResultRanker.js';
|
|
6
6
|
import { ResultDeduplicator } from './ranking/ResultDeduplicator.js';
|
|
7
|
+
import { SearchResultCache } from './ranking/SearchResultCache.js';
|
|
7
8
|
import LocalizationManager from '../../core/LocalizationManager.js';
|
|
8
9
|
import { isCreatorModeVerified } from '../../core/creatorMode.js';
|
|
10
|
+
import { searchViaSearxng } from './providers/searxng.js';
|
|
9
11
|
|
|
10
12
|
const SearchWebSchema = z.object({
|
|
11
13
|
query: z.string().min(1),
|
|
14
|
+
provider: z.enum(['crawlforge', 'searxng']).optional().default('crawlforge'),
|
|
12
15
|
limit: z.number().min(1).max(100).optional().default(10),
|
|
13
16
|
offset: z.number().min(0).optional().default(0),
|
|
14
17
|
lang: z.string().optional().default('en'),
|
|
@@ -92,13 +95,16 @@ export class SearchWebTool {
|
|
|
92
95
|
}
|
|
93
96
|
|
|
94
97
|
this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
|
|
95
|
-
|
|
98
|
+
|
|
96
99
|
// Initialize query expander
|
|
97
100
|
this.queryExpander = new QueryExpander(expanderOptions);
|
|
98
|
-
|
|
99
|
-
//
|
|
100
|
-
|
|
101
|
-
|
|
101
|
+
|
|
102
|
+
// Shared cache for ranking + deduplication — avoids two separate LRU instances
|
|
103
|
+
const sharedRankingCache = new SearchResultCache({ ttl: cacheTTL, enabled: cacheEnabled });
|
|
104
|
+
|
|
105
|
+
// Initialize ranking and deduplication systems (both share the same cache)
|
|
106
|
+
this.resultRanker = new ResultRanker({ cacheEnabled, cacheTTL, sharedCache: sharedRankingCache, ...rankingOptions });
|
|
107
|
+
this.resultDeduplicator = new ResultDeduplicator({ cacheEnabled, cacheTTL, sharedCache: sharedRankingCache, ...deduplicationOptions });
|
|
102
108
|
|
|
103
109
|
// Initialize localization manager
|
|
104
110
|
this.localizationManager = new LocalizationManager({
|
|
@@ -110,7 +116,13 @@ export class SearchWebTool {
|
|
|
110
116
|
async execute(params) {
|
|
111
117
|
try {
|
|
112
118
|
const validated = SearchWebSchema.parse(params);
|
|
113
|
-
|
|
119
|
+
|
|
120
|
+
// --- SearXNG provider short-circuit ---
|
|
121
|
+
if (validated.provider === 'searxng') {
|
|
122
|
+
return await this._executeViaSearxng(validated);
|
|
123
|
+
}
|
|
124
|
+
// --- end SearXNG short-circuit ---
|
|
125
|
+
|
|
114
126
|
// Apply localization if specified
|
|
115
127
|
let localizedParams = validated;
|
|
116
128
|
if (validated.localization) {
|
|
@@ -336,6 +348,100 @@ export class SearchWebTool {
|
|
|
336
348
|
}
|
|
337
349
|
}
|
|
338
350
|
|
|
351
|
+
/**
|
|
352
|
+
* Execute search via a self-hosted SearXNG instance.
|
|
353
|
+
* Results are normalised to the same shape as the CrawlForge/Google path.
|
|
354
|
+
*
|
|
355
|
+
* @param {Object} validated - Parsed & validated parameters from SearchWebSchema
|
|
356
|
+
* @returns {Promise<Object>} Standard search_web response object
|
|
357
|
+
*/
|
|
358
|
+
async _executeViaSearxng(validated) {
|
|
359
|
+
// page is 1-based; offset is 0-based items, so map via limit
|
|
360
|
+
const page = Math.floor(validated.offset / validated.limit) + 1;
|
|
361
|
+
|
|
362
|
+
const adapterResult = await searchViaSearxng({
|
|
363
|
+
query: validated.query,
|
|
364
|
+
limit: validated.limit,
|
|
365
|
+
page,
|
|
366
|
+
safeSearch: validated.safe_search,
|
|
367
|
+
language: validated.lang
|
|
368
|
+
});
|
|
369
|
+
|
|
370
|
+
// Run through shared post-processing (deduplication, ranking)
|
|
371
|
+
let processedResults = await this.processResults(adapterResult);
|
|
372
|
+
|
|
373
|
+
let deduplicationInfo = null;
|
|
374
|
+
if (validated.enable_deduplication && processedResults.length > 1) {
|
|
375
|
+
const dedupeOptions = validated.deduplication_thresholds
|
|
376
|
+
? { thresholds: validated.deduplication_thresholds }
|
|
377
|
+
: {};
|
|
378
|
+
const originalCount = processedResults.length;
|
|
379
|
+
processedResults = await this.resultDeduplicator.deduplicateResults(
|
|
380
|
+
processedResults,
|
|
381
|
+
dedupeOptions
|
|
382
|
+
);
|
|
383
|
+
deduplicationInfo = {
|
|
384
|
+
originalCount,
|
|
385
|
+
finalCount: processedResults.length,
|
|
386
|
+
duplicatesRemoved: originalCount - processedResults.length,
|
|
387
|
+
deduplicationRate:
|
|
388
|
+
((originalCount - processedResults.length) / originalCount * 100).toFixed(1) + '%'
|
|
389
|
+
};
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
let rankingInfo = null;
|
|
393
|
+
if (validated.enable_ranking && processedResults.length > 1) {
|
|
394
|
+
const rankingOptions = validated.ranking_weights
|
|
395
|
+
? { weights: validated.ranking_weights }
|
|
396
|
+
: {};
|
|
397
|
+
processedResults = await this.resultRanker.rankResults(
|
|
398
|
+
processedResults,
|
|
399
|
+
validated.query,
|
|
400
|
+
rankingOptions
|
|
401
|
+
);
|
|
402
|
+
rankingInfo = {
|
|
403
|
+
algorithmsUsed: ['bm25', 'semantic', 'authority', 'freshness'],
|
|
404
|
+
weightsApplied: this.resultRanker.options.weights,
|
|
405
|
+
totalResults: processedResults.length
|
|
406
|
+
};
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
if (!validated.include_ranking_details) {
|
|
410
|
+
processedResults = processedResults.map(({ rankingDetails, ...r }) => r);
|
|
411
|
+
}
|
|
412
|
+
if (!validated.include_deduplication_details) {
|
|
413
|
+
processedResults = processedResults.map(({ deduplicationInfo: _d, ...r }) => r);
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
return {
|
|
417
|
+
query: validated.query,
|
|
418
|
+
results: processedResults,
|
|
419
|
+
total_results: adapterResult.searchInformation?.totalResults || 0,
|
|
420
|
+
search_time: adapterResult.searchInformation?.searchTime || 0,
|
|
421
|
+
offset: validated.offset,
|
|
422
|
+
limit: validated.limit,
|
|
423
|
+
cached: false,
|
|
424
|
+
provider: {
|
|
425
|
+
name: 'searxng',
|
|
426
|
+
backend: 'SearXNG (self-hosted)',
|
|
427
|
+
instanceUrl: process.env.CRAWLFORGE_SEARXNG_URL || null,
|
|
428
|
+
capabilities: {
|
|
429
|
+
requiresApiKey: false,
|
|
430
|
+
supportsPagination: true,
|
|
431
|
+
supportsLanguageFilter: true,
|
|
432
|
+
supportsSafeSearch: true
|
|
433
|
+
}
|
|
434
|
+
},
|
|
435
|
+
localization: null,
|
|
436
|
+
processing: {
|
|
437
|
+
ranking: rankingInfo,
|
|
438
|
+
deduplication: deduplicationInfo,
|
|
439
|
+
query_expansion: null,
|
|
440
|
+
localization_applied: false
|
|
441
|
+
}
|
|
442
|
+
};
|
|
443
|
+
}
|
|
444
|
+
|
|
339
445
|
async processResults(searchResults) {
|
|
340
446
|
if (!searchResults.items || searchResults.items.length === 0) {
|
|
341
447
|
return [];
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TrackChanges — differ module.
|
|
3
|
+
* URL content fetching and history/stat helper functions.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * Fetch the HTML/text content of a URL with change-tracking headers.
 *
 * The request is sent with `Cache-Control: no-cache` and is aborted once the
 * timeout elapses. Any failure (network error, timeout, non-2xx status) is
 * re-thrown as a single wrapped Error so callers only need one catch path.
 *
 * @param {string} url - URL to request.
 * @param {Object} [options]
 * @param {number} [options.timeoutMs=30000] - Abort the request after this many milliseconds.
 * @returns {Promise<{ content: string, metadata: Object }>} The response body
 *   as text plus status code, selected response headers and the fetch time.
 * @throws {Error} "Failed to fetch content: …"; the underlying error is
 *   available as `error.cause`.
 */
export async function fetchContent(url, { timeoutMs = 30000 } = {}) {
  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': 'MCP-WebScraper-ChangeTracker/3.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Cache-Control': 'no-cache'
      },
      // The standard fetch() API has no `timeout` option (it is silently
      // ignored), so the deadline has to be enforced through an AbortSignal.
      signal: AbortSignal.timeout(timeoutMs)
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const content = await response.text();

    return {
      content,
      metadata: {
        statusCode: response.status,
        contentType: response.headers.get('content-type'),
        // Length of the decoded string, not the Content-Length header value.
        contentLength: content.length,
        lastModified: response.headers.get('last-modified'),
        etag: response.headers.get('etag'),
        fetchedAt: Date.now()
      }
    };
  } catch (error) {
    // Keep the original error reachable for debugging while preserving the
    // message format callers already see.
    throw new Error(`Failed to fetch content: ${error.message}`, { cause: error });
  }
}
|
|
45
|
+
|
|
46
|
+
/**
 * Combine change-tracker history with snapshot history into one timeline.
 *
 * Every change entry is copied and tagged `source: 'change_tracker'`. Each
 * snapshot entry is attached to the first already-collected entry whose
 * timestamp lies within 60 s of it (that entry gets `hasSnapshot: true` and
 * the snapshot's `snapshotId`); otherwise it is added as its own entry tagged
 * `source: 'snapshot'`. Input entries are copied, not mutated.
 *
 * @param {Array<Object>} changeHistory - entries with a numeric `timestamp`
 * @param {Array<Object>} snapshotHistory - entries with `timestamp` and `snapshotId`
 * @returns {Array<Object>} merged entries, newest first
 */
export function mergeHistoryData(changeHistory, snapshotHistory) {
  const PROXIMITY_MS = 60000;
  const timeline = [];

  for (const change of changeHistory) {
    timeline.push({ ...change, source: 'change_tracker', hasSnapshot: false });
  }

  for (const snapshot of snapshotHistory) {
    const nearby = timeline.find(
      (candidate) => Math.abs(candidate.timestamp - snapshot.timestamp) < PROXIMITY_MS
    );

    if (!nearby) {
      timeline.push({ ...snapshot, source: 'snapshot', hasSnapshot: true });
      continue;
    }

    nearby.hasSnapshot = true;
    nearby.snapshotId = snapshot.snapshotId;
  }

  timeline.sort((left, right) => right.timestamp - left.timestamp);
  return timeline;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Check whether an entry's significance is at or above the filter level.
 *
 * Levels rank none < minor < moderate < major < critical. A missing
 * significance counts as 'none'. Any value outside that list ranks below
 * 'none', so an unrecognised filter lets every entry through.
 *
 * @param {{ significance?: string }} entry
 * @param {string} filter
 * @returns {boolean}
 */
export function matchesSignificanceFilter(entry, filter) {
  const rankByLevel = new Map([
    ['none', 0],
    ['minor', 1],
    ['moderate', 2],
    ['major', 3],
    ['critical', 4]
  ]);
  const rankOf = (level) => (rankByLevel.has(level) ? rankByLevel.get(level) : -1);

  const entryRank = rankOf(entry.significance || 'none');
  const requiredRank = rankOf(filter);
  return entryRank >= requiredRank;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Check whether a significance level reaches the notification threshold.
 *
 * Levels rank none < minor < moderate < major < critical; any value not in
 * that list (including undefined) ranks below 'none'.
 *
 * @param {string} significance
 * @param {string} threshold
 * @returns {boolean} true when significance ranks at or above threshold
 */
export function meetsNotificationThreshold(significance, threshold) {
  const ladder = ['none', 'minor', 'moderate', 'major', 'critical'];
  const [actualRank, requiredRank] = [significance, threshold].map((level) =>
    ladder.indexOf(level)
  );
  return actualRank >= requiredRank;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Average gap between consecutive history entries.
 *
 * Each gap is computed as (earlier index).timestamp − (later index).timestamp,
 * so the result is positive when the list is ordered newest-first.
 *
 * @param {Array<{ timestamp: number }>} changeHistory
 * @returns {number|null} mean gap in the timestamps' unit, or null when there
 *   are fewer than two entries
 */
export function calculateAverageInterval(changeHistory) {
  const gapCount = changeHistory.length - 1;
  if (gapCount < 1) return null;

  const gaps = Array.from(
    { length: gapCount },
    (_, index) => changeHistory[index].timestamp - changeHistory[index + 1].timestamp
  );
  const sumOfGaps = gaps.reduce((running, gap) => running + gap, 0);

  return sumOfGaps / gapCount;
}
|
|
90
|
+
|
|
91
|
+
/**
 * Count history entries per significance level.
 *
 * Entries without a significance are counted as 'none'; entries whose
 * significance is not one of the five known levels are ignored.
 *
 * @param {Array<{ significance?: string }>} changeHistory
 * @returns {{ none: number, minor: number, moderate: number, major: number, critical: number }}
 */
export function calculateSignificanceDistribution(changeHistory) {
  const emptyCounts = { none: 0, minor: 0, moderate: 0, major: 0, critical: 0 };

  return changeHistory.reduce((counts, entry) => {
    const level = entry.significance || 'none';
    // Own-property check so inherited names such as 'toString' are not counted.
    if (Object.prototype.hasOwnProperty.call(counts, level)) {
      counts[level] += 1;
    }
    return counts;
  }, emptyCounts);
}
|