crawlforge-mcp-server 3.0.17 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/CLAUDE.md +2 -0
  2. package/README.md +1 -0
  3. package/package.json +6 -2
  4. package/server.js +192 -1277
  5. package/src/constants/config.js +2 -1
  6. package/src/core/ActionExecutor.js +2 -43
  7. package/src/core/AuthManager.js +230 -32
  8. package/src/core/BrowserContextPool.js +187 -0
  9. package/src/core/JobManager.js +7 -5
  10. package/src/core/LocalizationManager.js +14 -125
  11. package/src/core/ResearchOrchestrator.js +86 -5
  12. package/src/core/StealthBrowserManager.js +26 -18
  13. package/src/core/cache/CacheManager.js +4 -1
  14. package/src/core/crawlers/BFSCrawler.js +19 -5
  15. package/src/core/endpointGuard.js +37 -0
  16. package/src/observability/metrics.js +137 -0
  17. package/src/observability/tracing.js +74 -0
  18. package/src/server/auth/oauth.js +388 -0
  19. package/src/server/registerTool.js +41 -0
  20. package/src/server/schemas/common.js +29 -0
  21. package/src/server/transports/http.js +22 -0
  22. package/src/server/transports/stdio.js +16 -0
  23. package/src/server/transports/streamableHttp.js +226 -0
  24. package/src/server/withAuth.js +121 -0
  25. package/src/tools/advanced/BatchScrapeTool.js +12 -1086
  26. package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
  27. package/src/tools/advanced/batchScrape/index.js +328 -0
  28. package/src/tools/advanced/batchScrape/queue.js +91 -0
  29. package/src/tools/advanced/batchScrape/reporter.js +26 -0
  30. package/src/tools/advanced/batchScrape/schema.js +37 -0
  31. package/src/tools/advanced/batchScrape/worker.js +179 -0
  32. package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
  33. package/src/tools/basic/_fetch.js +35 -0
  34. package/src/tools/basic/extractLinks.js +74 -0
  35. package/src/tools/basic/extractMetadata.js +74 -0
  36. package/src/tools/basic/extractText.js +46 -0
  37. package/src/tools/basic/fetchUrl.js +44 -0
  38. package/src/tools/basic/scrapeStructured.js +58 -0
  39. package/src/tools/crawl/_sessionContext.js +234 -0
  40. package/src/tools/crawl/crawlDeep.js +55 -5
  41. package/src/tools/crawl/mapSite.js +23 -2
  42. package/src/tools/extract/_fetchAndParse.js +57 -0
  43. package/src/tools/extract/extractStructured.js +3 -19
  44. package/src/tools/extract/extractWithLlm.js +295 -0
  45. package/src/tools/research/deepResearch.js +33 -8
  46. package/src/tools/search/providers/searxng.js +126 -0
  47. package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
  48. package/src/tools/search/ranking/ResultRanker.js +17 -10
  49. package/src/tools/search/ranking/SearchResultCache.js +52 -0
  50. package/src/tools/search/searchWeb.js +112 -6
  51. package/src/tools/tracking/trackChanges/differ.js +98 -0
  52. package/src/tools/tracking/trackChanges/index.js +432 -0
  53. package/src/tools/tracking/trackChanges/monitor.js +93 -0
  54. package/src/tools/tracking/trackChanges/notifier.js +105 -0
  55. package/src/tools/tracking/trackChanges/schema.js +127 -0
  56. package/src/tools/tracking/trackChanges.js +12 -1374
@@ -0,0 +1,52 @@
1
+ /**
2
+ * SearchResultCache — unified cache layer for search ranking and deduplication.
3
+ *
4
+ * Both ResultRanker and ResultDeduplicator previously held separate CacheManager
5
+ * instances with identical TTL configuration. This module provides a single
6
+ * shared cache they can both use, halving the number of LRU cache instances
7
+ * created per SearchWebTool instantiation.
8
+ *
9
+ * Usage:
10
+ * const cache = new SearchResultCache({ ttl: 3600000 });
11
+ * // pass to ResultRanker and ResultDeduplicator via options.sharedCache
12
+ */
13
+
14
+ import { CacheManager } from '../../../core/cache/CacheManager.js';
15
+
16
export class SearchResultCache {
  /**
   * Wrap a single CacheManager so ranking and deduplication can share it.
   *
   * @param {Object} [options]
   * @param {number} [options.ttl=3600000] - TTL in milliseconds, forwarded to CacheManager
   * @param {boolean} [options.enabled=true] - when falsy, no CacheManager is created
   */
  constructor({ ttl = 3600000, enabled = true } = {}) {
    this.enabled = enabled;
    this._cache = null;
    if (enabled) {
      this._cache = new CacheManager({ ttl });
    }
  }

  /**
   * Look up a key in the underlying cache.
   * Resolves to undefined when caching is disabled or no cache exists;
   * otherwise resolves to whatever the underlying cache's get() yields.
   */
  async get(key) {
    const isActive = this.enabled && this._cache;
    return isActive ? this._cache.get(key) : undefined;
  }

  /**
   * Store a value under a key in the underlying cache.
   * A no-op (resolving to undefined) when caching is disabled or no cache exists.
   */
  async set(key, value) {
    if (this.enabled && this._cache) {
      return this._cache.set(key, value);
    }
    return undefined;
  }

  /**
   * Delegate key generation to the underlying cache.
   * Returns null when there is no underlying cache.
   */
  generateKey(namespace, descriptor) {
    return this._cache ? this._cache.generateKey(namespace, descriptor) : null;
  }

  /** Stats reported by the underlying cache, or null when there is none. */
  getStats() {
    if (!this._cache) {
      return null;
    }
    return this._cache.getStats();
  }
}
51
+
52
+ export default SearchResultCache;
@@ -4,11 +4,14 @@ import { CacheManager } from '../../core/cache/CacheManager.js';
4
4
  import { QueryExpander } from './queryExpander.js';
5
5
  import { ResultRanker } from './ranking/ResultRanker.js';
6
6
  import { ResultDeduplicator } from './ranking/ResultDeduplicator.js';
7
+ import { SearchResultCache } from './ranking/SearchResultCache.js';
7
8
  import LocalizationManager from '../../core/LocalizationManager.js';
8
9
  import { isCreatorModeVerified } from '../../core/creatorMode.js';
10
+ import { searchViaSearxng } from './providers/searxng.js';
9
11
 
10
12
  const SearchWebSchema = z.object({
11
13
  query: z.string().min(1),
14
+ provider: z.enum(['crawlforge', 'searxng']).optional().default('crawlforge'),
12
15
  limit: z.number().min(1).max(100).optional().default(10),
13
16
  offset: z.number().min(0).optional().default(0),
14
17
  lang: z.string().optional().default('en'),
@@ -92,13 +95,16 @@ export class SearchWebTool {
92
95
  }
93
96
 
94
97
  this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
95
-
98
+
96
99
  // Initialize query expander
97
100
  this.queryExpander = new QueryExpander(expanderOptions);
98
-
99
- // Initialize ranking and deduplication systems
100
- this.resultRanker = new ResultRanker({ cacheEnabled, cacheTTL, ...rankingOptions });
101
- this.resultDeduplicator = new ResultDeduplicator({ cacheEnabled, cacheTTL, ...deduplicationOptions });
101
+
102
+ // Shared cache for ranking + deduplication — avoids two separate LRU instances
103
+ const sharedRankingCache = new SearchResultCache({ ttl: cacheTTL, enabled: cacheEnabled });
104
+
105
+ // Initialize ranking and deduplication systems (both share the same cache)
106
+ this.resultRanker = new ResultRanker({ cacheEnabled, cacheTTL, sharedCache: sharedRankingCache, ...rankingOptions });
107
+ this.resultDeduplicator = new ResultDeduplicator({ cacheEnabled, cacheTTL, sharedCache: sharedRankingCache, ...deduplicationOptions });
102
108
 
103
109
  // Initialize localization manager
104
110
  this.localizationManager = new LocalizationManager({
@@ -110,7 +116,13 @@ export class SearchWebTool {
110
116
  async execute(params) {
111
117
  try {
112
118
  const validated = SearchWebSchema.parse(params);
113
-
119
+
120
+ // --- SearXNG provider short-circuit ---
121
+ if (validated.provider === 'searxng') {
122
+ return await this._executeViaSearxng(validated);
123
+ }
124
+ // --- end SearXNG short-circuit ---
125
+
114
126
  // Apply localization if specified
115
127
  let localizedParams = validated;
116
128
  if (validated.localization) {
@@ -336,6 +348,100 @@ export class SearchWebTool {
336
348
  }
337
349
  }
338
350
 
351
+ /**
352
+ * Execute search via a self-hosted SearXNG instance.
353
+ * Results are normalised to the same shape as the CrawlForge/Google path.
354
+ *
355
+ * @param {Object} validated - Parsed & validated parameters from SearchWebSchema
356
+ * @returns {Promise<Object>} Standard search_web response object
357
+ */
358
+ async _executeViaSearxng(validated) {
359
+ // page is 1-based; offset is 0-based items, so map via limit
360
+ const page = Math.floor(validated.offset / validated.limit) + 1;
361
+
362
+ const adapterResult = await searchViaSearxng({
363
+ query: validated.query,
364
+ limit: validated.limit,
365
+ page,
366
+ safeSearch: validated.safe_search,
367
+ language: validated.lang
368
+ });
369
+
370
+ // Run through shared post-processing (deduplication, ranking)
371
+ let processedResults = await this.processResults(adapterResult);
372
+
373
+ let deduplicationInfo = null;
374
+ if (validated.enable_deduplication && processedResults.length > 1) {
375
+ const dedupeOptions = validated.deduplication_thresholds
376
+ ? { thresholds: validated.deduplication_thresholds }
377
+ : {};
378
+ const originalCount = processedResults.length;
379
+ processedResults = await this.resultDeduplicator.deduplicateResults(
380
+ processedResults,
381
+ dedupeOptions
382
+ );
383
+ deduplicationInfo = {
384
+ originalCount,
385
+ finalCount: processedResults.length,
386
+ duplicatesRemoved: originalCount - processedResults.length,
387
+ deduplicationRate:
388
+ ((originalCount - processedResults.length) / originalCount * 100).toFixed(1) + '%'
389
+ };
390
+ }
391
+
392
+ let rankingInfo = null;
393
+ if (validated.enable_ranking && processedResults.length > 1) {
394
+ const rankingOptions = validated.ranking_weights
395
+ ? { weights: validated.ranking_weights }
396
+ : {};
397
+ processedResults = await this.resultRanker.rankResults(
398
+ processedResults,
399
+ validated.query,
400
+ rankingOptions
401
+ );
402
+ rankingInfo = {
403
+ algorithmsUsed: ['bm25', 'semantic', 'authority', 'freshness'],
404
+ weightsApplied: this.resultRanker.options.weights,
405
+ totalResults: processedResults.length
406
+ };
407
+ }
408
+
409
+ if (!validated.include_ranking_details) {
410
+ processedResults = processedResults.map(({ rankingDetails, ...r }) => r);
411
+ }
412
+ if (!validated.include_deduplication_details) {
413
+ processedResults = processedResults.map(({ deduplicationInfo: _d, ...r }) => r);
414
+ }
415
+
416
+ return {
417
+ query: validated.query,
418
+ results: processedResults,
419
+ total_results: adapterResult.searchInformation?.totalResults || 0,
420
+ search_time: adapterResult.searchInformation?.searchTime || 0,
421
+ offset: validated.offset,
422
+ limit: validated.limit,
423
+ cached: false,
424
+ provider: {
425
+ name: 'searxng',
426
+ backend: 'SearXNG (self-hosted)',
427
+ instanceUrl: process.env.CRAWLFORGE_SEARXNG_URL || null,
428
+ capabilities: {
429
+ requiresApiKey: false,
430
+ supportsPagination: true,
431
+ supportsLanguageFilter: true,
432
+ supportsSafeSearch: true
433
+ }
434
+ },
435
+ localization: null,
436
+ processing: {
437
+ ranking: rankingInfo,
438
+ deduplication: deduplicationInfo,
439
+ query_expansion: null,
440
+ localization_applied: false
441
+ }
442
+ };
443
+ }
444
+
339
445
  async processResults(searchResults) {
340
446
  if (!searchResults.items || searchResults.items.length === 0) {
341
447
  return [];
@@ -0,0 +1,98 @@
1
+ /**
2
+ * TrackChanges — differ module.
3
+ * URL content fetching and history/stat helper functions.
4
+ */
5
+
6
/**
 * Fetch the HTML/text content of a URL with change-tracking headers.
 *
 * The request is aborted if it has not completed within 30 seconds.
 *
 * @param {string} url - Address to download
 * @returns {Promise<{ content: string, metadata: Object }>} Body text plus
 *   status code, selected response headers, body length and fetch time
 * @throws {Error} "Failed to fetch content: …" on network failure, timeout,
 *   or a non-2xx status; the underlying error is attached as `cause`
 */
export async function fetchContent(url) {
  // The Fetch API has no `timeout` option (an unknown init key is silently
  // ignored), so the limit must be enforced through an abort signal.
  const FETCH_TIMEOUT_MS = 30000;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': 'MCP-WebScraper-ChangeTracker/3.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Cache-Control': 'no-cache'
      },
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS)
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const content = await response.text();

    return {
      content,
      metadata: {
        statusCode: response.status,
        contentType: response.headers.get('content-type'),
        // Length of the decoded string (UTF-16 code units), not the byte size.
        contentLength: content.length,
        lastModified: response.headers.get('last-modified'),
        etag: response.headers.get('etag'),
        fetchedAt: Date.now()
      }
    };
  } catch (error) {
    // Keep the original error reachable for callers that need the details.
    throw new Error(`Failed to fetch content: ${error.message}`, { cause: error });
  }
}
45
+
46
/**
 * Combine change-tracker entries and snapshot entries into one timeline.
 *
 * Every change-tracker entry is copied in first. A snapshot entry whose
 * timestamp lies within 60 s of an entry already in the timeline is folded
 * into that entry (marking it as having a snapshot); otherwise it is added
 * as its own entry. The result is ordered by timestamp, largest first.
 * Input entries are copied, not mutated.
 */
export function mergeHistoryData(changeHistory, snapshotHistory) {
  const PROXIMITY_MS = 60000;
  const timeline = [];

  for (const change of changeHistory) {
    timeline.push({ ...change, source: 'change_tracker', hasSnapshot: false });
  }

  for (const snapshot of snapshotHistory) {
    const nearby = timeline.find(
      (candidate) => Math.abs(candidate.timestamp - snapshot.timestamp) < PROXIMITY_MS
    );
    if (!nearby) {
      timeline.push({ ...snapshot, source: 'snapshot', hasSnapshot: true });
      continue;
    }
    nearby.hasSnapshot = true;
    nearby.snapshotId = snapshot.snapshotId;
  }

  timeline.sort((left, right) => right.timestamp - left.timestamp);
  return timeline;
}
69
+
70
/**
 * Check whether an entry's significance ranks at or above a filter level.
 * A missing/falsy significance is treated as 'none'. Levels are compared by
 * their position in the ordered list none < minor < moderate < major < critical;
 * a value not in the list has position -1.
 */
export function matchesSignificanceFilter(entry, filter) {
  const ordered = ['none', 'minor', 'moderate', 'major', 'critical'];
  const entryRank = ordered.indexOf(entry.significance || 'none');
  const requiredRank = ordered.indexOf(filter);
  return entryRank >= requiredRank;
}
75
+
76
/**
 * Check whether a significance level ranks at or above a notification threshold.
 * Both values are compared by their position in the ordered list
 * none < minor < moderate < major < critical; a value not in the list has
 * position -1.
 */
export function meetsNotificationThreshold(significance, threshold) {
  const ordered = ['none', 'minor', 'moderate', 'major', 'critical'];
  const actualRank = ordered.indexOf(significance);
  const thresholdRank = ordered.indexOf(threshold);
  return actualRank >= thresholdRank;
}
81
+
82
/**
 * Average gap between consecutive history entries.
 *
 * Each gap is the timestamp of an entry minus the timestamp of the entry
 * after it, so the result is positive when the list runs from larger to
 * smaller timestamps. Returns null when there are fewer than two entries.
 */
export function calculateAverageInterval(changeHistory) {
  const gapCount = changeHistory.length - 1;
  if (gapCount < 1) {
    return null;
  }

  let sum = 0;
  let idx = 0;
  while (idx < gapCount) {
    sum += changeHistory[idx].timestamp - changeHistory[idx + 1].timestamp;
    idx += 1;
  }
  return sum / gapCount;
}
90
+
91
/**
 * Count history entries per significance level.
 *
 * A missing/falsy significance counts as 'none'. Entries whose significance
 * is not one of the five known levels are ignored.
 *
 * @returns {{ none: number, minor: number, moderate: number, major: number, critical: number }}
 */
export function calculateSignificanceDistribution(changeHistory) {
  const counts = { none: 0, minor: 0, moderate: 0, major: 0, critical: 0 };
  for (const record of changeHistory) {
    const level = record.significance || 'none';
    // Own-property check so names like 'toString' are not treated as levels.
    if (Object.hasOwn(counts, level)) {
      counts[level] += 1;
    }
  }
  return counts;
}