crawlforge-mcp-server 3.0.18 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/package.json +5 -2
  2. package/server.js +192 -1277
  3. package/src/core/ActionExecutor.js +2 -43
  4. package/src/core/AuthManager.js +127 -14
  5. package/src/core/BrowserContextPool.js +187 -0
  6. package/src/core/JobManager.js +7 -5
  7. package/src/core/LocalizationManager.js +14 -125
  8. package/src/core/StealthBrowserManager.js +26 -18
  9. package/src/core/cache/CacheManager.js +4 -1
  10. package/src/core/crawlers/BFSCrawler.js +19 -5
  11. package/src/observability/metrics.js +137 -0
  12. package/src/observability/tracing.js +74 -0
  13. package/src/server/auth/oauth.js +388 -0
  14. package/src/server/registerTool.js +41 -0
  15. package/src/server/schemas/common.js +29 -0
  16. package/src/server/transports/http.js +22 -0
  17. package/src/server/transports/stdio.js +16 -0
  18. package/src/server/transports/streamableHttp.js +226 -0
  19. package/src/server/withAuth.js +121 -0
  20. package/src/tools/advanced/BatchScrapeTool.js +12 -1086
  21. package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
  22. package/src/tools/advanced/batchScrape/index.js +328 -0
  23. package/src/tools/advanced/batchScrape/queue.js +91 -0
  24. package/src/tools/advanced/batchScrape/reporter.js +26 -0
  25. package/src/tools/advanced/batchScrape/schema.js +37 -0
  26. package/src/tools/advanced/batchScrape/worker.js +179 -0
  27. package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
  28. package/src/tools/basic/_fetch.js +35 -0
  29. package/src/tools/basic/extractLinks.js +74 -0
  30. package/src/tools/basic/extractMetadata.js +74 -0
  31. package/src/tools/basic/extractText.js +46 -0
  32. package/src/tools/basic/fetchUrl.js +44 -0
  33. package/src/tools/basic/scrapeStructured.js +58 -0
  34. package/src/tools/crawl/_sessionContext.js +234 -0
  35. package/src/tools/crawl/crawlDeep.js +55 -5
  36. package/src/tools/crawl/mapSite.js +23 -2
  37. package/src/tools/extract/_fetchAndParse.js +57 -0
  38. package/src/tools/extract/extractStructured.js +3 -19
  39. package/src/tools/extract/extractWithLlm.js +365 -0
  40. package/src/tools/search/providers/searxng.js +126 -0
  41. package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
  42. package/src/tools/search/ranking/ResultRanker.js +17 -10
  43. package/src/tools/search/ranking/SearchResultCache.js +52 -0
  44. package/src/tools/search/searchWeb.js +112 -6
  45. package/src/tools/tracking/trackChanges/differ.js +98 -0
  46. package/src/tools/tracking/trackChanges/index.js +432 -0
  47. package/src/tools/tracking/trackChanges/monitor.js +93 -0
  48. package/src/tools/tracking/trackChanges/notifier.js +105 -0
  49. package/src/tools/tracking/trackChanges/schema.js +127 -0
  50. package/src/tools/tracking/trackChanges.js +12 -1374
@@ -12,6 +12,7 @@ import { chromium } from 'playwright';
12
12
  import { z } from 'zod';
13
13
  import crypto from 'crypto';
14
14
  import HumanBehaviorSimulator from '../utils/HumanBehaviorSimulator.js';
15
+ import { BrowserContextPool } from './BrowserContextPool.js';
15
16
 
16
17
  const StealthConfigSchema = z.object({
17
18
  level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
@@ -59,7 +60,15 @@ const StealthConfigSchema = z.object({
59
60
  export class StealthBrowserManager {
60
61
  constructor(options = {}) {
61
62
  this.browser = null;
62
- this.contexts = new Map();
63
+ this.contexts = new BrowserContextPool({
64
+ maxContexts: parseInt(process.env.MAX_BROWSER_CONTEXTS || '10', 10),
65
+ periodicRefreshAfter: 200,
66
+ closeIdleAfterMs: 30 * 60 * 1000,
67
+ waitTimeoutMs: 10_000,
68
+ onContextExpired: (contextId) => {
69
+ this.fingerprints.delete(contextId);
70
+ }
71
+ });
63
72
  this.fingerprints = new Map();
64
73
 
65
74
  // Enhanced stealth components
@@ -367,7 +376,7 @@ export class StealthBrowserManager {
367
376
  // Apply stealth scripts and configurations
368
377
  await this.applyAdvancedStealthConfigurations(context, validatedConfig, fingerprint);
369
378
 
370
- this.contexts.set(contextId, { context, fingerprint, config: validatedConfig });
379
+ await this.contexts.set(contextId, { context, fingerprint, config: validatedConfig });
371
380
  this.fingerprints.set(contextId, fingerprint);
372
381
 
373
382
  return { context, contextId, fingerprint };
@@ -1493,11 +1502,20 @@ export class StealthBrowserManager {
1493
1502
  throw new Error('Context not found');
1494
1503
  }
1495
1504
 
1505
+ // Record use and check if context needs periodic refresh
1506
+ const needsRefresh = this.contexts.recordUse(contextId);
1507
+ if (needsRefresh) {
1508
+ // Dispose old context; caller should create a fresh one
1509
+ await this.contexts.dispose(contextId);
1510
+ this.fingerprints.delete(contextId);
1511
+ throw new Error(`StealthBrowserManager: context ${contextId} has reached its use limit and was recycled. Create a new context.`);
1512
+ }
1513
+
1496
1514
  const page = await contextData.context.newPage();
1497
-
1515
+
1498
1516
  // Apply additional page-level stealth measures
1499
1517
  await this.applyPageStealthMeasures(page, contextData.config, contextData.fingerprint);
1500
-
1518
+
1501
1519
  return page;
1502
1520
  }
1503
1521
 
@@ -1678,10 +1696,8 @@ export class StealthBrowserManager {
1678
1696
  * Close specific context
1679
1697
  */
1680
1698
  async closeContext(contextId) {
1681
- const contextData = this.contexts.get(contextId);
1682
- if (contextData) {
1683
- await contextData.context.close();
1684
- this.contexts.delete(contextId);
1699
+ if (this.contexts.has(contextId)) {
1700
+ await this.contexts.dispose(contextId);
1685
1701
  this.fingerprints.delete(contextId);
1686
1702
  }
1687
1703
  }
@@ -1690,16 +1706,8 @@ export class StealthBrowserManager {
1690
1706
  * Close all contexts and browser
1691
1707
  */
1692
1708
  async cleanup() {
1693
- // Close all contexts
1694
- for (const [contextId, contextData] of this.contexts.entries()) {
1695
- try {
1696
- await contextData.context.close();
1697
- } catch (error) {
1698
- console.warn(`Failed to close context ${contextId}:`, error.message);
1699
- }
1700
- }
1701
-
1702
- this.contexts.clear();
1709
+ // Close all contexts via pool (handles idle timer cleanup + wait queue drain)
1710
+ await this.contexts.destroy();
1703
1711
  this.fingerprints.clear();
1704
1712
 
1705
1713
  // Reset human behavior simulator
@@ -82,10 +82,12 @@ export class CacheManager extends EventEmitter {
82
82
  this.startMonitoring(monitoringInterval);
83
83
  }
84
84
 
85
- // Initialize auto cleanup
85
+ // Initialize auto cleanup. .unref() so the timer never blocks process exit
86
+ // — short-lived CLI invocations and tests don't need an explicit destroy().
86
87
  this.cleanupTimer = setInterval(() => {
87
88
  this.cleanupExpired();
88
89
  }, autoCleanupInterval);
90
+ if (typeof this.cleanupTimer.unref === 'function') this.cleanupTimer.unref();
89
91
 
90
92
  // Eviction tracking is handled in the LRU cache dispose callback above
91
93
  }
@@ -546,6 +548,7 @@ export class CacheManager extends EventEmitter {
546
548
  this.updateStats();
547
549
  this.emit('monitoring', this.getDetailedStats());
548
550
  }, interval);
551
+ if (typeof this.monitoringTimer.unref === 'function') this.monitoringTimer.unref();
549
552
  }
550
553
 
551
554
  /**
@@ -19,7 +19,8 @@ export class BFSCrawler {
19
19
  concurrency = 10,
20
20
  domainFilter = null,
21
21
  enableLinkAnalysis = true,
22
- linkAnalyzerOptions = {}
22
+ linkAnalyzerOptions = {},
23
+ sessionContext = null
23
24
  } = options;
24
25
 
25
26
  this.maxDepth = maxDepth;
@@ -28,6 +29,8 @@ export class BFSCrawler {
28
29
  this.respectRobots = respectRobots;
29
30
  this.userAgent = userAgent;
30
31
  this.timeout = timeout;
32
+ // Session context for cookie jar + persistent headers (null = stateless)
33
+ this.sessionContext = sessionContext;
31
34
 
32
35
  this.visited = new Set();
33
36
  this.results = [];
@@ -254,21 +257,32 @@ export class BFSCrawler {
254
257
  'Connection': 'keep-alive',
255
258
  'Upgrade-Insecure-Requests': '1'
256
259
  };
257
-
258
- const headers = { ...defaultHeaders, ...domainRules.customHeaders };
260
+
261
+ let headers = { ...defaultHeaders, ...domainRules.customHeaders };
262
+
263
+ // If a session is active, layer in session headers + cookie jar
264
+ if (this.sessionContext) {
265
+ headers = this.sessionContext.applyToHeaders(url, headers);
266
+ }
267
+
259
268
  const effectiveTimeout = domainRules.timeout || this.timeout;
260
-
269
+
261
270
  // Update timeout if different
262
271
  if (effectiveTimeout !== this.timeout) {
263
272
  clearTimeout(timeoutId);
264
273
  setTimeout(() => controller.abort(), effectiveTimeout);
265
274
  }
266
-
275
+
267
276
  const response = await fetch(url, {
268
277
  signal: controller.signal,
269
278
  headers
270
279
  });
271
280
 
281
+ // Capture any cookies the server sets during the crawl
282
+ if (this.sessionContext) {
283
+ this.sessionContext.recordCookies(response, url);
284
+ }
285
+
272
286
  clearTimeout(timeoutId);
273
287
 
274
288
  if (!response.ok) {
@@ -0,0 +1,137 @@
1
+ /**
2
+ * Prometheus metrics — dependency-free implementation.
3
+ *
4
+ * Why no prom-client? CrawlForge is shipped via npm and runs in stdio mode
5
+ * by default. Pulling in prom-client (and its dependency tree) just to
6
+ * expose a handful of counters, gauges and one histogram is overkill. This ~140 LOC implementation
7
+ * conforms to the Prometheus exposition format 0.0.4.
8
+ *
9
+ * Disabled by default. Enable via `CRAWLFORGE_METRICS=true` in HTTP mode.
10
+ *
11
+ * Counters/gauges exposed:
12
+ * - crawlforge_tool_requests_total{tool,outcome}
13
+ * - crawlforge_tool_errors_total{tool,error_class}
14
+ * - crawlforge_tool_duration_ms{tool} (histogram: cumulative _bucket series plus _sum and _count)
15
+ * - crawlforge_credits_consumed_total{tool}
16
+ * - crawlforge_browser_pool_in_use (gauge)
17
+ * - crawlforge_browser_pool_capacity (gauge)
18
+ */
19
+
const CONTENT_TYPE = 'text/plain; version=0.0.4; charset=utf-8';

/**
 * Create an in-memory metrics registry that renders the Prometheus text
 * exposition format.
 *
 * Series are stored per "name{sorted,labels}" key in three maps (counters,
 * gauges, histograms). Invalid samples are dropped silently rather than
 * thrown, so instrumentation can never break the code path being measured.
 *
 * @returns {{
 *   contentType: string,
 *   incCounter: (name: string, labels?: object, by?: number) => void,
 *   setGauge: (name: string, labels: object|undefined, value: number) => void,
 *   observeHistogram: (name: string, labels: object|undefined, valueMs: number) => void,
 *   render: () => Promise<string>,
 *   _snapshot: () => { counters: object, gauges: object, histograms: object }
 * }}
 */
export function createMetricsRegistry() {
  const counters = new Map();   // name|labels -> number
  const gauges = new Map();     // name|labels -> number
  const histograms = new Map(); // name|labels -> { count, sum, buckets:{le->count} }

  const HISTOGRAM_BUCKETS_MS = [10, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000];

  // Build the series key. Labels are sorted by name so the same label set
  // always maps to the same key regardless of property order.
  function key(name, labels) {
    const labelStr = Object.entries(labels ?? {})
      .sort(([a], [b]) => a.localeCompare(b))
      .map(([k, v]) => `${k}="${escapeLabel(String(v))}"`)
      .join(',');
    return labelStr ? `${name}{${labelStr}}` : name;
  }

  // Metric name = everything before the first '{' of a series key.
  function metricName(seriesKey) {
    const brace = seriesKey.indexOf('{');
    return brace === -1 ? seriesKey : seriesKey.slice(0, brace);
  }

  // Group series by metric name in a single pass. Map insertion order keeps
  // both the first-seen order of names and the order of series within a name.
  function groupByName(series) {
    const groups = new Map();
    for (const [seriesKey, value] of series) {
      const name = metricName(seriesKey);
      let members = groups.get(name);
      if (!members) {
        members = [];
        groups.set(name, members);
      }
      members.push([seriesKey, value]);
    }
    return groups;
  }

  return {
    contentType: CONTENT_TYPE,

    /**
     * Add `by` (default 1) to a counter series.
     * Negative or non-finite increments are ignored: a counter must never
     * decrease, and a NaN would stick to the series forever.
     */
    incCounter(name, labels, by = 1) {
      if (!Number.isFinite(by) || by < 0) return;
      const k = key(name, labels);
      counters.set(k, (counters.get(k) ?? 0) + by);
    },

    /** Set a gauge series to `value`, overwriting any previous value. */
    setGauge(name, labels, value) {
      gauges.set(key(name, labels), value);
    },

    /**
     * Record one observation (in milliseconds) into a histogram series.
     * Non-finite observations are ignored so that a single bad sample
     * cannot turn the running sum into NaN.
     */
    observeHistogram(name, labels, valueMs) {
      if (!Number.isFinite(valueMs)) return;
      const k = key(name, labels);
      let h = histograms.get(k);
      if (!h) {
        h = { count: 0, sum: 0, buckets: Object.fromEntries(HISTOGRAM_BUCKETS_MS.map(b => [b, 0])) };
        histograms.set(k, h);
      }
      h.count += 1;
      h.sum += valueMs;
      // Buckets are cumulative: every bucket whose bound is >= the value counts it.
      for (const b of HISTOGRAM_BUCKETS_MS) {
        if (valueMs <= b) h.buckets[b] += 1;
      }
    },

    /**
     * Render every series as exposition text: counters, then gauges, then
     * histograms, each metric name preceded by its HELP and TYPE lines.
     *
     * @returns {Promise<string>} newline-terminated exposition text
     */
    async render() {
      const lines = [];

      // Counters
      for (const [name, members] of groupByName(counters)) {
        lines.push(`# HELP ${name} ${describe(name)}`);
        lines.push(`# TYPE ${name} counter`);
        for (const [k, v] of members) lines.push(`${k} ${v}`);
      }

      // Gauges
      for (const [name, members] of groupByName(gauges)) {
        lines.push(`# HELP ${name} ${describe(name)}`);
        lines.push(`# TYPE ${name} gauge`);
        for (const [k, v] of members) lines.push(`${k} ${v}`);
      }

      // Histograms
      for (const [name, members] of groupByName(histograms)) {
        lines.push(`# HELP ${name} ${describe(name)}`);
        lines.push(`# TYPE ${name} histogram`);
        for (const [k, h] of members) {
          // Reconstruct base labels (everything inside { })
          const baseLabels = k.includes('{') ? k.slice(k.indexOf('{') + 1, -1) : '';
          const sep = baseLabels ? ',' : '';
          const suffix = baseLabels ? `{${baseLabels}}` : '';
          for (const b of HISTOGRAM_BUCKETS_MS) {
            lines.push(`${name}_bucket{${baseLabels}${sep}le="${b}"} ${h.buckets[b]}`);
          }
          lines.push(`${name}_bucket{${baseLabels}${sep}le="+Inf"} ${h.count}`);
          lines.push(`${name}_sum${suffix} ${h.sum}`);
          lines.push(`${name}_count${suffix} ${h.count}`);
        }
      }

      return lines.join('\n') + '\n';
    },

    // Snapshot for tests
    _snapshot() {
      return {
        counters: Object.fromEntries(counters.entries()),
        gauges: Object.fromEntries(gauges.entries()),
        histograms: Object.fromEntries(histograms.entries())
      };
    }
  };
}
121
+
// HELP text emitted on the "# HELP" line for each known metric name.
const HELP = {
  crawlforge_tool_requests_total: 'Total number of MCP tool invocations',
  crawlforge_tool_errors_total: 'Total number of MCP tool errors',
  crawlforge_tool_duration_ms: 'MCP tool invocation duration in milliseconds',
  crawlforge_credits_consumed_total: 'Total CrawlForge credits consumed',
  crawlforge_browser_pool_in_use: 'Number of browser contexts currently leased from the pool',
  crawlforge_browser_pool_capacity: 'Maximum browser context pool capacity'
};

/**
 * Return the HELP text for a metric, falling back to the metric name itself
 * when no description is registered.
 *
 * Only own properties of HELP are consulted, so names that collide with
 * Object.prototype members (e.g. "constructor", "toString") still fall back
 * to the name instead of returning an inherited function.
 *
 * @param {string} name metric name
 * @returns {string} description text
 */
function describe(name) {
  return Object.prototype.hasOwnProperty.call(HELP, name) ? HELP[name] : name;
}
134
+
/**
 * Escape a label value for the exposition format: backslash, line feed and
 * double quote are each replaced by their backslash-escaped form.
 *
 * @param {string} v raw label value
 * @returns {string} escaped value, safe to place between double quotes
 */
function escapeLabel(v) {
  const replacements = { '\\': '\\\\', '\n': '\\n', '"': '\\"' };
  return v.replace(/[\\\n"]/g, (ch) => replacements[ch]);
}
@@ -0,0 +1,74 @@
1
+ /**
2
+ * OpenTelemetry-style tracing facade.
3
+ *
4
+ * Disabled by default. When `OTEL_SDK_DISABLED !== 'false'`, all calls are
5
+ * no-ops with zero overhead (no SDK loaded). To enable, install
6
+ * `@opentelemetry/api` + `@opentelemetry/sdk-node` in the host application
7
+ * and set `OTEL_SDK_DISABLED=false`.
8
+ *
9
+ * Why a facade instead of importing `@opentelemetry/api` directly?
10
+ * - CrawlForge ships via npm; we don't want to force the OTel runtime
11
+ * on every user. The facade pattern matches `@opentelemetry/api`'s
12
+ * no-op-by-default design but doesn't add the dependency to package.json.
13
+ * - Operators who want tracing install the SDK themselves and configure
14
+ * OTEL_* env vars. We call into `globalThis.__otelTracer` if present.
15
+ *
16
+ * Span attributes set on every tool invocation:
17
+ * - mcp.tool.name
18
+ * - mcp.tool.duration_ms
19
+ * - mcp.tool.outcome
20
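+ * - mcp.credit.cost
+ * - mcp.credit.outcome (currently set to the same value as mcp.tool.outcome)
+ * - mcp.creator_mode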
21
+ */
22
+
/**
 * Shared do-nothing span handed out whenever tracing is disabled or span
 * creation fails.
 *
 * It covers the Span methods of `@opentelemetry/api` (not only the setters),
 * so code written against a real span keeps working unchanged when tracing
 * is off. Chainable methods return the span itself.
 */
const NOOP_SPAN = {
  setAttribute() { return this; },
  setAttributes() { return this; },
  setStatus() { return this; },
  recordException() { return this; },
  addEvent() { return this; },
  addLink() { return this; },
  addLinks() { return this; },
  updateName() { return this; },
  // A no-op span never records anything.
  isRecording() { return false; },
  // All-zero ids are the "invalid" span context.
  spanContext() {
    return {
      traceId: '00000000000000000000000000000000',
      spanId: '0000000000000000',
      traceFlags: 0
    };
  },
  end() {}
};
30
+
/**
 * Report whether tracing is active.
 *
 * Tracing is on only when `OTEL_SDK_DISABLED` is exactly the string 'false'
 * AND a tracer has been installed on `globalThis.__otelTracer`.
 *
 * @returns {boolean}
 */
export function isTracingEnabled() {
  if (process.env.OTEL_SDK_DISABLED !== 'false') {
    return false;
  }
  return Boolean(globalThis.__otelTracer);
}
34
+
/**
 * Start a span named `mcp.tool.<toolName>` on the host-provided tracer.
 *
 * Always returns an object that is safe to call span methods on: the no-op
 * span is returned when tracing is disabled, when the tracer throws, or when
 * the tracer returns null/undefined instead of a span.
 *
 * @param {string} toolName
 * @returns {object} the tracer's span, or NOOP_SPAN
 */
export function startToolSpan(toolName) {
  if (!isTracingEnabled()) return NOOP_SPAN;
  try {
    const tracer = globalThis.__otelTracer;
    const span = tracer.startSpan(`mcp.tool.${toolName}`, {
      attributes: { 'mcp.tool.name': toolName }
    });
    // A misbehaving tracer may hand back nothing; never leak that to callers.
    return span ?? NOOP_SPAN;
  } catch {
    return NOOP_SPAN;
  }
}
47
+
/**
 * Record a complete tool invocation. Convenience wrapper used by withAuth.
 *
 * Starts a span for the tool, attaches the invocation attributes, marks the
 * span as failed when `error` is given, and ends it. The span is always
 * ended, even if attaching attributes or the exception throws, and no
 * tracing failure is ever propagated to the caller.
 *
 * NOTE(review): the span is started and ended inside this call, i.e. after
 * the tool has finished, so the real duration is carried only by the
 * `mcp.tool.duration_ms` attribute.
 * NOTE(review): `mcp.credit.outcome` is filled from `attrs.outcome`, the same
 * value as `mcp.tool.outcome` — confirm whether a separate credit outcome
 * was intended.
 *
 * @param {string} toolName
 * @param {object} attrs — { duration_ms, outcome, credit_cost, creator_mode }
 * @param {Error} [error]
 */
export function recordToolInvocation(toolName, attrs, error) {
  if (!isTracingEnabled()) return;

  // Numeric value of SpanStatusCode.ERROR in @opentelemetry/api.
  const SPAN_STATUS_ERROR = 2;

  let span;
  try {
    // Missing attrs must not abort the recording after the span was started.
    const { duration_ms, outcome, credit_cost, creator_mode } = attrs ?? {};
    span = startToolSpan(toolName);
    span.setAttributes({
      'mcp.tool.duration_ms': duration_ms,
      'mcp.tool.outcome': outcome,
      'mcp.credit.cost': credit_cost,
      'mcp.credit.outcome': outcome,
      'mcp.creator_mode': Boolean(creator_mode)
    });
    if (error) {
      span.recordException(error);
      span.setStatus({ code: SPAN_STATUS_ERROR, message: error.message });
    }
  } catch {
    // tracing must never break the request path
  } finally {
    // End the span on every path so a failing setter cannot leave it open.
    try {
      span?.end();
    } catch {
      // tracing must never break the request path
    }
  }
}