crawlforge-mcp-server 4.2.7 → 4.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "crawlforge-mcp-server",
3
- "version": "4.2.7",
3
+ "version": "4.2.9",
4
4
  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
5
5
  "main": "server.js",
6
6
  "bin": {
@@ -113,6 +113,7 @@
113
113
  "playwright": "^1.54.2",
114
114
  "robots-parser": "^3.0.1",
115
115
  "turndown": "^7.2.4",
116
+ "undici": "^7.24.0",
116
117
  "winston": "^3.11.0",
117
118
  "zod": "^3.23.8"
118
119
  },
@@ -11,8 +11,7 @@ export function register(program) {
11
11
  .command('actions <url>')
12
12
  .description('Run browser automation actions against a URL')
13
13
  .requiredOption('--script <file>', 'JSON file containing action script')
14
- .option('--screenshot', 'Capture screenshot after actions')
15
- .option('--wait <ms>', 'Wait time between actions in milliseconds', '500')
14
+ .option('--screenshot', 'Capture screenshots during action execution')
16
15
  .action(async (url, opts, cmd) => {
17
16
  const globals = cmd.parent.opts();
18
17
  const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
@@ -26,11 +25,12 @@ export function register(program) {
26
25
  }
27
26
 
28
27
  const tool = new ScrapeWithActionsTool(getToolConfig('scrape_with_actions'));
28
+ // ScrapeWithActionsSchema uses captureScreenshots (no between-action wait
29
+ // field — insert {type:'wait'} actions in the script for that).
29
30
  await runTool(tool, {
30
31
  url,
31
32
  actions,
32
- screenshot: !!opts.screenshot,
33
- wait_between_actions: parseInt(opts.wait, 10)
33
+ captureScreenshots: !!opts.screenshot
34
34
  }, cliFlags);
35
35
  });
36
36
  }
@@ -15,10 +15,12 @@ export function register(program) {
15
15
  const globals = cmd.parent.opts();
16
16
  const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
17
17
  const tool = new GenerateLLMsTxtTool(getToolConfig('generate_llms_txt'));
18
+ // GenerateLLMsTxtSchema expects: url, format ('both'|'llms-txt'|'llms-full-txt'),
19
+ // analysisOptions.maxPages.
18
20
  await runTool(tool, {
19
21
  url,
20
- include_full_txt: !!opts.includeFull,
21
- max_pages: parseInt(opts.maxPages, 10)
22
+ format: opts.includeFull ? 'both' : 'llms-txt',
23
+ analysisOptions: { maxPages: parseInt(opts.maxPages, 10) }
22
24
  }, cliFlags);
23
25
  });
24
26
  }
@@ -9,18 +9,18 @@ export function register(program) {
9
9
  program
10
10
  .command('map <url>')
11
11
  .description('Generate a sitemap for a website')
12
- .option('--depth <n>', 'Maximum crawl depth', '3')
13
- .option('--max-pages <n>', 'Maximum pages to include', '500')
14
- .option('--format <fmt>', 'Output format: json or xml', 'json')
12
+ .option('--max-pages <n>', 'Maximum URLs to discover', '500')
13
+ .option('--no-sitemap', 'Skip parsing sitemap.xml')
15
14
  .action(async (url, opts, cmd) => {
16
15
  const globals = cmd.parent.opts();
17
16
  const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
18
17
  const tool = new MapSiteTool(getToolConfig('map_site'));
18
+ // MapSiteSchema expects: url, max_urls, include_sitemap.
19
+ // (map_site has no crawl-depth or xml/json output toggle.)
19
20
  await runTool(tool, {
20
21
  url,
21
- max_depth: parseInt(opts.depth, 10),
22
- max_pages: parseInt(opts.maxPages, 10),
23
- output_format: opts.format
22
+ max_urls: parseInt(opts.maxPages, 10),
23
+ include_sitemap: opts.sitemap
24
24
  }, cliFlags);
25
25
  });
26
26
  }
@@ -17,14 +17,33 @@ export function register(program) {
17
17
  const globals = cmd.parent.opts();
18
18
  const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
19
19
  const tool = new TrackChangesTool(getToolConfig('track_changes'));
20
- // monitor runs continuously — do not auto-exit after the first result.
21
- await runTool(tool, {
20
+
21
+ // TrackChangesSchema shape: operation 'monitor' (setInterval poller);
22
+ // interval is ms (min 60s); selector → trackingOptions.customSelectors;
23
+ // threshold (%) → significanceThresholds; webhook → notificationOptions.webhook.
24
+ const t = Math.min(Math.max(parseFloat(opts.threshold) / 100, 0), 1);
25
+ const params = {
22
26
  url,
23
- scheduled: true,
24
- interval_seconds: parseInt(opts.interval, 10),
25
- selector: opts.selector,
26
- webhook_url: opts.webhook,
27
- change_threshold: parseFloat(opts.threshold)
28
- }, cliFlags, { exitOnSuccess: false });
27
+ trackingOptions: {
28
+ ...(opts.selector ? { customSelectors: [opts.selector] } : {}),
29
+ significanceThresholds: { minor: t, moderate: Math.max(0.3, t), major: Math.max(0.7, t) }
30
+ },
31
+ monitoringOptions: {
32
+ enabled: true,
33
+ interval: Math.max(parseInt(opts.interval, 10), 60) * 1000
34
+ },
35
+ ...(opts.webhook ? { notificationOptions: { webhook: { enabled: true, url: opts.webhook } } } : {})
36
+ };
37
+
38
+ // setupMonitoring polls compareWithBaseline, which needs a baseline; create
39
+ // one from the current page first so the monitor watches for changes from now.
40
+ const wrapperTool = {
41
+ execute: async (p) => {
42
+ await tool.execute({ ...p, operation: 'create_baseline' });
43
+ return await tool.execute({ ...p, operation: 'monitor' });
44
+ }
45
+ };
46
+ // monitor runs continuously — do not auto-exit after the first result.
47
+ await runTool(wrapperTool, params, cliFlags, { exitOnSuccess: false });
29
48
  });
30
49
  }
@@ -16,11 +16,14 @@ export function register(program) {
16
16
  const globals = cmd.parent.opts();
17
17
  const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
18
18
  const tool = new DeepResearchTool(getToolConfig('deep_research'));
19
+ // DeepResearchSchema expects: topic, maxDepth (1-10), maxUrls, outputFormat.
20
+ const depthMap = { basic: 2, standard: 5, deep: 8 };
21
+ const formatMap = { summary: 'summary', detailed: 'comprehensive' };
19
22
  await runTool(tool, {
20
- query: topic,
21
- depth: opts.depth,
22
- max_urls: parseInt(opts.maxUrls, 10),
23
- output_format: opts.outputFormat
23
+ topic,
24
+ maxDepth: depthMap[opts.depth] ?? 5,
25
+ maxUrls: parseInt(opts.maxUrls, 10),
26
+ outputFormat: formatMap[opts.outputFormat] ?? 'summary'
24
27
  }, cliFlags);
25
28
  });
26
29
  }
@@ -15,10 +15,27 @@ export function register(program) {
15
15
  const globals = cmd.parent.opts();
16
16
  const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
17
17
  const tool = new TrackChangesTool(getToolConfig('track_changes'));
18
- await runTool(tool, {
19
- url,
20
- selector: opts.selector,
21
- change_threshold: parseFloat(opts.threshold)
22
- }, cliFlags);
18
+
19
+ // TrackChangesSchema shape: selector → trackingOptions.customSelectors,
20
+ // threshold (%) → trackingOptions.significanceThresholds (0-1, ordered).
21
+ const t = Math.min(Math.max(parseFloat(opts.threshold) / 100, 0), 1);
22
+ const trackingOptions = {
23
+ ...(opts.selector ? { customSelectors: [opts.selector] } : {}),
24
+ significanceThresholds: { minor: t, moderate: Math.max(0.3, t), major: Math.max(0.7, t) }
25
+ };
26
+
27
+ // `compare` throws "No baseline found" on first run — bootstrap one, then
28
+ // the next invocation reports actual changes against it.
29
+ const params = { url, trackingOptions };
30
+ const wrapperTool = {
31
+ execute: async (p) => {
32
+ const res = await tool.execute({ ...p, operation: 'compare' });
33
+ if (res && res.success === false && /No baseline/i.test(res.error || '')) {
34
+ return await tool.execute({ ...p, operation: 'create_baseline' });
35
+ }
36
+ return res;
37
+ }
38
+ };
39
+ await runTool(wrapperTool, params, cliFlags);
23
40
  });
24
41
  }
package/src/cli/index.js CHANGED
@@ -16,6 +16,19 @@ import { createRequire } from 'node:module';
16
16
  import { fileURLToPath } from 'node:url';
17
17
  import { dirname, join } from 'node:path';
18
18
  import { readFileSync } from 'node:fs';
19
+ import { setGlobalDispatcher, EnvHttpProxyAgent } from 'undici';
20
+
21
+ // Node's global fetch() (undici) ignores HTTP(S)_PROXY by default. When a proxy
22
+ // is configured — e.g. inside a sandbox that only permits egress through it —
23
+ // route every fetch() through it so the CLI's API/scrape calls succeed without
24
+ // excluding the command from the sandbox. EnvHttpProxyAgent honors HTTPS_PROXY,
25
+ // HTTP_PROXY and NO_PROXY itself; this is a no-op when none are set.
26
+ if (process.env.HTTPS_PROXY || process.env.HTTP_PROXY || process.env.ALL_PROXY ||
27
+ process.env.https_proxy || process.env.http_proxy || process.env.all_proxy) {
28
+ try {
29
+ setGlobalDispatcher(new EnvHttpProxyAgent());
30
+ } catch { /* proxy agent unavailable — fall back to direct connections */ }
31
+ }
19
32
 
20
33
  // Load package.json for version
21
34
  const __filename = fileURLToPath(import.meta.url);
@@ -1523,6 +1523,43 @@ export class StealthBrowserManager {
1523
1523
  return page;
1524
1524
  }
1525
1525
 
1526
+ /**
1527
+ * One-shot stealth scrape: create a context + page, navigate to the URL,
1528
+ * extract content, and tear the context down. Convenience wrapper over the
1529
+ * operation-based API (createStealthContext → createStealthPage → goto).
1530
+ *
1531
+ * @param {Object} params
1532
+ * @param {string} params.url — URL to scrape
1533
+ * @param {string} [params.engine] — browser engine (forwarded to config; playwright by default)
1534
+ * @param {number} [params.wait_for] — extra wait after load, in ms
1535
+ * @param {boolean} [params.screenshot] — capture a base64 PNG screenshot
1536
+ * @param {Object} [params.stealthConfig] — stealth configuration overrides
1537
+ * @returns {Promise<{success:boolean, url:string, title:string, text:string, html:string, screenshot:?string}>}
1538
+ */
1539
+ async scrapeWithStealth({ url, engine, wait_for = 0, screenshot = false, stealthConfig = {} } = {}) {
1540
+ if (!url) throw new Error('scrapeWithStealth requires a url');
1541
+
1542
+ const { contextId } = await this.createStealthContext({ ...stealthConfig, engine });
1543
+ try {
1544
+ const page = await this.createStealthPage(contextId);
1545
+ await page.goto(url, { waitUntil: 'domcontentloaded' });
1546
+ if (wait_for > 0) await page.waitForTimeout(wait_for);
1547
+
1548
+ const [title, html, text] = await Promise.all([
1549
+ page.title().catch(() => ''),
1550
+ page.content().catch(() => ''),
1551
+ page.innerText('body').catch(() => '')
1552
+ ]);
1553
+ const shot = screenshot
1554
+ ? await page.screenshot({ encoding: 'base64', fullPage: false }).catch(() => null)
1555
+ : null;
1556
+
1557
+ return { success: true, url, title, text, html, screenshot: shot };
1558
+ } finally {
1559
+ await this.closeContext(contextId).catch(() => {});
1560
+ }
1561
+ }
1562
+
1526
1563
  /**
1527
1564
  * Apply page-level stealth measures
1528
1565
  */
@@ -391,8 +391,9 @@ export class Logger {
391
391
 
392
392
  this.winston.error(message, errorContext);
393
393
 
394
- // Track error for analysis
395
- if (this.enableErrorTracking) {
394
+ // Track error for analysis (only when an Error object was actually passed —
395
+ // logger.error(message) with no error must not reach trackError's error.name).
396
+ if (this.enableErrorTracking && error) {
396
397
  this.trackError(error, context, requestId);
397
398
  }
398
399
  }
@@ -425,11 +426,13 @@ export class Logger {
425
426
  trackError(error, context, requestId) {
426
427
  // Could be extended to send to error tracking service
427
428
  // For now, just log structured error data
429
+ // Null-safe: a shared logger must never throw, even if called with a
430
+ // non-Error (or null) value.
428
431
  this.winston.error('Error tracking', {
429
432
  errorTracking: {
430
- type: error.name,
431
- message: error.message,
432
- stack: error.stack,
433
+ type: error?.name ?? 'UnknownError',
434
+ message: error?.message ?? String(error ?? ''),
435
+ stack: error?.stack,
433
436
  context,
434
437
  requestId,
435
438
  timestamp: new Date().toISOString()