npm - crawlforge-mcp-server - Versions diffs - 4.2.8 → 4.2.10 - Mend

crawlforge-mcp-server 4.2.8 → 4.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/package.json +2 -1
package/src/cli/commands/actions.js +4 -4
package/src/cli/commands/llmstxt.js +4 -2
package/src/cli/commands/map.js +6 -6
package/src/cli/commands/monitor.js +27 -8
package/src/cli/commands/research.js +7 -4
package/src/cli/commands/track.js +22 -5
package/src/cli/index.js +13 -0
package/src/core/StealthBrowserManager.js +40 -3
package/src/core/WebhookDispatcher.js +1 -1
package/src/core/crawlers/BFSCrawler.js +3 -3
package/src/tools/advanced/ScrapeWithActionsTool.js +2 -2
package/src/tools/extract/extractContent.js +1 -1
package/src/tools/extract/processDocument.js +1 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "crawlforge-mcp-server",
-  "version": "4.2.8",
+  "version": "4.2.10",
   "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
   "main": "server.js",
   "bin": {
@@ -113,6 +113,7 @@
     "playwright": "^1.54.2",
     "robots-parser": "^3.0.1",
     "turndown": "^7.2.4",
+    "undici": "^7.24.0",
     "winston": "^3.11.0",
     "zod": "^3.23.8"
   },

package/src/cli/commands/actions.js CHANGED Viewed

@@ -11,8 +11,7 @@ export function register(program) {
     .command('actions <url>')
     .description('Run browser automation actions against a URL')
     .requiredOption('--script <file>', 'JSON file containing action script')
-    .option('--screenshot', 'Capture screenshot after actions')
-    .option('--wait <ms>', 'Wait time between actions in milliseconds', '500')
+    .option('--screenshot', 'Capture screenshots during action execution')
     .action(async (url, opts, cmd) => {
       const globals = cmd.parent.opts();
       const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
@@ -26,11 +25,12 @@ export function register(program) {
       }
       const tool = new ScrapeWithActionsTool(getToolConfig('scrape_with_actions'));
+      // ScrapeWithActionsSchema uses captureScreenshots (no between-action wait
+      // field — insert {type:'wait'} actions in the script for that).
       await runTool(tool, {
         url,
         actions,
-        screenshot: !!opts.screenshot,
-        wait_between_actions: parseInt(opts.wait, 10)
+        captureScreenshots: !!opts.screenshot
       }, cliFlags);
     });
 }

package/src/cli/commands/llmstxt.js CHANGED Viewed

@@ -15,10 +15,12 @@ export function register(program) {
       const globals = cmd.parent.opts();
       const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
       const tool = new GenerateLLMsTxtTool(getToolConfig('generate_llms_txt'));
+      // GenerateLLMsTxtSchema expects: url, format ('both'|'llms-txt'|'llms-full-txt'),
+      // analysisOptions.maxPages.
       await runTool(tool, {
         url,
-        include_full_txt: !!opts.includeFull,
-        max_pages: parseInt(opts.maxPages, 10)
+        format: opts.includeFull ? 'both' : 'llms-txt',
+        analysisOptions: { maxPages: parseInt(opts.maxPages, 10) }
       }, cliFlags);
     });
 }

package/src/cli/commands/map.js CHANGED Viewed

@@ -9,18 +9,18 @@ export function register(program) {
   program
     .command('map <url>')
     .description('Generate a sitemap for a website')
-    .option('--depth <n>', 'Maximum crawl depth', '3')
-    .option('--max-pages <n>', 'Maximum pages to include', '500')
-    .option('--format <fmt>', 'Output format: json or xml', 'json')
+    .option('--max-pages <n>', 'Maximum URLs to discover', '500')
+    .option('--no-sitemap', 'Skip parsing sitemap.xml')
     .action(async (url, opts, cmd) => {
       const globals = cmd.parent.opts();
       const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
       const tool = new MapSiteTool(getToolConfig('map_site'));
+      // MapSiteSchema expects: url, max_urls, include_sitemap.
+      // (map_site has no crawl-depth or xml/json output toggle.)
       await runTool(tool, {
         url,
-        max_depth: parseInt(opts.depth, 10),
-        max_pages: parseInt(opts.maxPages, 10),
-        output_format: opts.format
+        max_urls: parseInt(opts.maxPages, 10),
+        include_sitemap: opts.sitemap
       }, cliFlags);
     });
 }

package/src/cli/commands/monitor.js CHANGED Viewed

@@ -17,14 +17,33 @@ export function register(program) {
       const globals = cmd.parent.opts();
       const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
       const tool = new TrackChangesTool(getToolConfig('track_changes'));
-      // monitor runs continuously — do not auto-exit after the first result.
-      await runTool(tool, {
+      // TrackChangesSchema shape: operation 'monitor' (setInterval poller);
+      // interval is ms (min 60s); selector → trackingOptions.customSelectors;
+      // threshold (%) → significanceThresholds; webhook → notificationOptions.webhook.
+      const t = Math.min(Math.max(parseFloat(opts.threshold) / 100, 0), 1);
+      const params = {
         url,
-        scheduled: true,
-        interval_seconds: parseInt(opts.interval, 10),
-        selector: opts.selector,
-        webhook_url: opts.webhook,
-        change_threshold: parseFloat(opts.threshold)
-      }, cliFlags, { exitOnSuccess: false });
+        trackingOptions: {
+          ...(opts.selector ? { customSelectors: [opts.selector] } : {}),
+          significanceThresholds: { minor: t, moderate: Math.max(0.3, t), major: Math.max(0.7, t) }
+        },
+        monitoringOptions: {
+          enabled: true,
+          interval: Math.max(parseInt(opts.interval, 10), 60) * 1000
+        },
+        ...(opts.webhook ? { notificationOptions: { webhook: { enabled: true, url: opts.webhook } } } : {})
+      };
+      // setupMonitoring polls compareWithBaseline, which needs a baseline; create
+      // one from the current page first so the monitor watches for changes from now.
+      const wrapperTool = {
+        execute: async (p) => {
+          await tool.execute({ ...p, operation: 'create_baseline' });
+          return await tool.execute({ ...p, operation: 'monitor' });
+        }
+      };
+      // monitor runs continuously — do not auto-exit after the first result.
+      await runTool(wrapperTool, params, cliFlags, { exitOnSuccess: false });
     });
 }

package/src/cli/commands/research.js CHANGED Viewed

@@ -16,11 +16,14 @@ export function register(program) {
       const globals = cmd.parent.opts();
       const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
       const tool = new DeepResearchTool(getToolConfig('deep_research'));
+      // DeepResearchSchema expects: topic, maxDepth (1-10), maxUrls, outputFormat.
+      const depthMap = { basic: 2, standard: 5, deep: 8 };
+      const formatMap = { summary: 'summary', detailed: 'comprehensive' };
       await runTool(tool, {
-        query: topic,
-        depth: opts.depth,
-        max_urls: parseInt(opts.maxUrls, 10),
-        output_format: opts.outputFormat
+        topic,
+        maxDepth: depthMap[opts.depth] ?? 5,
+        maxUrls: parseInt(opts.maxUrls, 10),
+        outputFormat: formatMap[opts.outputFormat] ?? 'summary'
       }, cliFlags);
     });
 }

package/src/cli/commands/track.js CHANGED Viewed

@@ -15,10 +15,27 @@ export function register(program) {
       const globals = cmd.parent.opts();
       const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
       const tool = new TrackChangesTool(getToolConfig('track_changes'));
-      await runTool(tool, {
-        url,
-        selector: opts.selector,
-        change_threshold: parseFloat(opts.threshold)
-      }, cliFlags);
+      // TrackChangesSchema shape: selector → trackingOptions.customSelectors,
+      // threshold (%) → trackingOptions.significanceThresholds (0-1, ordered).
+      const t = Math.min(Math.max(parseFloat(opts.threshold) / 100, 0), 1);
+      const trackingOptions = {
+        ...(opts.selector ? { customSelectors: [opts.selector] } : {}),
+        significanceThresholds: { minor: t, moderate: Math.max(0.3, t), major: Math.max(0.7, t) }
+      };
+      // `compare` throws "No baseline found" on first run — bootstrap one, then
+      // the next invocation reports actual changes against it.
+      const params = { url, trackingOptions };
+      const wrapperTool = {
+        execute: async (p) => {
+          const res = await tool.execute({ ...p, operation: 'compare' });
+          if (res && res.success === false && /No baseline/i.test(res.error || '')) {
+            return await tool.execute({ ...p, operation: 'create_baseline' });
+          }
+          return res;
+        }
+      };
+      await runTool(wrapperTool, params, cliFlags);
     });
 }

package/src/cli/index.js CHANGED Viewed

@@ -16,6 +16,19 @@ import { createRequire } from 'node:module';
 import { fileURLToPath } from 'node:url';
 import { dirname, join } from 'node:path';
 import { readFileSync } from 'node:fs';
+import { setGlobalDispatcher, EnvHttpProxyAgent } from 'undici';
+// Node's global fetch() (undici) ignores HTTP(S)_PROXY by default. When a proxy
+// is configured — e.g. inside a sandbox that only permits egress through it —
+// route every fetch() through it so the CLI's API/scrape calls succeed without
+// excluding the command from the sandbox. EnvHttpProxyAgent honors HTTPS_PROXY,
+// HTTP_PROXY and NO_PROXY itself; this is a no-op when none are set.
+if (process.env.HTTPS_PROXY || process.env.HTTP_PROXY || process.env.ALL_PROXY ||
+    process.env.https_proxy || process.env.http_proxy || process.env.all_proxy) {
+  try {
+    setGlobalDispatcher(new EnvHttpProxyAgent());
+  } catch { /* proxy agent unavailable — fall back to direct connections */ }
+}
 // Load package.json for version
 const __filename = fileURLToPath(import.meta.url);

package/src/core/StealthBrowserManager.js CHANGED Viewed

@@ -1376,7 +1376,7 @@ export class StealthBrowserManager {
       });
       if (challengeDetected) {
-        console.log('CloudFlare challenge detected, attempting bypass...');
+        console.error('CloudFlare challenge detected, attempting bypass...');
         // Simulate human behavior during challenge
         if (this.humanBehaviorSimulator) {
@@ -1437,7 +1437,7 @@ export class StealthBrowserManager {
       });
       if (recaptchaDetected) {
-        console.log('reCAPTCHA detected, implementing human behavior...');
+        console.error('reCAPTCHA detected, implementing human behavior...');
         // Simulate human inspection of the reCAPTCHA
         if (this.humanBehaviorSimulator) {
@@ -1491,7 +1491,7 @@ export class StealthBrowserManager {
       this.proxyManager.currentProxy = proxies[this.proxyManager.proxyIndex];
       this.proxyManager.lastRotation = now;
-      console.log('Rotated to proxy:', this.proxyManager.currentProxy);
+      console.error('Rotated to proxy:', this.proxyManager.currentProxy);
     }
     return this.proxyManager.currentProxy;
@@ -1523,6 +1523,43 @@ export class StealthBrowserManager {
     return page;
   }
+  /**
+   * One-shot stealth scrape: create a context + page, navigate to the URL,
+   * extract content, and tear the context down. Convenience wrapper over the
+   * operation-based API (createStealthContext → createStealthPage → goto).
+   *
+   * @param {Object} params
+   * @param {string} params.url                 — URL to scrape
+   * @param {string} [params.engine]            — browser engine (forwarded to config; playwright by default)
+   * @param {number} [params.wait_for]          — extra wait after load, in ms
+   * @param {boolean} [params.screenshot]       — capture a base64 PNG screenshot
+   * @param {Object} [params.stealthConfig]     — stealth configuration overrides
+   * @returns {Promise<{success:boolean, url:string, title:string, text:string, html:string, screenshot:?string}>}
+   */
+  async scrapeWithStealth({ url, engine, wait_for = 0, screenshot = false, stealthConfig = {} } = {}) {
+    if (!url) throw new Error('scrapeWithStealth requires a url');
+    const { contextId } = await this.createStealthContext({ ...stealthConfig, engine });
+    try {
+      const page = await this.createStealthPage(contextId);
+      await page.goto(url, { waitUntil: 'domcontentloaded' });
+      if (wait_for > 0) await page.waitForTimeout(wait_for);
+      const [title, html, text] = await Promise.all([
+        page.title().catch(() => ''),
+        page.content().catch(() => ''),
+        page.innerText('body').catch(() => '')
+      ]);
+      const shot = screenshot
+        ? await page.screenshot({ encoding: 'base64', fullPage: false }).catch(() => null)
+        : null;
+      return { success: true, url, title, text, html, screenshot: shot };
+    } finally {
+      await this.closeContext(contextId).catch(() => {});
+    }
+  }
   /**
    * Apply page-level stealth measures
    */

package/src/core/WebhookDispatcher.js CHANGED Viewed

@@ -74,7 +74,7 @@ export class WebhookDispatcher extends EventEmitter {
       onRetry: (error, attempt, delay, context) => {
         this.stats.retriedDeliveries++;
         if (this.enableLogging) {
-          console.log('Webhook retry ' + attempt + ' for ' + context.url + ' after ' + delay + 'ms: ' + error.message);
+          console.error('Webhook retry ' + attempt + ' for ' + context.url + ' after ' + delay + 'ms: ' + error.message);
         }
       }
     });

package/src/core/crawlers/BFSCrawler.js CHANGED Viewed

@@ -142,13 +142,13 @@ export class BFSCrawler {
     });
     if (!filterDecision.allowed) {
-      console.log(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
+      console.error(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
       return;
     }
     // Backward compatibility: also check legacy patterns
     if (!this.shouldCrawlUrl(normalizedUrl)) {
-      console.log(`Legacy pattern blocks: ${normalizedUrl}`);
+      console.error(`Legacy pattern blocks: ${normalizedUrl}`);
       return;
     }
@@ -156,7 +156,7 @@ export class BFSCrawler {
     if (this.respectRobots && this.robotsChecker) {
       const canFetch = await this.robotsChecker.canFetch(normalizedUrl);
       if (!canFetch) {
-        console.log(`Robots.txt blocks: ${normalizedUrl}`);
+        console.error(`Robots.txt blocks: ${normalizedUrl}`);
         return;
       }
     }

package/src/tools/advanced/ScrapeWithActionsTool.js CHANGED Viewed

@@ -253,7 +253,7 @@ export class ScrapeWithActionsTool extends EventEmitter {
       const startTime = Date.now();
       if (this.enableLogging) {
-        console.log(`Starting scrape session ${sessionId} with ${validated.actions.length} actions on ${validated.url}`);
+        console.error(`Starting scrape session ${sessionId} with ${validated.actions.length} actions on ${validated.url}`);
       }
       // Check concurrent sessions limit
@@ -734,7 +734,7 @@ export class ScrapeWithActionsTool extends EventEmitter {
   log(level, message) {
     if (this.enableLogging) {
-      console.log(`[ScrapeWithActionsTool:${level.toUpperCase()}] ${message}`);
+      console.error(`[ScrapeWithActionsTool:${level.toUpperCase()}] ${message}`);
     }
   }

package/src/tools/extract/extractContent.js CHANGED Viewed

@@ -138,7 +138,7 @@ export class ExtractContentTool {
       const shouldUseJavaScript = options.requiresJavaScript || await this.shouldUseJavaScript(url);
       if (shouldUseJavaScript) {
-        console.log('Using browser rendering for JavaScript content...');
+        console.error('Using browser rendering for JavaScript content...');
         const browserResult = await this.browserProcessor.processURL({
           url,
           options: {

package/src/tools/extract/processDocument.js CHANGED Viewed

@@ -250,7 +250,7 @@ export class ProcessDocumentTool {
     const shouldUseJavaScript = options.requiresJavaScript || await this.shouldUseJavaScript(source);
     if (shouldUseJavaScript) {
-      console.log('Using browser rendering for JavaScript content...');
+      console.error('Using browser rendering for JavaScript content...');
       const browserResult = await this.browserProcessor.processURL({
         url: source,
         options: {