npm - crawlforge-mcp-server - Versions diffs - 4.6.4 → 4.6.6 - Mend

crawlforge-mcp-server 4.6.4 → 4.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/README.md +30 -0
package/package.json +5 -2
package/server.js +1 -1
package/src/core/ResearchOrchestrator.js +251 -34
package/src/tools/extract/extractContent.js +5 -0
package/src/tools/research/deepResearch.js +5 -1

package/README.md CHANGED Viewed

@@ -229,6 +229,11 @@ export OLLAMA_DEFAULT_MODEL="llama3.2"             # default; any locally-pulled
 # Optional: Cloud LLM keys — only needed when you pass provider: "openai" or "anthropic"
 export OPENAI_API_KEY="sk-..."
 export ANTHROPIC_API_KEY="sk-ant-..."
+# Optional: deep_research stealth extraction fallback (v4.6.6) — see below
+export RESEARCH_STEALTH_ENGINE="auto"      # auto (default) | camoufox | chromium
+export RESEARCH_STEALTH_FALLBACK="true"    # set to "false" to disable entirely
+export RESEARCH_MAX_STEALTH_RETRIES="8"    # cap on stealth retries per research run
 ```
 ### Local-LLM quickstart (`extract_with_llm` with Ollama)
@@ -247,6 +252,31 @@ ollama pull llama3.2
 #    extract_with_llm({ url: "https://example.com", prompt: "…", model: "llama3.2" })
 ```
+### Stealth extraction for `deep_research` (Camoufox)
+`deep_research` automatically retries sources that block the normal fetch path (Reddit, Quora, forums, and Cloudflare/DataDome-protected pages return HTTP 403) through a **real fingerprinted browser**, then re-extracts from the rendered HTML. It's bounded (`RESEARCH_MAX_STEALTH_RETRIES`, default 8, plus a per-page timeout) and lazy — the browser stack only loads when a source is actually blocked.
+Engine selection (`RESEARCH_STEALTH_ENGINE`):
+- **`auto`** (default) — prefer **Camoufox** (Firefox anti-detect), fall back to Chromium stealth, then plain fetch.
+- **`camoufox`** — force Camoufox.
+- **`chromium`** — force the Chromium stealth engine.
+Headless Chromium **cannot** clear modern challenges (Cloudflare Turnstile, DataDome) — **Camoufox can**. In testing it recovered Quora and Trustpilot pages that were otherwise fully blocked. To enable it, install the optional dependency and run its one-time binary fetch:
+```bash
+# Camoufox is declared as an optional dependency, so a normal install already pulls it.
+# If you installed with --no-optional, add it explicitly:
+npm install camoufox
+# One-time download of the Camoufox Firefox binary (~130 MB):
+npx camoufox fetch
+```
+Without the Camoufox binary, `deep_research` silently falls back to Chromium stealth and then to plain fetch — no errors, just lower recovery on heavily-protected sites. Disable the whole fallback with `RESEARCH_STEALTH_FALLBACK=false`.
+> **Note:** Hard IP-reputation blocks (e.g. Reddit's edge `403`) resist headless stealth from any IP and require residential/mobile proxies, which CrawlForge does not provide. See [docs/stealth-engines.md](docs/stealth-engines.md) for details.
 ### Manual Configuration
 Your configuration is stored at `~/.crawlforge/config.json`:

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "crawlforge-mcp-server",
-  "version": "4.6.4",
+  "version": "4.6.6",
   "mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
   "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
   "main": "server.js",
@@ -18,7 +18,7 @@
     "test": "node tests/integration/mcp-protocol-compliance.test.js",
     "test:unit": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/unit/*.test.js'",
     "test:integration": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/integration/tools/*.test.js'",
-    "test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
+    "test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test --test-force-exit 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
     "test:tools": "node test-tools.js",
     "test:real-world": "node test-real-world.js",
     "test:all": "bash run-all-tests.sh",
@@ -131,6 +131,9 @@
     "winston": "^3.11.0",
     "zod": "^3.23.8"
   },
+  "optionalDependencies": {
+    "camoufox": "^0.1.19"
+  },
   "devDependencies": {
     "@jest/globals": "^30.3.0",
     "c8": "^11.0.0",

package/server.js CHANGED Viewed

@@ -90,7 +90,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
 // Create the server
 const server = new McpServer({
   name: "crawlforge",
-  version: "4.6.4",
+  version: "4.6.6",
   description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
   homepage: "https://www.crawlforge.dev",
   icon: "https://www.crawlforge.dev/icon.png"

package/src/core/ResearchOrchestrator.js CHANGED Viewed

@@ -34,12 +34,27 @@ export class ResearchOrchestrator extends EventEmitter {
       enableConflictDetection = true,
       cacheEnabled = true,
       cacheTTL = 1800000, // 30 minutes
+      researchApproach = 'broad',
+      // Stealth-browser fallback for sources that block the plain fetch/extract
+      // path (Reddit, Quora, forums → HTTP 403). On by default; bounded so it
+      // cannot blow the research time budget. Disable with
+      // RESEARCH_STEALTH_FALLBACK=false.
+      enableStealthFallback = process.env.RESEARCH_STEALTH_FALLBACK !== 'false',
+      maxStealthRetries = parseInt(process.env.RESEARCH_MAX_STEALTH_RETRIES || '8', 10),
+      // 'auto' (default) prefers Camoufox (Firefox anti-detect — beats
+      // Cloudflare/DataDome that headless Chromium can't) and falls back to
+      // Chromium stealth when Camoufox/its binary is unavailable. Force one
+      // with RESEARCH_STEALTH_ENGINE=camoufox|chromium.
+      stealthEngine = process.env.RESEARCH_STEALTH_ENGINE || 'auto',
+      stealthLevel = 'medium',
+      stealthTimeoutMs = 20000,
       searchConfig = {},
       crawlConfig = {},
       extractConfig = {},
       summarizeConfig = {}
     } = options;
+    this.researchApproach = researchApproach;
     this.maxDepth = Math.min(Math.max(1, maxDepth), 10);
     this.maxUrls = Math.min(Math.max(1, maxUrls), 1000);
     this.timeLimit = Math.min(Math.max(30000, timeLimit), 300000);
@@ -47,6 +62,18 @@ export class ResearchOrchestrator extends EventEmitter {
     this.enableSourceVerification = enableSourceVerification;
     this.enableConflictDetection = enableConflictDetection;
+    // Stealth fallback config + lazy state (browser launched only on first block)
+    this.enableStealthFallback = enableStealthFallback;
+    this.maxStealthRetries = Math.max(0, maxStealthRetries);
+    this.stealthEngine = stealthEngine;
+    this.stealthLevel = stealthLevel;
+    this.stealthTimeoutMs = stealthTimeoutMs;
+    this._stealthManager = null;     // Chromium StealthBrowserManager (fallback engine)
+    this._stealthBrowser = null;     // Camoufox browser handle (preferred engine)
+    this._stealthEngineActive = null;
+    this._stealthInit = null;
+    this._stealthCount = 0;
     // Initialize tools
     this.searchTool = new SearchWebTool(searchConfig);
     this.crawlTool = new CrawlDeepTool(crawlConfig);
@@ -99,7 +126,9 @@ export class ResearchOrchestrator extends EventEmitter {
       llmAnalysisCalls: 0,
       semanticAnalysisTime: 0,
       queryExpansionTime: 0,
-      synthesisTime: 0
+      synthesisTime: 0,
+      stealthRetries: 0,
+      stealthRecovered: 0
     };
   }
@@ -201,6 +230,9 @@ export class ResearchOrchestrator extends EventEmitter {
     Object.keys(this.metrics).forEach(key => {
       this.metrics[key] = 0;
     });
+    // Reset per-run stealth-retry budget
+    this._stealthCount = 0;
   }
   /**
@@ -269,32 +301,50 @@ export class ResearchOrchestrator extends EventEmitter {
   }
   /**
-   * Generate research-specific query variations
+   * Generate research-specific query variations, tuned to the research approach.
+   *
+   * Academic/scientific suffixes ("peer reviewed", "research paper", "what is")
+   * only help when the caller actually asked for an academic search. Appending
+   * them to commercial or comparative topics dragged web search toward
+   * irrelevant government/academic PDFs and long-tail noise — the cause of
+   * near-empty research runs on niche commercial topics.
    */
   generateResearchVariations(topic) {
-    const variations = [];
-    // Question-based variations
-    variations.push(`what is ${topic}`);
-    variations.push(`how does ${topic} work`);
-    variations.push(`${topic} explained`);
-    variations.push(`${topic} research`);
-    variations.push(`${topic} studies`);
-    variations.push(`${topic} analysis`);
-    // Academic and authoritative variations
-    variations.push(`${topic} academic`);
-    variations.push(`${topic} scientific`);
-    variations.push(`${topic} research paper`);
-    variations.push(`${topic} peer reviewed`);
-    // Current and historical context
-    variations.push(`latest ${topic}`);
-    variations.push(`current ${topic}`);
-    variations.push(`${topic} 2024`);
-    variations.push(`${topic} trends`);
-    return variations.slice(0, 10); // Limit variations
+    const approach = this.researchApproach || 'broad';
+    if (approach === 'academic') {
+      return [
+        `${topic} research`,
+        `${topic} study`,
+        `${topic} analysis`,
+        `${topic} academic`,
+        `${topic} scientific`,
+        `${topic} research paper`,
+        `${topic} peer reviewed`,
+        `${topic} explained`
+      ];
+    }
+    if (approach === 'current_events') {
+      return [
+        `latest ${topic}`,
+        `${topic} news`,
+        `recent ${topic}`,
+        `${topic} update`,
+        `${topic} announcement`
+      ];
+    }
+    // broad / focused / comparative — commercial & general intent
+    return [
+      `${topic} review`,
+      `${topic} reviews`,
+      `${topic} comparison`,
+      `${topic} vs alternatives`,
+      `${topic} pricing`,
+      `best ${topic}`,
+      `${topic} company`
+    ];
   }
   /**
@@ -531,11 +581,38 @@ export class ResearchOrchestrator extends EventEmitter {
             }
             // Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
-            const contentText = contentData && contentData.content
-              ? (typeof contentData.content === 'string'
-                  ? contentData.content
-                  : (contentData.content.text || ''))
+            const normalizeContent = (cd) => cd && cd.content
+              ? (typeof cd.content === 'string' ? cd.content : (cd.content.text || ''))
               : '';
+            let contentText = normalizeContent(contentData);
+            // Stealth fallback: high-value discussion sources (Reddit, Quora,
+            // forums) return HTTP 403 to the plain fetch/extract path. When the
+            // normal path produced no usable content, retry through a real
+            // fingerprinted browser and re-run extraction on the rendered HTML.
+            // Bounded by maxStealthRetries + a per-page timeout.
+            const blocked = !contentData || contentData.success === false || contentText.trim().length === 0;
+            if (blocked && this.enableStealthFallback && this._stealthCount < this.maxStealthRetries) {
+              this._stealthCount++;
+              this.metrics.stealthRetries++;
+              try {
+                const stealthHtml = await this._stealthFetchHtml(source.link);
+                if (stealthHtml) {
+                  contentData = await this.extractTool.execute({
+                    url: source.link,
+                    html: stealthHtml,
+                    options: { includeMetadata: true, includeStructuredData: true }
+                  });
+                  contentText = normalizeContent(contentData);
+                  if (contentData && contentData.success !== false && contentText.trim().length > 0) {
+                    this.metrics.stealthRecovered++;
+                    this.logActivity('stealth_recovery', { url: source.link });
+                  }
+                }
+              } catch (stealthError) {
+                this.logger.warn('Stealth fallback failed', { url: source.link, error: stealthError.message });
+              }
+            }
             // Only count and enhance sources that actually produced non-empty content.
             // Skip failed extractions and empty {text:""} results.
@@ -621,10 +698,134 @@ export class ResearchOrchestrator extends EventEmitter {
       }
     });
+    // Tear down the stealth browser as soon as the extraction stage is done —
+    // it is only needed here and would otherwise leak a Playwright handle.
+    await this._closeStealth();
     // Sort by relevance score (LLM or traditional)
     return detailedFindings.sort((a, b) => (b.relevanceScore || 0) - (a.relevanceScore || 0));
   }
+  /**
+   * Lazily launch the stealth browser once. The heavy browser stack is only
+   * loaded when a source actually blocks the plain path. Engine selection:
+   *   - 'camoufox'/'auto' → Camoufox (Firefox anti-detect). Loaded via the CJS
+   *     build (its ESM bundle has a broken dynamic-require). Beats Cloudflare/
+   *     DataDome challenges that patched headless Chromium can't pass.
+   *   - 'chromium', or any Camoufox failure under 'auto' → StealthBrowserManager.
+   */
+  async _getStealthBrowser() {
+    if (!this._stealthInit) {
+      this._stealthInit = (async () => {
+        if (this.stealthEngine === 'camoufox' || this.stealthEngine === 'auto') {
+          try {
+            const { createRequire } = await import('module');
+            const require = createRequire(import.meta.url);
+            const camoufox = require('camoufox'); // CJS build — ESM build is broken
+            await this._ensureCamoufoxLayout(camoufox);
+            this._stealthBrowser = await camoufox.Camoufox({ headless: true });
+            this._stealthEngineActive = 'camoufox';
+            this.logger.info('Stealth fallback using Camoufox (Firefox) engine');
+            return;
+          } catch (e) {
+            if (this.stealthEngine === 'camoufox') throw e; // explicit request → surface
+            this.logger.warn('Camoufox unavailable, falling back to Chromium stealth', { error: e.message });
+          }
+        }
+        const { StealthBrowserManager } = await import('./StealthBrowserManager.js');
+        this._stealthManager = new StealthBrowserManager();
+        await this._stealthManager.launchStealthBrowser({ level: this.stealthLevel });
+        this._stealthEngineActive = 'chromium';
+      })();
+    }
+    await this._stealthInit;
+  }
+  /**
+   * macOS packaging fix for camoufox-js: it expects properties.json in
+   * Camoufox.app/Contents/MacOS/, but the .app bundle ships it under
+   * Contents/Resources/. Bridge it so the launcher can boot. Best-effort.
+   */
+  async _ensureCamoufoxLayout(camoufox) {
+    if (process.platform !== 'darwin' || !camoufox?.INSTALL_DIR) return;
+    try {
+      const fs = await import('fs');
+      const path = await import('path');
+      const appDir = path.join(camoufox.INSTALL_DIR, 'Camoufox.app', 'Contents');
+      const target = path.join(appDir, 'MacOS', 'properties.json');
+      const source = path.join(appDir, 'Resources', 'properties.json');
+      if (!fs.existsSync(target) && fs.existsSync(source)) {
+        fs.copyFileSync(source, target);
+      }
+    } catch { /* best-effort; launch surfaces a real error if it matters */ }
+  }
+  /**
+   * Fetch a URL's fully-rendered HTML through the stealth browser. Returns the
+   * HTML string, or null if every attempt was blocked / empty.
+   *
+   * Cloudflare/DataDome challenges are probabilistic — the same URL may serve a
+   * challenge on one load and the real page on the next — so Camoufox retries a
+   * few times with a fresh page each attempt. Chromium can't clear these at all
+   * (proven), so it gets a single attempt to avoid burning the time budget.
+   */
+  async _stealthFetchHtml(url) {
+    await this._getStealthBrowser();
+    const attempts = this._stealthEngineActive === 'camoufox' ? 3 : 1;
+    for (let i = 0; i < attempts; i++) {
+      const html = await this._stealthFetchOnce(url);
+      if (html) return html;
+    }
+    return null;
+  }
+  /** One stealth navigation. Fresh page/context; judges blocked by rendered content. */
+  async _stealthFetchOnce(url) {
+    let page;
+    if (this._stealthEngineActive === 'camoufox') {
+      page = await this._stealthBrowser.newPage();
+    } else {
+      const { contextId } = await this._stealthManager.createStealthContext({ level: this.stealthLevel });
+      page = await this._stealthManager.createStealthPage(contextId);
+    }
+    try {
+      const resp = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: this.stealthTimeoutMs });
+      // Do NOT bail on the initial HTTP status: anti-bot challenges (Cloudflare
+      // Turnstile) return 403 on the first response and only resolve to the
+      // real page after their JS runs. Let it settle, then judge by the
+      // *rendered* content instead.
+      await page.waitForLoadState('networkidle', { timeout: 8000 }).catch(() => {});
+      await page.waitForTimeout(2500).catch(() => {});
+      const html = await page.content();
+      const title = (await page.title().catch(() => '')) || '';
+      const bodyLen = await page.evaluate(() => document.body?.innerText?.trim().length || 0).catch(() => 0);
+      // Still a challenge/block page → treat as blocked.
+      const challengeTitle = /just a moment|checking your browser|attention required|verify you are human|access denied|^blocked$/i.test(title);
+      const status = resp ? resp.status() : 0;
+      if (challengeTitle) return null;
+      if (status >= 400 && bodyLen < 500) return null; // hard block (e.g. Reddit 403 shell)
+      if (bodyLen < 200) return null;                  // empty / interstitial
+      return html && html.length > 200 ? html : null;
+    } finally {
+      await page.close().catch(() => {});
+    }
+  }
+  /** Close the stealth browser and reset its lazy state (idempotent). */
+  async _closeStealth() {
+    try {
+      if (this._stealthBrowser) await this._stealthBrowser.close().catch(() => {});
+      if (this._stealthManager) await this._stealthManager.cleanup().catch(() => {});
+    } catch (e) {
+      this.logger.warn('Stealth browser cleanup failed', { error: e.message });
+    }
+    this._stealthBrowser = null;
+    this._stealthManager = null;
+    this._stealthEngineActive = null;
+    this._stealthInit = null;
+  }
   /**
    * Verify source credibility using multiple factors
    */
@@ -644,8 +845,19 @@ export class ResearchOrchestrator extends EventEmitter {
           citationPotential: this.assessCitationPotential(source)
         };
-        const overallCredibility = this.calculateOverallCredibility(credibilityFactors);
+        let overallCredibility = this.calculateOverallCredibility(credibilityFactors);
+        // Down-weight topically-irrelevant sources so high-authority but
+        // off-topic pages (e.g. a .gov PDF unrelated to the query) don't
+        // dominate the results. relevanceScore is keyword-based here (no LLM):
+        // ~1 when the topic appears in the content, ~0 when it doesn't.
+        const relevance = typeof source.relevanceScore === 'number'
+          ? source.relevanceScore
+          : null;
+        if (relevance !== null) {
+          overallCredibility *= (0.4 + 0.6 * relevance);
+        }
         // Only include sources that meet minimum credibility threshold
         if (overallCredibility >= 0.3) {
           verifiedSources.push({
@@ -1453,7 +1665,10 @@ export class ResearchOrchestrator extends EventEmitter {
     try {
       // Stop any active research
       this.stopResearch();
+      // Tear down the stealth browser if one was launched
+      await this._closeStealth();
       // Clear cache if available
       if (this.cache && typeof this.cache.clear === "function") {
         await this.cache.clear();
@@ -1491,9 +1706,11 @@ export class ResearchOrchestrator extends EventEmitter {
         llmAnalysisCalls: 0,
         semanticAnalysisTime: 0,
         queryExpansionTime: 0,
-        synthesisTime: 0
+        synthesisTime: 0,
+        stealthRetries: 0,
+        stealthRecovered: 0
       };
     } catch (error) {
       // Silent cleanup - do not throw errors during cleanup
       console.warn("Warning during ResearchOrchestrator cleanup:", error.message);

package/src/tools/extract/extractContent.js CHANGED Viewed

@@ -11,6 +11,11 @@ import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
 const ExtractContentSchema = z.object({
   url: z.string().url(),
+  // Pre-rendered HTML to process directly instead of fetching `url` (e.g. a
+  // post-action page from scrape_with_actions, or a stealth-browser render in
+  // deep_research). Without this field Zod stripped it and the tool always
+  // re-fetched the URL — silently defeating any pre-fetched-HTML caller.
+  html: z.string().optional(),
   options: z.object({
     // Content extraction options
     useReadability: z.boolean().default(true),

package/src/tools/research/deepResearch.js CHANGED Viewed

@@ -271,7 +271,11 @@ export class DeepResearchTool {
     const scopeConfig = {
       maxUrls: params.maxUrls,
       timeLimit: params.timeLimit,
-      concurrency: params.concurrency
+      concurrency: params.concurrency,
+      // The orchestrator tunes its query expansion to the approach (commercial
+      // vs academic vs current-events); without this it always used academic
+      // variations, which poisoned commercial/comparative searches.
+      researchApproach: params.researchApproach
     };
     switch (params.researchApproach) {