crawlforge-mcp-server 4.6.5 → 4.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -229,6 +229,11 @@ export OLLAMA_DEFAULT_MODEL="llama3.2" # default; any locally-pulled
229
229
  # Optional: Cloud LLM keys — only needed when you pass provider: "openai" or "anthropic"
230
230
  export OPENAI_API_KEY="sk-..."
231
231
  export ANTHROPIC_API_KEY="sk-ant-..."
232
+
233
+ # Optional: deep_research stealth extraction fallback (v4.6.6) — see below
234
+ export RESEARCH_STEALTH_ENGINE="auto" # auto (default) | camoufox | chromium
235
+ export RESEARCH_STEALTH_FALLBACK="true" # set to "false" to disable entirely
236
+ export RESEARCH_MAX_STEALTH_RETRIES="8" # cap on stealth retries per research run
232
237
  ```
233
238
 
234
239
  ### Local-LLM quickstart (`extract_with_llm` with Ollama)
@@ -247,6 +252,31 @@ ollama pull llama3.2
247
252
  # extract_with_llm({ url: "https://example.com", prompt: "…", model: "llama3.2" })
248
253
  ```
249
254
 
255
+ ### Stealth extraction for `deep_research` (Camoufox)
256
+
257
+ `deep_research` automatically retries sources that block the normal fetch path — Reddit, Quora, forums, and Cloudflare/DataDome-protected pages that return HTTP 403 — through a **real fingerprinted browser**, then re-extracts from the rendered HTML. The retry is bounded (`RESEARCH_MAX_STEALTH_RETRIES`, default 8, plus a per-page timeout) and lazy — the browser stack only loads when a source is actually blocked.
258
+
259
+ Engine selection (`RESEARCH_STEALTH_ENGINE`):
260
+
261
+ - **`auto`** (default) — prefer **Camoufox** (Firefox anti-detect), fall back to Chromium stealth, then plain fetch.
262
+ - **`camoufox`** — force Camoufox.
263
+ - **`chromium`** — force the Chromium stealth engine.
264
+
265
+ Headless Chromium **cannot** clear modern challenges (Cloudflare Turnstile, DataDome) — **Camoufox can**. In testing, Camoufox recovered Quora and Trustpilot pages that were otherwise fully blocked. To enable Camoufox, make sure the optional dependency is installed and run its one-time binary fetch:
266
+
267
+ ```bash
268
+ # Camoufox is declared as an optional dependency, so a normal install already pulls it.
269
+ # If you installed with --no-optional, add it explicitly:
270
+ npm install camoufox
271
+
272
+ # One-time download of the Camoufox Firefox binary (~130 MB):
273
+ npx camoufox fetch
274
+ ```
275
+
276
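+ Without the Camoufox binary, `deep_research` in the default `auto` mode falls back to Chromium stealth and then to plain fetch — it logs a warning but raises no errors, just lower recovery on heavily-protected sites. (With `RESEARCH_STEALTH_ENGINE=camoufox` there is no Chromium fallback: the stealth retry for that source fails and is logged as a warning.) Disable the whole fallback with `RESEARCH_STEALTH_FALLBACK=false`.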
277
+
278
+ > **Note:** Hard IP-reputation blocks (e.g. Reddit's edge `403`) resist headless stealth from any IP and require residential/mobile proxies, which CrawlForge does not provide. See [docs/stealth-engines.md](docs/stealth-engines.md) for details.
279
+
250
280
  ### Manual Configuration
251
281
 
252
282
  Your configuration is stored at `~/.crawlforge/config.json`:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "crawlforge-mcp-server",
3
- "version": "4.6.5",
3
+ "version": "4.6.6",
4
4
  "mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
5
5
  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
6
6
  "main": "server.js",
@@ -131,6 +131,9 @@
131
131
  "winston": "^3.11.0",
132
132
  "zod": "^3.23.8"
133
133
  },
134
+ "optionalDependencies": {
135
+ "camoufox": "^0.1.19"
136
+ },
134
137
  "devDependencies": {
135
138
  "@jest/globals": "^30.3.0",
136
139
  "c8": "^11.0.0",
package/server.js CHANGED
@@ -90,7 +90,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
90
90
  // Create the server
91
91
  const server = new McpServer({
92
92
  name: "crawlforge",
93
- version: "4.6.5",
93
+ version: "4.6.6",
94
94
  description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
95
95
  homepage: "https://www.crawlforge.dev",
96
96
  icon: "https://www.crawlforge.dev/icon.png"
@@ -35,6 +35,19 @@ export class ResearchOrchestrator extends EventEmitter {
35
35
  cacheEnabled = true,
36
36
  cacheTTL = 1800000, // 30 minutes
37
37
  researchApproach = 'broad',
38
+ // Stealth-browser fallback for sources that block the plain fetch/extract
39
+ // path (Reddit, Quora, forums → HTTP 403). On by default; bounded so it
40
+ // cannot blow the research time budget. Disable with
41
+ // RESEARCH_STEALTH_FALLBACK=false.
42
+ enableStealthFallback = process.env.RESEARCH_STEALTH_FALLBACK !== 'false',
43
+ maxStealthRetries = parseInt(process.env.RESEARCH_MAX_STEALTH_RETRIES || '8', 10),
44
+ // 'auto' (default) prefers Camoufox (Firefox anti-detect — beats
45
+ // Cloudflare/DataDome that headless Chromium can't) and falls back to
46
+ // Chromium stealth when Camoufox/its binary is unavailable. Force one
47
+ // with RESEARCH_STEALTH_ENGINE=camoufox|chromium.
48
+ stealthEngine = process.env.RESEARCH_STEALTH_ENGINE || 'auto',
49
+ stealthLevel = 'medium',
50
+ stealthTimeoutMs = 20000,
38
51
  searchConfig = {},
39
52
  crawlConfig = {},
40
53
  extractConfig = {},
@@ -49,6 +62,18 @@ export class ResearchOrchestrator extends EventEmitter {
49
62
  this.enableSourceVerification = enableSourceVerification;
50
63
  this.enableConflictDetection = enableConflictDetection;
51
64
 
65
+ // Stealth fallback config + lazy state (browser launched only on first block)
66
+ this.enableStealthFallback = enableStealthFallback;
67
+ this.maxStealthRetries = Math.max(0, maxStealthRetries);
68
+ this.stealthEngine = stealthEngine;
69
+ this.stealthLevel = stealthLevel;
70
+ this.stealthTimeoutMs = stealthTimeoutMs;
71
+ this._stealthManager = null; // Chromium StealthBrowserManager (fallback engine)
72
+ this._stealthBrowser = null; // Camoufox browser handle (preferred engine)
73
+ this._stealthEngineActive = null;
74
+ this._stealthInit = null;
75
+ this._stealthCount = 0;
76
+
52
77
  // Initialize tools
53
78
  this.searchTool = new SearchWebTool(searchConfig);
54
79
  this.crawlTool = new CrawlDeepTool(crawlConfig);
@@ -101,7 +126,9 @@ export class ResearchOrchestrator extends EventEmitter {
101
126
  llmAnalysisCalls: 0,
102
127
  semanticAnalysisTime: 0,
103
128
  queryExpansionTime: 0,
104
- synthesisTime: 0
129
+ synthesisTime: 0,
130
+ stealthRetries: 0,
131
+ stealthRecovered: 0
105
132
  };
106
133
  }
107
134
 
@@ -203,6 +230,9 @@ export class ResearchOrchestrator extends EventEmitter {
203
230
  Object.keys(this.metrics).forEach(key => {
204
231
  this.metrics[key] = 0;
205
232
  });
233
+
234
+ // Reset per-run stealth-retry budget
235
+ this._stealthCount = 0;
206
236
  }
207
237
 
208
238
  /**
@@ -551,11 +581,38 @@ export class ResearchOrchestrator extends EventEmitter {
551
581
  }
552
582
 
553
583
  // Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
554
- const contentText = contentData && contentData.content
555
- ? (typeof contentData.content === 'string'
556
- ? contentData.content
557
- : (contentData.content.text || ''))
584
+ const normalizeContent = (cd) => cd && cd.content
585
+ ? (typeof cd.content === 'string' ? cd.content : (cd.content.text || ''))
558
586
  : '';
587
+ let contentText = normalizeContent(contentData);
588
+
589
+ // Stealth fallback: high-value discussion sources (Reddit, Quora,
590
+ // forums) return HTTP 403 to the plain fetch/extract path. When the
591
+ // normal path produced no usable content, retry through a real
592
+ // fingerprinted browser and re-run extraction on the rendered HTML.
593
+ // Bounded by maxStealthRetries + a per-page timeout.
594
+ const blocked = !contentData || contentData.success === false || contentText.trim().length === 0;
595
+ if (blocked && this.enableStealthFallback && this._stealthCount < this.maxStealthRetries) {
596
+ this._stealthCount++;
597
+ this.metrics.stealthRetries++;
598
+ try {
599
+ const stealthHtml = await this._stealthFetchHtml(source.link);
600
+ if (stealthHtml) {
601
+ contentData = await this.extractTool.execute({
602
+ url: source.link,
603
+ html: stealthHtml,
604
+ options: { includeMetadata: true, includeStructuredData: true }
605
+ });
606
+ contentText = normalizeContent(contentData);
607
+ if (contentData && contentData.success !== false && contentText.trim().length > 0) {
608
+ this.metrics.stealthRecovered++;
609
+ this.logActivity('stealth_recovery', { url: source.link });
610
+ }
611
+ }
612
+ } catch (stealthError) {
613
+ this.logger.warn('Stealth fallback failed', { url: source.link, error: stealthError.message });
614
+ }
615
+ }
559
616
 
560
617
  // Only count and enhance sources that actually produced non-empty content.
561
618
  // Skip failed extractions and empty {text:""} results.
@@ -641,10 +698,134 @@ export class ResearchOrchestrator extends EventEmitter {
641
698
  }
642
699
  });
643
700
 
701
+ // Tear down the stealth browser as soon as the extraction stage is done —
702
+ // it is only needed here and would otherwise leak a Playwright handle.
703
+ await this._closeStealth();
704
+
644
705
  // Sort by relevance score (LLM or traditional)
645
706
  return detailedFindings.sort((a, b) => (b.relevanceScore || 0) - (a.relevanceScore || 0));
646
707
  }
647
708
 
709
+ /**
710
+ * Lazily launch the stealth browser once. The heavy browser stack is only
711
+ * loaded when a source actually blocks the plain path. Engine selection:
712
+ * - 'camoufox'/'auto' → Camoufox (Firefox anti-detect). Loaded via the CJS
713
+ * build (its ESM bundle has a broken dynamic-require). Beats Cloudflare/
714
+ * DataDome challenges that patched headless Chromium can't pass.
715
+ * - 'chromium', or any Camoufox failure under 'auto' → StealthBrowserManager.
716
+ */
717
+ async _getStealthBrowser() {
718
+ if (!this._stealthInit) {
719
+ this._stealthInit = (async () => {
720
+ if (this.stealthEngine === 'camoufox' || this.stealthEngine === 'auto') {
721
+ try {
722
+ const { createRequire } = await import('module');
723
+ const require = createRequire(import.meta.url);
724
+ const camoufox = require('camoufox'); // CJS build — ESM build is broken
725
+ await this._ensureCamoufoxLayout(camoufox);
726
+ this._stealthBrowser = await camoufox.Camoufox({ headless: true });
727
+ this._stealthEngineActive = 'camoufox';
728
+ this.logger.info('Stealth fallback using Camoufox (Firefox) engine');
729
+ return;
730
+ } catch (e) {
731
+ if (this.stealthEngine === 'camoufox') throw e; // explicit request → surface
732
+ this.logger.warn('Camoufox unavailable, falling back to Chromium stealth', { error: e.message });
733
+ }
734
+ }
735
+ const { StealthBrowserManager } = await import('./StealthBrowserManager.js');
736
+ this._stealthManager = new StealthBrowserManager();
737
+ await this._stealthManager.launchStealthBrowser({ level: this.stealthLevel });
738
+ this._stealthEngineActive = 'chromium';
739
+ })();
740
+ }
741
+ await this._stealthInit;
742
+ }
743
+
744
+ /**
745
+ * macOS packaging fix for camoufox-js: it expects properties.json in
746
+ * Camoufox.app/Contents/MacOS/, but the .app bundle ships it under
747
+ * Contents/Resources/. Bridge it so the launcher can boot. Best-effort.
748
+ */
749
+ async _ensureCamoufoxLayout(camoufox) {
750
+ if (process.platform !== 'darwin' || !camoufox?.INSTALL_DIR) return;
751
+ try {
752
+ const fs = await import('fs');
753
+ const path = await import('path');
754
+ const appDir = path.join(camoufox.INSTALL_DIR, 'Camoufox.app', 'Contents');
755
+ const target = path.join(appDir, 'MacOS', 'properties.json');
756
+ const source = path.join(appDir, 'Resources', 'properties.json');
757
+ if (!fs.existsSync(target) && fs.existsSync(source)) {
758
+ fs.copyFileSync(source, target);
759
+ }
760
+ } catch { /* best-effort; launch surfaces a real error if it matters */ }
761
+ }
762
+
763
+ /**
764
+ * Fetch a URL's fully-rendered HTML through the stealth browser. Returns the
765
+ * HTML string, or null if every attempt was blocked / empty.
766
+ *
767
+ * Cloudflare/DataDome challenges are probabilistic — the same URL may serve a
768
+ * challenge on one load and the real page on the next — so Camoufox retries a
769
+ * few times with a fresh page each attempt. Chromium can't clear these at all
770
+ * (proven), so it gets a single attempt to avoid burning the time budget.
771
+ */
772
+ async _stealthFetchHtml(url) {
773
+ await this._getStealthBrowser();
774
+ const attempts = this._stealthEngineActive === 'camoufox' ? 3 : 1;
775
+ for (let i = 0; i < attempts; i++) {
776
+ const html = await this._stealthFetchOnce(url);
777
+ if (html) return html;
778
+ }
779
+ return null;
780
+ }
781
+
782
+ /** One stealth navigation. Fresh page/context; judges blocked by rendered content. */
783
+ async _stealthFetchOnce(url) {
784
+ let page;
785
+ if (this._stealthEngineActive === 'camoufox') {
786
+ page = await this._stealthBrowser.newPage();
787
+ } else {
788
+ const { contextId } = await this._stealthManager.createStealthContext({ level: this.stealthLevel });
789
+ page = await this._stealthManager.createStealthPage(contextId);
790
+ }
791
+ try {
792
+ const resp = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: this.stealthTimeoutMs });
793
+ // Do NOT bail on the initial HTTP status: anti-bot challenges (Cloudflare
794
+ // Turnstile) return 403 on the first response and only resolve to the
795
+ // real page after their JS runs. Let it settle, then judge by the
796
+ // *rendered* content instead.
797
+ await page.waitForLoadState('networkidle', { timeout: 8000 }).catch(() => {});
798
+ await page.waitForTimeout(2500).catch(() => {});
799
+ const html = await page.content();
800
+ const title = (await page.title().catch(() => '')) || '';
801
+ const bodyLen = await page.evaluate(() => document.body?.innerText?.trim().length || 0).catch(() => 0);
802
+
803
+ // Still a challenge/block page → treat as blocked.
804
+ const challengeTitle = /just a moment|checking your browser|attention required|verify you are human|access denied|^blocked$/i.test(title);
805
+ const status = resp ? resp.status() : 0;
806
+ if (challengeTitle) return null;
807
+ if (status >= 400 && bodyLen < 500) return null; // hard block (e.g. Reddit 403 shell)
808
+ if (bodyLen < 200) return null; // empty / interstitial
809
+ return html && html.length > 200 ? html : null;
810
+ } finally {
811
+ await page.close().catch(() => {});
812
+ }
813
+ }
814
+
815
+ /** Close the stealth browser and reset its lazy state (idempotent). */
816
+ async _closeStealth() {
817
+ try {
818
+ if (this._stealthBrowser) await this._stealthBrowser.close().catch(() => {});
819
+ if (this._stealthManager) await this._stealthManager.cleanup().catch(() => {});
820
+ } catch (e) {
821
+ this.logger.warn('Stealth browser cleanup failed', { error: e.message });
822
+ }
823
+ this._stealthBrowser = null;
824
+ this._stealthManager = null;
825
+ this._stealthEngineActive = null;
826
+ this._stealthInit = null;
827
+ }
828
+
648
829
  /**
649
830
  * Verify source credibility using multiple factors
650
831
  */
@@ -1484,7 +1665,10 @@ export class ResearchOrchestrator extends EventEmitter {
1484
1665
  try {
1485
1666
  // Stop any active research
1486
1667
  this.stopResearch();
1487
-
1668
+
1669
+ // Tear down the stealth browser if one was launched
1670
+ await this._closeStealth();
1671
+
1488
1672
  // Clear cache if available
1489
1673
  if (this.cache && typeof this.cache.clear === "function") {
1490
1674
  await this.cache.clear();
@@ -1522,9 +1706,11 @@ export class ResearchOrchestrator extends EventEmitter {
1522
1706
  llmAnalysisCalls: 0,
1523
1707
  semanticAnalysisTime: 0,
1524
1708
  queryExpansionTime: 0,
1525
- synthesisTime: 0
1709
+ synthesisTime: 0,
1710
+ stealthRetries: 0,
1711
+ stealthRecovered: 0
1526
1712
  };
1527
-
1713
+
1528
1714
  } catch (error) {
1529
1715
  // Silent cleanup - do not throw errors during cleanup
1530
1716
  console.warn("Warning during ResearchOrchestrator cleanup:", error.message);
@@ -11,6 +11,11 @@ import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
11
11
 
12
12
  const ExtractContentSchema = z.object({
13
13
  url: z.string().url(),
14
+ // Pre-rendered HTML to process directly instead of fetching `url` (e.g. a
15
+ // post-action page from scrape_with_actions, or a stealth-browser render in
16
+ // deep_research). Without this field Zod stripped it and the tool always
17
+ // re-fetched the URL — silently defeating any pre-fetched-HTML caller.
18
+ html: z.string().optional(),
14
19
  options: z.object({
15
20
  // Content extraction options
16
21
  useReadability: z.boolean().default(true),