crawlforge-mcp-server 4.6.4 → 4.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -0
- package/package.json +5 -2
- package/server.js +1 -1
- package/src/core/ResearchOrchestrator.js +251 -34
- package/src/tools/extract/extractContent.js +5 -0
- package/src/tools/research/deepResearch.js +5 -1
package/README.md
CHANGED
|
@@ -229,6 +229,11 @@ export OLLAMA_DEFAULT_MODEL="llama3.2" # default; any locally-pulled
|
|
|
229
229
|
# Optional: Cloud LLM keys — only needed when you pass provider: "openai" or "anthropic"
|
|
230
230
|
export OPENAI_API_KEY="sk-..."
|
|
231
231
|
export ANTHROPIC_API_KEY="sk-ant-..."
|
|
232
|
+
|
|
233
|
+
# Optional: deep_research stealth extraction fallback (v4.6.6) — see below
|
|
234
|
+
export RESEARCH_STEALTH_ENGINE="auto" # auto (default) | camoufox | chromium
|
|
235
|
+
export RESEARCH_STEALTH_FALLBACK="true" # set to "false" to disable entirely
|
|
236
|
+
export RESEARCH_MAX_STEALTH_RETRIES="8" # cap on stealth retries per research run
|
|
232
237
|
```
|
|
233
238
|
|
|
234
239
|
### Local-LLM quickstart (`extract_with_llm` with Ollama)
|
|
@@ -247,6 +252,31 @@ ollama pull llama3.2
|
|
|
247
252
|
# extract_with_llm({ url: "https://example.com", prompt: "…", model: "llama3.2" })
|
|
248
253
|
```
|
|
249
254
|
|
|
255
|
+
### Stealth extraction for `deep_research` (Camoufox)
|
|
256
|
+
|
|
257
|
+
`deep_research` automatically retries sources that block the normal fetch path (Reddit, Quora, forums, and Cloudflare/DataDome-protected pages return HTTP 403) through a **real fingerprinted browser**, then re-extracts from the rendered HTML. It's bounded (`RESEARCH_MAX_STEALTH_RETRIES`, default 8, plus a per-page timeout) and lazy — the browser stack only loads when a source is actually blocked.
|
|
258
|
+
|
|
259
|
+
Engine selection (`RESEARCH_STEALTH_ENGINE`):
|
|
260
|
+
|
|
261
|
+
- **`auto`** (default) — prefer **Camoufox** (Firefox anti-detect), fall back to Chromium stealth, then plain fetch.
|
|
262
|
+
- **`camoufox`** — force Camoufox.
|
|
263
|
+
- **`chromium`** — force the Chromium stealth engine.
|
|
264
|
+
|
|
265
|
+
Headless Chromium **cannot** clear modern challenges (Cloudflare Turnstile, DataDome) — **Camoufox can**. In testing it recovered Quora and Trustpilot pages that were otherwise fully blocked. To enable it, install the optional dependency and run its one-time binary fetch:
|
|
266
|
+
|
|
267
|
+
```bash
|
|
268
|
+
# Camoufox is declared as an optional dependency, so a normal install already pulls it.
|
|
269
|
+
# If you installed with --no-optional, add it explicitly:
|
|
270
|
+
npm install camoufox
|
|
271
|
+
|
|
272
|
+
# One-time download of the Camoufox Firefox binary (~130 MB):
|
|
273
|
+
npx camoufox fetch
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
Without the Camoufox binary, `deep_research` silently falls back to Chromium stealth and then to plain fetch — no errors, just lower recovery on heavily-protected sites. Disable the whole fallback with `RESEARCH_STEALTH_FALLBACK=false`.
|
|
277
|
+
|
|
278
|
+
> **Note:** Hard IP-reputation blocks (e.g. Reddit's edge `403`) resist headless stealth from any IP and require residential/mobile proxies, which CrawlForge does not provide. See [docs/stealth-engines.md](docs/stealth-engines.md) for details.
|
|
279
|
+
|
|
250
280
|
### Manual Configuration
|
|
251
281
|
|
|
252
282
|
Your configuration is stored at `~/.crawlforge/config.json`:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "4.6.
|
|
3
|
+
"version": "4.6.6",
|
|
4
4
|
"mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
|
|
5
5
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
|
|
6
6
|
"main": "server.js",
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
"test": "node tests/integration/mcp-protocol-compliance.test.js",
|
|
19
19
|
"test:unit": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/unit/*.test.js'",
|
|
20
20
|
"test:integration": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/integration/tools/*.test.js'",
|
|
21
|
-
"test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
|
|
21
|
+
"test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test --test-force-exit 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
|
|
22
22
|
"test:tools": "node test-tools.js",
|
|
23
23
|
"test:real-world": "node test-real-world.js",
|
|
24
24
|
"test:all": "bash run-all-tests.sh",
|
|
@@ -131,6 +131,9 @@
|
|
|
131
131
|
"winston": "^3.11.0",
|
|
132
132
|
"zod": "^3.23.8"
|
|
133
133
|
},
|
|
134
|
+
"optionalDependencies": {
|
|
135
|
+
"camoufox": "^0.1.19"
|
|
136
|
+
},
|
|
134
137
|
"devDependencies": {
|
|
135
138
|
"@jest/globals": "^30.3.0",
|
|
136
139
|
"c8": "^11.0.0",
|
package/server.js
CHANGED
|
@@ -90,7 +90,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
|
|
|
90
90
|
// Create the server
|
|
91
91
|
const server = new McpServer({
|
|
92
92
|
name: "crawlforge",
|
|
93
|
-
version: "4.6.
|
|
93
|
+
version: "4.6.6",
|
|
94
94
|
description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
|
|
95
95
|
homepage: "https://www.crawlforge.dev",
|
|
96
96
|
icon: "https://www.crawlforge.dev/icon.png"
|
|
package/src/core/ResearchOrchestrator.js
CHANGED
|
@@ -34,12 +34,27 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
34
34
|
enableConflictDetection = true,
|
|
35
35
|
cacheEnabled = true,
|
|
36
36
|
cacheTTL = 1800000, // 30 minutes
|
|
37
|
+
researchApproach = 'broad',
|
|
38
|
+
// Stealth-browser fallback for sources that block the plain fetch/extract
|
|
39
|
+
// path (Reddit, Quora, forums → HTTP 403). On by default; bounded so it
|
|
40
|
+
// cannot blow the research time budget. Disable with
|
|
41
|
+
// RESEARCH_STEALTH_FALLBACK=false.
|
|
42
|
+
enableStealthFallback = process.env.RESEARCH_STEALTH_FALLBACK !== 'false',
|
|
43
|
+
maxStealthRetries = parseInt(process.env.RESEARCH_MAX_STEALTH_RETRIES || '8', 10),
|
|
44
|
+
// 'auto' (default) prefers Camoufox (Firefox anti-detect — beats
|
|
45
|
+
// Cloudflare/DataDome that headless Chromium can't) and falls back to
|
|
46
|
+
// Chromium stealth when Camoufox/its binary is unavailable. Force one
|
|
47
|
+
// with RESEARCH_STEALTH_ENGINE=camoufox|chromium.
|
|
48
|
+
stealthEngine = process.env.RESEARCH_STEALTH_ENGINE || 'auto',
|
|
49
|
+
stealthLevel = 'medium',
|
|
50
|
+
stealthTimeoutMs = 20000,
|
|
37
51
|
searchConfig = {},
|
|
38
52
|
crawlConfig = {},
|
|
39
53
|
extractConfig = {},
|
|
40
54
|
summarizeConfig = {}
|
|
41
55
|
} = options;
|
|
42
56
|
|
|
57
|
+
this.researchApproach = researchApproach;
|
|
43
58
|
this.maxDepth = Math.min(Math.max(1, maxDepth), 10);
|
|
44
59
|
this.maxUrls = Math.min(Math.max(1, maxUrls), 1000);
|
|
45
60
|
this.timeLimit = Math.min(Math.max(30000, timeLimit), 300000);
|
|
@@ -47,6 +62,18 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
47
62
|
this.enableSourceVerification = enableSourceVerification;
|
|
48
63
|
this.enableConflictDetection = enableConflictDetection;
|
|
49
64
|
|
|
65
|
+
// Stealth fallback config + lazy state (browser launched only on first block)
|
|
66
|
+
this.enableStealthFallback = enableStealthFallback;
|
|
67
|
+
this.maxStealthRetries = Math.max(0, maxStealthRetries);
|
|
68
|
+
this.stealthEngine = stealthEngine;
|
|
69
|
+
this.stealthLevel = stealthLevel;
|
|
70
|
+
this.stealthTimeoutMs = stealthTimeoutMs;
|
|
71
|
+
this._stealthManager = null; // Chromium StealthBrowserManager (fallback engine)
|
|
72
|
+
this._stealthBrowser = null; // Camoufox browser handle (preferred engine)
|
|
73
|
+
this._stealthEngineActive = null;
|
|
74
|
+
this._stealthInit = null;
|
|
75
|
+
this._stealthCount = 0;
|
|
76
|
+
|
|
50
77
|
// Initialize tools
|
|
51
78
|
this.searchTool = new SearchWebTool(searchConfig);
|
|
52
79
|
this.crawlTool = new CrawlDeepTool(crawlConfig);
|
|
@@ -99,7 +126,9 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
99
126
|
llmAnalysisCalls: 0,
|
|
100
127
|
semanticAnalysisTime: 0,
|
|
101
128
|
queryExpansionTime: 0,
|
|
102
|
-
synthesisTime: 0
|
|
129
|
+
synthesisTime: 0,
|
|
130
|
+
stealthRetries: 0,
|
|
131
|
+
stealthRecovered: 0
|
|
103
132
|
};
|
|
104
133
|
}
|
|
105
134
|
|
|
@@ -201,6 +230,9 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
201
230
|
Object.keys(this.metrics).forEach(key => {
|
|
202
231
|
this.metrics[key] = 0;
|
|
203
232
|
});
|
|
233
|
+
|
|
234
|
+
// Reset per-run stealth-retry budget
|
|
235
|
+
this._stealthCount = 0;
|
|
204
236
|
}
|
|
205
237
|
|
|
206
238
|
/**
|
|
@@ -269,32 +301,50 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
269
301
|
}
|
|
270
302
|
|
|
271
303
|
/**
|
|
272
|
-
* Generate research-specific query variations
|
|
304
|
+
* Generate research-specific query variations, tuned to the research approach.
|
|
305
|
+
*
|
|
306
|
+
* Academic/scientific suffixes ("peer reviewed", "research paper", "what is")
|
|
307
|
+
* only help when the caller actually asked for an academic search. Appending
|
|
308
|
+
* them to commercial or comparative topics dragged web search toward
|
|
309
|
+
* irrelevant government/academic PDFs and long-tail noise — the cause of
|
|
310
|
+
* near-empty research runs on niche commercial topics.
|
|
273
311
|
*/
|
|
274
312
|
generateResearchVariations(topic) {
|
|
275
|
-
const
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
313
|
+
const approach = this.researchApproach || 'broad';
|
|
314
|
+
|
|
315
|
+
if (approach === 'academic') {
|
|
316
|
+
return [
|
|
317
|
+
`${topic} research`,
|
|
318
|
+
`${topic} study`,
|
|
319
|
+
`${topic} analysis`,
|
|
320
|
+
`${topic} academic`,
|
|
321
|
+
`${topic} scientific`,
|
|
322
|
+
`${topic} research paper`,
|
|
323
|
+
`${topic} peer reviewed`,
|
|
324
|
+
`${topic} explained`
|
|
325
|
+
];
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
if (approach === 'current_events') {
|
|
329
|
+
return [
|
|
330
|
+
`latest ${topic}`,
|
|
331
|
+
`${topic} news`,
|
|
332
|
+
`recent ${topic}`,
|
|
333
|
+
`${topic} update`,
|
|
334
|
+
`${topic} announcement`
|
|
335
|
+
];
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
// broad / focused / comparative — commercial & general intent
|
|
339
|
+
return [
|
|
340
|
+
`${topic} review`,
|
|
341
|
+
`${topic} reviews`,
|
|
342
|
+
`${topic} comparison`,
|
|
343
|
+
`${topic} vs alternatives`,
|
|
344
|
+
`${topic} pricing`,
|
|
345
|
+
`best ${topic}`,
|
|
346
|
+
`${topic} company`
|
|
347
|
+
];
|
|
298
348
|
}
|
|
299
349
|
|
|
300
350
|
/**
|
|
@@ -531,11 +581,38 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
531
581
|
}
|
|
532
582
|
|
|
533
583
|
// Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
|
|
534
|
-
const
|
|
535
|
-
? (typeof
|
|
536
|
-
? contentData.content
|
|
537
|
-
: (contentData.content.text || ''))
|
|
584
|
+
const normalizeContent = (cd) => cd && cd.content
|
|
585
|
+
? (typeof cd.content === 'string' ? cd.content : (cd.content.text || ''))
|
|
538
586
|
: '';
|
|
587
|
+
let contentText = normalizeContent(contentData);
|
|
588
|
+
|
|
589
|
+
// Stealth fallback: high-value discussion sources (Reddit, Quora,
|
|
590
|
+
// forums) return HTTP 403 to the plain fetch/extract path. When the
|
|
591
|
+
// normal path produced no usable content, retry through a real
|
|
592
|
+
// fingerprinted browser and re-run extraction on the rendered HTML.
|
|
593
|
+
// Bounded by maxStealthRetries + a per-page timeout.
|
|
594
|
+
const blocked = !contentData || contentData.success === false || contentText.trim().length === 0;
|
|
595
|
+
if (blocked && this.enableStealthFallback && this._stealthCount < this.maxStealthRetries) {
|
|
596
|
+
this._stealthCount++;
|
|
597
|
+
this.metrics.stealthRetries++;
|
|
598
|
+
try {
|
|
599
|
+
const stealthHtml = await this._stealthFetchHtml(source.link);
|
|
600
|
+
if (stealthHtml) {
|
|
601
|
+
contentData = await this.extractTool.execute({
|
|
602
|
+
url: source.link,
|
|
603
|
+
html: stealthHtml,
|
|
604
|
+
options: { includeMetadata: true, includeStructuredData: true }
|
|
605
|
+
});
|
|
606
|
+
contentText = normalizeContent(contentData);
|
|
607
|
+
if (contentData && contentData.success !== false && contentText.trim().length > 0) {
|
|
608
|
+
this.metrics.stealthRecovered++;
|
|
609
|
+
this.logActivity('stealth_recovery', { url: source.link });
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
} catch (stealthError) {
|
|
613
|
+
this.logger.warn('Stealth fallback failed', { url: source.link, error: stealthError.message });
|
|
614
|
+
}
|
|
615
|
+
}
|
|
539
616
|
|
|
540
617
|
// Only count and enhance sources that actually produced non-empty content.
|
|
541
618
|
// Skip failed extractions and empty {text:""} results.
|
|
@@ -621,10 +698,134 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
621
698
|
}
|
|
622
699
|
});
|
|
623
700
|
|
|
701
|
+
// Tear down the stealth browser as soon as the extraction stage is done —
|
|
702
|
+
// it is only needed here and would otherwise leak a Playwright handle.
|
|
703
|
+
await this._closeStealth();
|
|
704
|
+
|
|
624
705
|
// Sort by relevance score (LLM or traditional)
|
|
625
706
|
return detailedFindings.sort((a, b) => (b.relevanceScore || 0) - (a.relevanceScore || 0));
|
|
626
707
|
}
|
|
627
708
|
|
|
709
|
+
/**
|
|
710
|
+
* Lazily launch the stealth browser once. The heavy browser stack is only
|
|
711
|
+
* loaded when a source actually blocks the plain path. Engine selection:
|
|
712
|
+
* - 'camoufox'/'auto' → Camoufox (Firefox anti-detect). Loaded via the CJS
|
|
713
|
+
* build (its ESM bundle has a broken dynamic-require). Beats Cloudflare/
|
|
714
|
+
* DataDome challenges that patched headless Chromium can't pass.
|
|
715
|
+
* - 'chromium', or any Camoufox failure under 'auto' → StealthBrowserManager.
|
|
716
|
+
*/
|
|
717
|
+
async _getStealthBrowser() {
|
|
718
|
+
if (!this._stealthInit) {
|
|
719
|
+
this._stealthInit = (async () => {
|
|
720
|
+
if (this.stealthEngine === 'camoufox' || this.stealthEngine === 'auto') {
|
|
721
|
+
try {
|
|
722
|
+
const { createRequire } = await import('module');
|
|
723
|
+
const require = createRequire(import.meta.url);
|
|
724
|
+
const camoufox = require('camoufox'); // CJS build — ESM build is broken
|
|
725
|
+
await this._ensureCamoufoxLayout(camoufox);
|
|
726
|
+
this._stealthBrowser = await camoufox.Camoufox({ headless: true });
|
|
727
|
+
this._stealthEngineActive = 'camoufox';
|
|
728
|
+
this.logger.info('Stealth fallback using Camoufox (Firefox) engine');
|
|
729
|
+
return;
|
|
730
|
+
} catch (e) {
|
|
731
|
+
if (this.stealthEngine === 'camoufox') throw e; // explicit request → surface
|
|
732
|
+
this.logger.warn('Camoufox unavailable, falling back to Chromium stealth', { error: e.message });
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
const { StealthBrowserManager } = await import('./StealthBrowserManager.js');
|
|
736
|
+
this._stealthManager = new StealthBrowserManager();
|
|
737
|
+
await this._stealthManager.launchStealthBrowser({ level: this.stealthLevel });
|
|
738
|
+
this._stealthEngineActive = 'chromium';
|
|
739
|
+
})();
|
|
740
|
+
}
|
|
741
|
+
await this._stealthInit;
|
|
742
|
+
}
|
|
743
|
+
|
|
744
|
+
/**
|
|
745
|
+
* macOS packaging fix for camoufox-js: it expects properties.json in
|
|
746
|
+
* Camoufox.app/Contents/MacOS/, but the .app bundle ships it under
|
|
747
|
+
* Contents/Resources/. Bridge it so the launcher can boot. Best-effort.
|
|
748
|
+
*/
|
|
749
|
+
async _ensureCamoufoxLayout(camoufox) {
|
|
750
|
+
if (process.platform !== 'darwin' || !camoufox?.INSTALL_DIR) return;
|
|
751
|
+
try {
|
|
752
|
+
const fs = await import('fs');
|
|
753
|
+
const path = await import('path');
|
|
754
|
+
const appDir = path.join(camoufox.INSTALL_DIR, 'Camoufox.app', 'Contents');
|
|
755
|
+
const target = path.join(appDir, 'MacOS', 'properties.json');
|
|
756
|
+
const source = path.join(appDir, 'Resources', 'properties.json');
|
|
757
|
+
if (!fs.existsSync(target) && fs.existsSync(source)) {
|
|
758
|
+
fs.copyFileSync(source, target);
|
|
759
|
+
}
|
|
760
|
+
} catch { /* best-effort; launch surfaces a real error if it matters */ }
|
|
761
|
+
}
|
|
762
|
+
|
|
763
|
+
/**
|
|
764
|
+
* Fetch a URL's fully-rendered HTML through the stealth browser. Returns the
|
|
765
|
+
* HTML string, or null if every attempt was blocked / empty.
|
|
766
|
+
*
|
|
767
|
+
* Cloudflare/DataDome challenges are probabilistic — the same URL may serve a
|
|
768
|
+
* challenge on one load and the real page on the next — so Camoufox retries a
|
|
769
|
+
* few times with a fresh page each attempt. Chromium can't clear these at all
|
|
770
|
+
* (proven), so it gets a single attempt to avoid burning the time budget.
|
|
771
|
+
*/
|
|
772
|
+
async _stealthFetchHtml(url) {
|
|
773
|
+
await this._getStealthBrowser();
|
|
774
|
+
const attempts = this._stealthEngineActive === 'camoufox' ? 3 : 1;
|
|
775
|
+
for (let i = 0; i < attempts; i++) {
|
|
776
|
+
const html = await this._stealthFetchOnce(url);
|
|
777
|
+
if (html) return html;
|
|
778
|
+
}
|
|
779
|
+
return null;
|
|
780
|
+
}
|
|
781
|
+
|
|
782
|
+
/** One stealth navigation. Fresh page/context; judges blocked by rendered content. */
|
|
783
|
+
async _stealthFetchOnce(url) {
|
|
784
|
+
let page;
|
|
785
|
+
if (this._stealthEngineActive === 'camoufox') {
|
|
786
|
+
page = await this._stealthBrowser.newPage();
|
|
787
|
+
} else {
|
|
788
|
+
const { contextId } = await this._stealthManager.createStealthContext({ level: this.stealthLevel });
|
|
789
|
+
page = await this._stealthManager.createStealthPage(contextId);
|
|
790
|
+
}
|
|
791
|
+
try {
|
|
792
|
+
const resp = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: this.stealthTimeoutMs });
|
|
793
|
+
// Do NOT bail on the initial HTTP status: anti-bot challenges (Cloudflare
|
|
794
|
+
// Turnstile) return 403 on the first response and only resolve to the
|
|
795
|
+
// real page after their JS runs. Let it settle, then judge by the
|
|
796
|
+
// *rendered* content instead.
|
|
797
|
+
await page.waitForLoadState('networkidle', { timeout: 8000 }).catch(() => {});
|
|
798
|
+
await page.waitForTimeout(2500).catch(() => {});
|
|
799
|
+
const html = await page.content();
|
|
800
|
+
const title = (await page.title().catch(() => '')) || '';
|
|
801
|
+
const bodyLen = await page.evaluate(() => document.body?.innerText?.trim().length || 0).catch(() => 0);
|
|
802
|
+
|
|
803
|
+
// Still a challenge/block page → treat as blocked.
|
|
804
|
+
const challengeTitle = /just a moment|checking your browser|attention required|verify you are human|access denied|^blocked$/i.test(title);
|
|
805
|
+
const status = resp ? resp.status() : 0;
|
|
806
|
+
if (challengeTitle) return null;
|
|
807
|
+
if (status >= 400 && bodyLen < 500) return null; // hard block (e.g. Reddit 403 shell)
|
|
808
|
+
if (bodyLen < 200) return null; // empty / interstitial
|
|
809
|
+
return html && html.length > 200 ? html : null;
|
|
810
|
+
} finally {
|
|
811
|
+
await page.close().catch(() => {});
|
|
812
|
+
}
|
|
813
|
+
}
|
|
814
|
+
|
|
815
|
+
/** Close the stealth browser and reset its lazy state (idempotent). */
|
|
816
|
+
async _closeStealth() {
|
|
817
|
+
try {
|
|
818
|
+
if (this._stealthBrowser) await this._stealthBrowser.close().catch(() => {});
|
|
819
|
+
if (this._stealthManager) await this._stealthManager.cleanup().catch(() => {});
|
|
820
|
+
} catch (e) {
|
|
821
|
+
this.logger.warn('Stealth browser cleanup failed', { error: e.message });
|
|
822
|
+
}
|
|
823
|
+
this._stealthBrowser = null;
|
|
824
|
+
this._stealthManager = null;
|
|
825
|
+
this._stealthEngineActive = null;
|
|
826
|
+
this._stealthInit = null;
|
|
827
|
+
}
|
|
828
|
+
|
|
628
829
|
/**
|
|
629
830
|
* Verify source credibility using multiple factors
|
|
630
831
|
*/
|
|
@@ -644,8 +845,19 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
644
845
|
citationPotential: this.assessCitationPotential(source)
|
|
645
846
|
};
|
|
646
847
|
|
|
647
|
-
|
|
648
|
-
|
|
848
|
+
let overallCredibility = this.calculateOverallCredibility(credibilityFactors);
|
|
849
|
+
|
|
850
|
+
// Down-weight topically-irrelevant sources so high-authority but
|
|
851
|
+
// off-topic pages (e.g. a .gov PDF unrelated to the query) don't
|
|
852
|
+
// dominate the results. relevanceScore is keyword-based here (no LLM):
|
|
853
|
+
// ~1 when the topic appears in the content, ~0 when it doesn't.
|
|
854
|
+
const relevance = typeof source.relevanceScore === 'number'
|
|
855
|
+
? source.relevanceScore
|
|
856
|
+
: null;
|
|
857
|
+
if (relevance !== null) {
|
|
858
|
+
overallCredibility *= (0.4 + 0.6 * relevance);
|
|
859
|
+
}
|
|
860
|
+
|
|
649
861
|
// Only include sources that meet minimum credibility threshold
|
|
650
862
|
if (overallCredibility >= 0.3) {
|
|
651
863
|
verifiedSources.push({
|
|
@@ -1453,7 +1665,10 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
1453
1665
|
try {
|
|
1454
1666
|
// Stop any active research
|
|
1455
1667
|
this.stopResearch();
|
|
1456
|
-
|
|
1668
|
+
|
|
1669
|
+
// Tear down the stealth browser if one was launched
|
|
1670
|
+
await this._closeStealth();
|
|
1671
|
+
|
|
1457
1672
|
// Clear cache if available
|
|
1458
1673
|
if (this.cache && typeof this.cache.clear === "function") {
|
|
1459
1674
|
await this.cache.clear();
|
|
@@ -1491,9 +1706,11 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
1491
1706
|
llmAnalysisCalls: 0,
|
|
1492
1707
|
semanticAnalysisTime: 0,
|
|
1493
1708
|
queryExpansionTime: 0,
|
|
1494
|
-
synthesisTime: 0
|
|
1709
|
+
synthesisTime: 0,
|
|
1710
|
+
stealthRetries: 0,
|
|
1711
|
+
stealthRecovered: 0
|
|
1495
1712
|
};
|
|
1496
|
-
|
|
1713
|
+
|
|
1497
1714
|
} catch (error) {
|
|
1498
1715
|
// Silent cleanup - do not throw errors during cleanup
|
|
1499
1716
|
console.warn("Warning during ResearchOrchestrator cleanup:", error.message);
|
|
package/src/tools/extract/extractContent.js
CHANGED
|
@@ -11,6 +11,11 @@ import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
|
|
|
11
11
|
|
|
12
12
|
const ExtractContentSchema = z.object({
|
|
13
13
|
url: z.string().url(),
|
|
14
|
+
// Pre-rendered HTML to process directly instead of fetching `url` (e.g. a
|
|
15
|
+
// post-action page from scrape_with_actions, or a stealth-browser render in
|
|
16
|
+
// deep_research). Without this field Zod stripped it and the tool always
|
|
17
|
+
// re-fetched the URL — silently defeating any pre-fetched-HTML caller.
|
|
18
|
+
html: z.string().optional(),
|
|
14
19
|
options: z.object({
|
|
15
20
|
// Content extraction options
|
|
16
21
|
useReadability: z.boolean().default(true),
|
|
package/src/tools/research/deepResearch.js
CHANGED
|
@@ -271,7 +271,11 @@ export class DeepResearchTool {
|
|
|
271
271
|
const scopeConfig = {
|
|
272
272
|
maxUrls: params.maxUrls,
|
|
273
273
|
timeLimit: params.timeLimit,
|
|
274
|
-
concurrency: params.concurrency
|
|
274
|
+
concurrency: params.concurrency,
|
|
275
|
+
// The orchestrator tunes its query expansion to the approach (commercial
|
|
276
|
+
// vs academic vs current-events); without this it always used academic
|
|
277
|
+
// variations, which poisoned commercial/comparative searches.
|
|
278
|
+
researchApproach: params.researchApproach
|
|
275
279
|
};
|
|
276
280
|
|
|
277
281
|
switch (params.researchApproach) {
|