crawlforge-mcp-server 3.4.0 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -2
- package/package.json +6 -4
- package/server.js +166 -32
- package/src/cli/commands/actions.js +36 -0
- package/src/cli/commands/analyze.js +19 -0
- package/src/cli/commands/batch.js +45 -0
- package/src/cli/commands/crawl.js +30 -0
- package/src/cli/commands/extract.js +45 -0
- package/src/cli/commands/install-skills.js +46 -0
- package/src/cli/commands/llmstxt.js +24 -0
- package/src/cli/commands/localize.js +29 -0
- package/src/cli/commands/map.js +26 -0
- package/src/cli/commands/monitor.js +29 -0
- package/src/cli/commands/research.js +26 -0
- package/src/cli/commands/scrape.js +37 -0
- package/src/cli/commands/search.js +28 -0
- package/src/cli/commands/stealth.js +29 -0
- package/src/cli/commands/template.js +26 -0
- package/src/cli/commands/track.js +24 -0
- package/src/cli/commands/uninstall-skills.js +35 -0
- package/src/cli/formatter.js +57 -0
- package/src/cli/index.js +94 -0
- package/src/cli/lib/runTool.js +40 -0
- package/src/core/ActionExecutor.js +8 -6
- package/src/core/AuthManager.js +103 -3
- package/src/core/ChangeTracker.js +34 -0
- package/src/core/ElicitationHelper.js +112 -0
- package/src/core/JobManager.js +36 -2
- package/src/core/LocalizationManager.js +19 -5
- package/src/core/PerformanceManager.js +53 -17
- package/src/core/ResearchOrchestrator.js +40 -5
- package/src/core/SamplingClient.js +191 -0
- package/src/core/StealthBrowserManager.js +248 -2
- package/src/core/WebhookDispatcher.js +18 -10
- package/src/prompts/PromptRegistry.js +199 -0
- package/src/resources/ResourceRegistry.js +273 -0
- package/src/server/transports/streamableHttp.js +6 -6
- package/src/server/withAuth.js +25 -0
- package/src/skills/crawlforge-cli.md +157 -0
- package/src/skills/crawlforge-mcp.md +80 -0
- package/src/skills/crawlforge-research.md +104 -0
- package/src/skills/crawlforge-stealth.md +98 -0
- package/src/skills/installer.js +141 -0
- package/src/tools/advanced/batchScrape/index.js +30 -0
- package/src/tools/advanced/batchScrape/schema.js +1 -1
- package/src/tools/basic/extractText.js +19 -8
- package/src/tools/crawl/crawlDeep.js +27 -0
- package/src/tools/extract/extractContent.js +5 -17
- package/src/tools/extract/extractStructured.js +8 -0
- package/src/tools/extract/extractWithLlm.js +35 -25
- package/src/tools/extract/listOllamaModels.js +66 -0
- package/src/tools/extract/processDocument.js +7 -1
- package/src/tools/extract/summarizeContent.js +17 -0
- package/src/tools/research/deepResearch.js +34 -0
- package/src/tools/templates/ScrapeTemplateTool.js +68 -0
- package/src/tools/templates/TemplateRegistry.js +311 -0
- package/src/utils/Logger.js +15 -0
- package/src/utils/htmlToMarkdown.js +54 -0
- package/src/utils/secretMask.js +86 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SamplingClient — MCP Sampling wrapper for CrawlForge
|
|
3
|
+
*
|
|
4
|
+
* Allows tools to request LLM completions from the connected MCP client
|
|
5
|
+
* instead of holding server-side API keys.
|
|
6
|
+
*
|
|
7
|
+
* Fallback chain (applied in resolveCompletion):
|
|
8
|
+
* 1. Ollama (local, no API key needed)
|
|
9
|
+
* 2. Server-side API key (OPENAI_API_KEY / ANTHROPIC_API_KEY)
|
|
10
|
+
* 3. MCP sampling request to client
|
|
11
|
+
* 4. Error
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/** Model name used when neither the caller nor the OLLAMA_DEFAULT_MODEL env var supplies one. */
const OLLAMA_DEFAULT_MODEL = 'llama3.2';

/**
 * Resolve the Ollama base URL.
 *
 * Read from the environment on every call (not cached at import time), falling
 * back to the local default. All trailing slashes are stripped so that
 * `${OLLAMA_BASE_URL()}/api/...` never contains a double slash, even when the
 * configured value ends in more than one `/`.
 *
 * @returns {string} base URL without trailing slashes
 */
const OLLAMA_BASE_URL = () => (process.env.OLLAMA_BASE_URL || 'http://localhost:11434').replace(/\/+$/, '');
|
|
16
|
+
|
|
17
|
+
/**
 * Ask a local Ollama server for a single, non-streamed completion.
 *
 * The model is chosen in this order: `options.model`, the OLLAMA_DEFAULT_MODEL
 * environment variable, then the module-level default. The request is aborted
 * after 30 seconds.
 *
 * @param {string} prompt - Text sent as the `prompt` field of `/api/generate`.
 * @param {object} [options]
 * @param {string} [options.model] - Model name override.
 * @param {number} [options.maxTokens] - Forwarded as `options.num_predict` when truthy.
 * @returns {Promise<string>} The `response` field of the Ollama reply.
 * @throws {Error} On a non-2xx status or when the reply carries no `response` text.
 */
async function tryOllama(prompt, { model, maxTokens } = {}) {
  const payload = {
    model: model || process.env.OLLAMA_DEFAULT_MODEL || OLLAMA_DEFAULT_MODEL,
    prompt,
    stream: false,
  };
  // Only send generation options when a token limit was actually requested.
  if (maxTokens) {
    payload.options = { num_predict: maxTokens };
  }

  const endpoint = `${OLLAMA_BASE_URL()}/api/generate`;
  const reply = await fetch(endpoint, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(30_000),
  });

  if (!reply.ok) {
    throw new Error(`Ollama HTTP ${reply.status}`);
  }

  const parsed = await reply.json();
  const generated = parsed.response;
  if (!generated) {
    throw new Error('Ollama returned empty response');
  }
  return generated;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Attempt an OpenAI chat completion using the server-side API key.
 *
 * The prompt is sent as a single `user` message. The request is aborted after
 * 30 seconds.
 *
 * `OPENAI_BASE_URL` is accepted both without a version segment
 * (`https://api.openai.com`) and with one (`https://api.openai.com/v1`, the
 * form used by the official OpenAI SDKs); `/v1` is only appended when the
 * configured base does not already end with it.
 *
 * @param {string} prompt
 * @param {object} [options]
 * @param {string} [options.model] - Model name; defaults to 'gpt-4o-mini'.
 * @param {number} [options.maxTokens] - Sent as `max_tokens` when truthy.
 * @returns {Promise<string>} Non-empty text of the first choice.
 * @throws {Error} When OPENAI_API_KEY is unset, the HTTP status is not 2xx,
 *   or the reply contains no text. Throwing on an empty reply matches
 *   tryOllama, so a caller running a fallback chain moves on to the next
 *   provider instead of returning a blank answer.
 */
async function tryOpenAI(prompt, { model, maxTokens } = {}) {
  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) throw new Error('OPENAI_API_KEY not set');

  const base = (process.env.OPENAI_BASE_URL || 'https://api.openai.com').replace(/\/+$/, '');
  // Avoid producing ".../v1/v1/chat/completions" for SDK-style base URLs.
  const endpoint = base.endsWith('/v1')
    ? `${base}/chat/completions`
    : `${base}/v1/chat/completions`;

  const res = await fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      model: model || 'gpt-4o-mini',
      messages: [{ role: 'user', content: prompt }],
      ...(maxTokens ? { max_tokens: maxTokens } : {}),
    }),
    signal: AbortSignal.timeout(30_000),
  });
  if (!res.ok) throw new Error(`OpenAI HTTP ${res.status}`);

  const data = await res.json();
  const text = data?.choices?.[0]?.message?.content;
  if (!text) throw new Error('OpenAI returned empty response');
  return text;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Attempt an Anthropic Messages completion using the server-side API key.
 *
 * The prompt is sent as a single `user` message to `/v1/messages`. The request
 * is aborted after 30 seconds.
 *
 * @param {string} prompt
 * @param {object} [options]
 * @param {string} [options.model] - Model name; defaults to 'claude-haiku-4-5-20251001'.
 * @param {number} [options.maxTokens] - `max_tokens` for the request; defaults to 1024.
 * @returns {Promise<string>} Text of the first content block that carries non-empty text.
 * @throws {Error} When ANTHROPIC_API_KEY is unset, the HTTP status is not 2xx,
 *   or no content block carries text. Throwing on an empty reply matches
 *   tryOllama, so a caller running a fallback chain moves on to the next
 *   provider instead of returning a blank answer.
 */
async function tryAnthropic(prompt, { model, maxTokens } = {}) {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) throw new Error('ANTHROPIC_API_KEY not set');

  const base = (process.env.ANTHROPIC_BASE_URL || 'https://api.anthropic.com').replace(/\/+$/, '');
  const res = await fetch(`${base}/v1/messages`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-api-key': apiKey,
      'anthropic-version': '2023-06-01',
    },
    body: JSON.stringify({
      model: model || 'claude-haiku-4-5-20251001',
      max_tokens: maxTokens || 1024,
      messages: [{ role: 'user', content: prompt }],
    }),
    signal: AbortSignal.timeout(30_000),
  });
  if (!res.ok) throw new Error(`Anthropic HTTP ${res.status}`);

  const data = await res.json();
  // Do not assume the first block is text: scan for the first block that
  // actually has a non-empty string `text` field.
  const blocks = Array.isArray(data?.content) ? data.content : [];
  const textBlock = blocks.find((block) => typeof block?.text === 'string' && block.text.length > 0);
  if (!textBlock) throw new Error('Anthropic returned empty response');
  return textBlock.text;
}
|
|
99
|
+
|
|
100
|
+
/**
 * Resolves LLM completions through a fixed fallback chain:
 * Ollama → server-side API key (OpenAI, then Anthropic) → MCP sampling → error.
 */
export class SamplingClient {
  /**
   * @param {object} [options]
   * @param {object|null} [options.mcpServer] - MCP server wrapper. Sampling is
   *   only attempted when `mcpServer.server.createMessage` exists; otherwise
   *   the sampling step is skipped.
   */
  constructor({ mcpServer } = {}) {
    this._mcpServer = mcpServer || null;
  }

  /**
   * Resolve an LLM completion using the fallback chain:
   *   Ollama → API key (OpenAI then Anthropic) → MCP sampling → error
   *
   * A provider that throws is skipped and the next one is tried. The reasons
   * are not lost: when every provider fails, the thrown error's `cause` is an
   * AggregateError whose `errors` list one entry per failed provider.
   *
   * @param {string} prompt - The prompt to complete
   * @param {object} [options]
   * @param {string} [options.model] - Override model name
   * @param {number} [options.maxTokens] - Max tokens for response
   * @param {string} [options.systemPrompt] - Optional instruction; it is
   *   prepended to the prompt, separated by a blank line
   * @returns {Promise<{ text: string, provider: string }>}
   * @throws {Error} When no provider produced a completion
   */
  async complete(prompt, options = {}) {
    const fullPrompt = options.systemPrompt
      ? `${options.systemPrompt}\n\n${prompt}`
      : prompt;

    // Collected so the final error can explain why each step was skipped.
    const failures = [];
    const recordFailure = (provider, err) => {
      const detail = err instanceof Error ? err.message : String(err);
      failures.push(new Error(`${provider}: ${detail}`, { cause: err }));
    };

    // 1. Try Ollama (local, no API key)
    try {
      const text = await tryOllama(fullPrompt, options);
      return { text, provider: 'ollama' };
    } catch (ollamaErr) {
      recordFailure('ollama', ollamaErr);
    }

    // 2. Try server-side API keys
    if (process.env.OPENAI_API_KEY) {
      try {
        const text = await tryOpenAI(fullPrompt, options);
        return { text, provider: 'openai' };
      } catch (openaiErr) {
        recordFailure('openai', openaiErr);
      }
    }

    if (process.env.ANTHROPIC_API_KEY) {
      try {
        const text = await tryAnthropic(fullPrompt, options);
        return { text, provider: 'anthropic' };
      } catch (anthropicErr) {
        recordFailure('anthropic', anthropicErr);
      }
    }

    // 3. Try MCP sampling (client-side LLM)
    if (this._mcpServer?.server?.createMessage) {
      try {
        const samplingResult = await this._mcpServer.server.createMessage({
          messages: [{ role: 'user', content: { type: 'text', text: fullPrompt } }],
          maxTokens: options.maxTokens || 1024,
          includeContext: 'none',
        });
        // NOTE(review): `content` is read as a single block; an array of blocks
        // is also tolerated defensively — confirm against the SDK version in use.
        const content = samplingResult?.content;
        const block = Array.isArray(content)
          ? content.find((item) => item?.type === 'text')
          : content;
        const text = block?.text || '';
        if (text) return { text, provider: 'sampling' };
        recordFailure('sampling', new Error('sampling returned empty response'));
      } catch (samplingErr) {
        recordFailure('sampling', samplingErr);
      }
    }

    // 4. All fallbacks exhausted. The message is kept stable for callers that
    // match on it; the actual per-provider reasons travel in `cause`.
    throw new Error(
      'No LLM available: Ollama is not running, no API keys set (OPENAI_API_KEY / ANTHROPIC_API_KEY), and the MCP client does not support sampling.',
      { cause: new AggregateError(failures, 'All LLM providers failed') }
    );
  }

  /**
   * Check which LLM providers are available without making a completion call.
   *
   * Ollama is probed with a 3-second request to `/api/tags`; OpenAI and
   * Anthropic are reported from the presence of their API-key environment
   * variables; sampling is reported from the presence of
   * `mcpServer.server.createMessage`.
   *
   * @returns {Promise<{ ollama: boolean, openai: boolean, anthropic: boolean, sampling: boolean }>}
   */
  async probe() {
    const result = { ollama: false, openai: false, anthropic: false, sampling: false };

    try {
      const res = await fetch(`${OLLAMA_BASE_URL()}/api/tags`, { signal: AbortSignal.timeout(3000) });
      result.ollama = res.ok;
    } catch (_) { /* unavailable — leave ollama as false */ }

    result.openai = !!process.env.OPENAI_API_KEY;
    result.anthropic = !!process.env.ANTHROPIC_API_KEY;
    result.sampling = !!(this._mcpServer?.server?.createMessage);

    return result;
  }
}
|
|
@@ -60,8 +60,9 @@ const StealthConfigSchema = z.object({
|
|
|
60
60
|
export class StealthBrowserManager {
|
|
61
61
|
constructor(options = {}) {
|
|
62
62
|
this.browser = null;
|
|
63
|
+
this._maxContexts = parseInt(process.env.MAX_BROWSER_CONTEXTS || '10', 10);
|
|
63
64
|
this.contexts = new BrowserContextPool({
|
|
64
|
-
maxContexts:
|
|
65
|
+
maxContexts: this._maxContexts,
|
|
65
66
|
periodicRefreshAfter: 200,
|
|
66
67
|
closeIdleAfterMs: 30 * 60 * 1000,
|
|
67
68
|
waitTimeoutMs: 10_000,
|
|
@@ -69,6 +70,8 @@ export class StealthBrowserManager {
|
|
|
69
70
|
this.fingerprints.delete(contextId);
|
|
70
71
|
}
|
|
71
72
|
});
|
|
73
|
+
// D2.2: fingerprints Map is capped at _maxContexts to prevent unbounded growth.
|
|
74
|
+
// Oldest entries are evicted when the cap is exceeded (insertion order via Map).
|
|
72
75
|
this.fingerprints = new Map();
|
|
73
76
|
|
|
74
77
|
// Enhanced stealth components
|
|
@@ -377,7 +380,8 @@ export class StealthBrowserManager {
|
|
|
377
380
|
await this.applyAdvancedStealthConfigurations(context, validatedConfig, fingerprint);
|
|
378
381
|
|
|
379
382
|
await this.contexts.set(contextId, { context, fingerprint, config: validatedConfig });
|
|
380
|
-
|
|
383
|
+
// D2.2: enforce LRU cap on fingerprints Map
|
|
384
|
+
this._setFingerprint(contextId, fingerprint);
|
|
381
385
|
|
|
382
386
|
return { context, contextId, fingerprint };
|
|
383
387
|
}
|
|
@@ -1702,6 +1706,18 @@ export class StealthBrowserManager {
|
|
|
1702
1706
|
}
|
|
1703
1707
|
}
|
|
1704
1708
|
|
|
1709
|
+
/**
|
|
1710
|
+
* D2.2: LRU-capped fingerprint setter.
|
|
1711
|
+
* Evicts the oldest entry when the Map exceeds _maxContexts to prevent unbounded growth.
|
|
1712
|
+
*/
|
|
1713
|
+
_setFingerprint(contextId, fingerprint) {
|
|
1714
|
+
if (this.fingerprints.size >= this._maxContexts) {
|
|
1715
|
+
const oldestKey = this.fingerprints.keys().next().value;
|
|
1716
|
+
this.fingerprints.delete(oldestKey);
|
|
1717
|
+
}
|
|
1718
|
+
this.fingerprints.set(contextId, fingerprint);
|
|
1719
|
+
}
|
|
1720
|
+
|
|
1705
1721
|
/**
|
|
1706
1722
|
* Close all contexts and browser
|
|
1707
1723
|
*/
|
|
@@ -1800,4 +1816,234 @@ export class StealthBrowserManager {
|
|
|
1800
1816
|
}
|
|
1801
1817
|
}
|
|
1802
1818
|
|
|
1819
|
+
|
|
1820
|
+
|
|
1821
|
+
// ─── D3.2: BrowserEngine interface + CamoufoxAdapter ──────────────────────────
|
|
1822
|
+
//
|
|
1823
|
+
// Camoufox licensing note:
|
|
1824
|
+
// camoufox (github.com/daijro/camoufox) is MIT-licensed.
|
|
1825
|
+
// python-camoufox launcher is MPL-2.0. The JS bindings
|
|
1826
|
+
// (@camoufox/jsapi) are MIT. There are no AGPL forks in the
|
|
1827
|
+
// main distribution chain as of 2026-05. Always re-verify before
|
|
1828
|
+
// distributing: https://github.com/daijro/camoufox/blob/main/LICENSE
|
|
1829
|
+
//
|
|
1830
|
+
// Engine-selection criteria:
|
|
1831
|
+
// playwright — Chromium-based, fastest, best Playwright ecosystem support.
|
|
1832
|
+
// Good default for most sites.
|
|
1833
|
+
// camoufox — Firefox-based, patches browser internals to hide automation
|
|
1834
|
+
// markers at the C++ level, not via JS injection. Scores
|
|
1835
|
+
// significantly higher on CreepJS and Datadome than any
|
|
1836
|
+
// Playwright+stealth combination. Use when Playwright is
|
|
1837
|
+
// detected and blocked.
|
|
1838
|
+
//
|
|
1839
|
+
// Benchmark methodology (not run here — network-dependent):
|
|
1840
|
+
// 1. Open https://bot.sannysoft.com with each engine — count red indicators.
|
|
1841
|
+
// 2. Open https://nowsecure.nl with each engine — check "You are not a bot".
|
|
1842
|
+
// 3. Run https://abrahamjuliot.github.io/creepjs/ — compare trust score %.
|
|
1843
|
+
// 4. Use Datadome test page — verify challenge is not triggered.
|
|
1844
|
+
// All tests must be run with a clean incognito context and no extensions.
|
|
1845
|
+
|
|
1846
|
+
/**
 * BrowserEngine (D3.2) — abstract base for pluggable browser engines.
 *
 * Subclasses are expected to override:
 *   name()        → string identifier of the engine
 *   isAvailable() → Promise<boolean>, whether the engine can be used
 *   launch(config) → Promise resolving to a browser-like handle
 *
 * The base implementations of name() and launch() fail loudly; isAvailable()
 * reports `false`.
 */
export class BrowserEngine {
  /**
   * Engine identifier. Must be overridden.
   * @returns {string}
   */
  name() {
    const notImplemented = new Error('BrowserEngine.name() must be implemented');
    throw notImplemented;
  }

  /**
   * Whether this engine can be used. The base class is never available.
   * @returns {Promise<boolean>}
   */
  async isAvailable() {
    const available = false;
    return available;
  }

  /**
   * Start the engine. Must be overridden; the base version always rejects.
   * @param {object} _config
   * @returns {Promise<object>} browser-like handle
   */
  async launch(_config) {
    const notImplemented = new Error('BrowserEngine.launch() must be implemented');
    throw notImplemented;
  }
}
|
|
1866
|
+
|
|
1867
|
+
/**
 * CamoufoxAdapter — Firefox-based engine using the camoufox package.
 * The package is loaded lazily with a dynamic import, so this class can be
 * constructed even when camoufox is not installed.
 *
 * Install: npm install camoufox (MIT license)
 */
export class CamoufoxAdapter extends BrowserEngine {
  /** @returns {string} always 'camoufox' */
  name() { return 'camoufox'; }

  /**
   * Report whether the `camoufox` module can be imported.
   * @returns {Promise<boolean>} false when the dynamic import fails for any reason
   */
  async isAvailable() {
    try {
      await import('camoufox');
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Launch a camoufox browser.
   *
   * @param {object} [config]
   * @param {boolean} [config.headless] - Headless unless explicitly `false`.
   * @param {object} [config.launchOptions] - Spread last, so its keys
   *   (including `headless`) override the defaults above.
   * @returns {Promise<object>} whatever the package's `launch()` resolves to
   * @throws {Error} When the module cannot be imported (the import failure is
   *   attached as `cause`) or when it exposes no `launch` function.
   */
  async launch(config = {}) {
    let camoufox;
    try {
      camoufox = await import('camoufox');
    } catch (importErr) {
      // Keep the original failure: an import can fail for reasons other than
      // the package being absent (e.g. an error thrown while it loads).
      throw new Error(
        'camoufox is not installed. Run: npm install camoufox. Note: camoufox is MIT-licensed and requires Firefox to be installed.',
        { cause: importErr }
      );
    }

    // Prefer the named export; otherwise use the default export's method,
    // bound to its owner so a `this`-dependent implementation still works.
    let launchFn = null;
    if (typeof camoufox.launch === 'function') {
      launchFn = camoufox.launch;
    } else if (typeof camoufox.default?.launch === 'function') {
      launchFn = camoufox.default.launch.bind(camoufox.default);
    }
    if (!launchFn) {
      throw new Error('camoufox was imported but exposes no launch() function (checked named and default exports).');
    }

    const browser = await launchFn({
      headless: config.headless !== false,
      ...config.launchOptions
    });

    return browser;
  }
}
|
|
1904
|
+
|
|
1905
|
+
// ─── D3.4: BrowserBackend interface + backends ────────────────────────────────
|
|
1906
|
+
//
|
|
1907
|
+
// CRAWLFORGE_BROWSER_BACKEND=local → LocalPlaywrightBackend (default, current behavior)
|
|
1908
|
+
// CRAWLFORGE_BROWSER_BACKEND=browserbase → BrowserBaseBackend via CDP
|
|
1909
|
+
//
|
|
1910
|
+
// Graceful fallback: if BrowserBaseBackend fails to connect (no API key, network error,
|
|
1911
|
+
// quota exceeded), StealthBrowserManager.getBrowserBackend() falls back to local.
|
|
1912
|
+
|
|
1913
|
+
/**
 * BrowserBackend (D3.4) — abstract base for where the browser actually runs.
 *
 * Subclasses are expected to override:
 *   name()          → string identifier of the backend
 *   isConfigured()  → boolean, whether the backend has what it needs
 *   connect(config) → Promise resolving to a browser-like handle
 *   disconnect()    → Promise<void>, release anything connect() acquired
 *
 * The base name() and connect() fail loudly, isConfigured() reports `false`,
 * and disconnect() does nothing.
 */
export class BrowserBackend {
  /** @returns {string} */
  name() {
    const notImplemented = new Error('BrowserBackend.name() must be implemented');
    throw notImplemented;
  }

  /** @returns {boolean} the base class is never configured */
  isConfigured() {
    const configured = false;
    return configured;
  }

  /**
   * @param {object} _config
   * @returns {Promise<object>} browser-like handle; the base version always rejects
   */
  async connect(_config) {
    const notImplemented = new Error('BrowserBackend.connect() must be implemented');
    throw notImplemented;
  }

  /** @returns {Promise<void>} nothing to release in the base class */
  async disconnect() {
    return undefined;
  }
}
|
|
1927
|
+
|
|
1928
|
+
/**
 * LocalPlaywrightBackend — launches Playwright's Chromium on this machine.
 * This is the default backend (preserves all pre-D3.4 behavior).
 */
export class LocalPlaywrightBackend extends BrowserBackend {
  /** @returns {string} always 'local' */
  name() {
    return 'local';
  }

  /** @returns {boolean} always true — this backend reads no configuration */
  isConfigured() {
    return true;
  }

  /**
   * Launch a local Chromium instance.
   *
   * @param {object} [config]
   * @param {boolean} [config.headless] - Headless unless explicitly `false`.
   * @param {object} [config.launchOptions] - Spread last, so its keys override `headless`.
   * @returns {Promise<object>} the Playwright browser
   */
  async connect(config = {}) {
    // Loaded lazily so importing this module does not require Playwright.
    const playwright = await import('playwright');
    const launchSettings = {
      headless: config.headless !== false,
      ...config.launchOptions
    };
    return playwright.chromium.launch(launchSettings);
  }

  /** Nothing is held by this backend, so there is nothing to release. */
  async disconnect() {}
}
|
|
1946
|
+
|
|
1947
|
+
/**
 * BrowserBaseBackend — connects to BrowserBase cloud browser via CDP.
 *
 * Requirements:
 *   BROWSERBASE_API_KEY — your BrowserBase API key
 *   CRAWLFORGE_BROWSER_BACKEND=browserbase
 *
 * connect() creates a BrowserBase session over HTTP, then attaches Playwright
 * to the session's `connectUrl` over CDP. disconnect() asks BrowserBase to
 * delete the session created by the last connect().
 *
 * Docs: https://docs.browserbase.com/integrations/playwright
 */
export class BrowserBaseBackend extends BrowserBackend {
  constructor() {
    super();
    // Id of the session created by connect(); null when none is held.
    this._sessionId = null;
  }

  /** @returns {string} always 'browserbase' */
  name() { return 'browserbase'; }

  /** @returns {boolean} true when BROWSERBASE_API_KEY is set to a non-empty value */
  isConfigured() {
    return Boolean(process.env.BROWSERBASE_API_KEY);
  }

  /**
   * Create a BrowserBase session and connect Playwright to it.
   *
   * @param {object} [config]
   * @param {number} [config.timeout] - Milliseconds allowed for the session
   *   request and for the CDP connection; defaults to 30000.
   * @param {object} [config.sessionOptions] - Extra fields merged into the
   *   session-creation body (may override `projectId`).
   * @returns {Promise<object>} the Playwright browser connected over CDP
   * @throws {Error} When the API key is missing, session creation returns a
   *   non-2xx status, or the CDP connection fails.
   */
  async connect(config = {}) {
    const apiKey = process.env.BROWSERBASE_API_KEY;
    if (!apiKey) {
      throw new Error(
        'BrowserBase requires BROWSERBASE_API_KEY environment variable. ' +
        'Get your key at https://browserbase.com'
      );
    }

    const timeoutMs = config.timeout || 30000;

    // Create a BrowserBase session. Bounded by a timeout so an unreachable
    // API surfaces as an error instead of hanging the caller.
    const sessionRes = await fetch('https://www.browserbase.com/v1/sessions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'X-BB-API-Key': apiKey
      },
      body: JSON.stringify({
        projectId: process.env.BROWSERBASE_PROJECT_ID,
        ...config.sessionOptions
      }),
      signal: AbortSignal.timeout(timeoutMs)
    });

    if (!sessionRes.ok) {
      // Include the status and (truncated) response body so the failure is
      // diagnosable; the body read itself is best-effort.
      const err = await sessionRes.text().catch(() => '');
      const detail = err ? ` - ${err.slice(0, 500)}` : '';
      throw new Error(`BrowserBase session creation failed: HTTP ${sessionRes.status}${detail}`);
    }

    const session = await sessionRes.json();
    this._sessionId = session.id;

    try {
      // Connect Playwright over CDP
      const { chromium } = await import('playwright');
      const browser = await chromium.connectOverCDP(session.connectUrl, {
        timeout: timeoutMs
      });
      return browser;
    } catch (connectErr) {
      // The remote session already exists; release it so a failed connect
      // does not leave it running. disconnect() never throws.
      await this.disconnect();
      throw connectErr;
    }
  }

  /**
   * Delete the session created by connect(), if any. Best-effort: request
   * failures are ignored and the stored session id is always cleared.
   * @returns {Promise<void>}
   */
  async disconnect() {
    if (!this._sessionId) return;
    const apiKey = process.env.BROWSERBASE_API_KEY;
    if (!apiKey) return;

    try {
      await fetch(`https://www.browserbase.com/v1/sessions/${this._sessionId}`, {
        method: 'DELETE',
        headers: { 'X-BB-API-Key': apiKey },
        // Cleanup must not block the caller indefinitely.
        signal: AbortSignal.timeout(10_000)
      });
    } catch {
      // Non-fatal — session will expire on BrowserBase's side
    } finally {
      this._sessionId = null;
    }
  }
}
|
|
2028
|
+
|
|
2029
|
+
/**
 * Factory: resolve which BrowserBackend to use from CRAWLFORGE_BROWSER_BACKEND.
 *
 * The value is trimmed and compared case-insensitively:
 *   - unset, empty, or `local`  → LocalPlaywrightBackend
 *   - `browserbase`             → BrowserBaseBackend when BROWSERBASE_API_KEY
 *                                 is set; otherwise local, with a warning
 *   - anything else             → local, with a warning naming the bad value
 *
 * This function only selects a backend; it does not connect, so connection
 * failures are not handled here. Warnings go to stderr via console.error.
 *
 * @param {object} [options] - Accepted for interface compatibility; not read here.
 * @returns {BrowserBackend}
 */
export function resolveBrowserBackend(options = {}) {
  const requested = (process.env.CRAWLFORGE_BROWSER_BACKEND || 'local').trim().toLowerCase();

  if (requested === 'browserbase') {
    const bb = new BrowserBaseBackend();
    if (bb.isConfigured()) return bb;
    // BROWSERBASE_API_KEY not set — fall through to local
    console.error('[StealthBrowserManager] CRAWLFORGE_BROWSER_BACKEND=browserbase but BROWSERBASE_API_KEY is not set. Falling back to local Playwright.');
  } else if (requested !== 'local' && requested !== '') {
    // A typo here would otherwise silently select the local backend.
    console.error(`[StealthBrowserManager] Unknown CRAWLFORGE_BROWSER_BACKEND="${requested}". Supported values: local, browserbase. Falling back to local Playwright.`);
  }

  return new LocalPlaywrightBackend();
}
|
|
2048
|
+
|
|
1803
2049
|
export default StealthBrowserManager;
|
|
@@ -287,7 +287,9 @@ export class WebhookDispatcher extends EventEmitter {
|
|
|
287
287
|
this.processing = true;
|
|
288
288
|
|
|
289
289
|
try {
|
|
290
|
-
|
|
290
|
+
// D2.5: cap retry batch size to prevent a flood of retries overwhelming targets
|
|
291
|
+
const rawBatchSize = this.enableBatching ? this.batchSize : 1;
|
|
292
|
+
const batchSize = Math.min(rawBatchSize, 10); // never process more than 10 at once
|
|
291
293
|
const batch = this.queue.splice(0, batchSize);
|
|
292
294
|
|
|
293
295
|
if (this.enableBatching && batch.length > 1) {
|
|
@@ -328,14 +330,13 @@ export class WebhookDispatcher extends EventEmitter {
|
|
|
328
330
|
|
|
329
331
|
// Check if we should retry
|
|
330
332
|
if (event.attempts < this.maxRetries) {
|
|
331
|
-
//
|
|
332
|
-
const
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
);
|
|
333
|
+
// D2.5: per-webhook exponential backoff with jitter to prevent retry storms
|
|
334
|
+
const baseDelay = this.retryDelay * Math.pow(2, event.attempts - 1);
|
|
335
|
+
const jitter = Math.random() * Math.min(baseDelay * 0.25, 5000); // up to 25% or 5s
|
|
336
|
+
const delay = Math.min(baseDelay + jitter, 60000); // cap at 1 minute
|
|
336
337
|
|
|
337
338
|
setTimeout(() => {
|
|
338
|
-
this.queue.
|
|
339
|
+
this.queue.push(event); // push to back (not front) to avoid head-of-line blocking
|
|
339
340
|
}, delay);
|
|
340
341
|
|
|
341
342
|
this.emit('webhookRetry', event, error, delay);
|
|
@@ -503,9 +504,16 @@ export class WebhookDispatcher extends EventEmitter {
|
|
|
503
504
|
clearInterval(this.healthMonitoringTimer);
|
|
504
505
|
}
|
|
505
506
|
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
507
|
+
// D2.5: add jitter to health check interval to prevent synchronized storms
|
|
508
|
+
const scheduleNextHealthCheck = () => {
|
|
509
|
+
const jitter = Math.floor(Math.random() * Math.min(this.healthCheckInterval * 0.1, 10000));
|
|
510
|
+
this.healthMonitoringTimer = setTimeout(() => {
|
|
511
|
+
this.performHealthChecks().finally(() => {
|
|
512
|
+
if (this.healthMonitoringTimer !== null) scheduleNextHealthCheck();
|
|
513
|
+
});
|
|
514
|
+
}, this.healthCheckInterval + jitter);
|
|
515
|
+
};
|
|
516
|
+
scheduleNextHealthCheck();
|
|
509
517
|
}
|
|
510
518
|
|
|
511
519
|
/**
|