crawlforge-mcp-server 4.2.8 → 4.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/src/cli/commands/actions.js +4 -4
- package/src/cli/commands/llmstxt.js +4 -2
- package/src/cli/commands/map.js +6 -6
- package/src/cli/commands/monitor.js +27 -8
- package/src/cli/commands/research.js +7 -4
- package/src/cli/commands/track.js +22 -5
- package/src/cli/index.js +13 -0
- package/src/core/StealthBrowserManager.js +40 -3
- package/src/core/WebhookDispatcher.js +1 -1
- package/src/core/crawlers/BFSCrawler.js +3 -3
- package/src/tools/advanced/ScrapeWithActionsTool.js +2 -2
- package/src/tools/extract/extractContent.js +1 -1
- package/src/tools/extract/processDocument.js +1 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "4.2.8",
|
|
3
|
+
"version": "4.2.10",
|
|
4
4
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
|
|
5
5
|
"main": "server.js",
|
|
6
6
|
"bin": {
|
|
@@ -113,6 +113,7 @@
|
|
|
113
113
|
"playwright": "^1.54.2",
|
|
114
114
|
"robots-parser": "^3.0.1",
|
|
115
115
|
"turndown": "^7.2.4",
|
|
116
|
+
"undici": "^7.24.0",
|
|
116
117
|
"winston": "^3.11.0",
|
|
117
118
|
"zod": "^3.23.8"
|
|
118
119
|
},
|
package/src/cli/commands/actions.js
CHANGED
|
@@ -11,8 +11,7 @@ export function register(program) {
|
|
|
11
11
|
.command('actions <url>')
|
|
12
12
|
.description('Run browser automation actions against a URL')
|
|
13
13
|
.requiredOption('--script <file>', 'JSON file containing action script')
|
|
14
|
-
.option('--screenshot', 'Capture
|
|
15
|
-
.option('--wait <ms>', 'Wait time between actions in milliseconds', '500')
|
|
14
|
+
.option('--screenshot', 'Capture screenshots during action execution')
|
|
16
15
|
.action(async (url, opts, cmd) => {
|
|
17
16
|
const globals = cmd.parent.opts();
|
|
18
17
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
@@ -26,11 +25,12 @@ export function register(program) {
|
|
|
26
25
|
}
|
|
27
26
|
|
|
28
27
|
const tool = new ScrapeWithActionsTool(getToolConfig('scrape_with_actions'));
|
|
28
|
+
// ScrapeWithActionsSchema uses captureScreenshots (no between-action wait
|
|
29
|
+
// field — insert {type:'wait'} actions in the script for that).
|
|
29
30
|
await runTool(tool, {
|
|
30
31
|
url,
|
|
31
32
|
actions,
|
|
32
|
-
|
|
33
|
-
wait_between_actions: parseInt(opts.wait, 10)
|
|
33
|
+
captureScreenshots: !!opts.screenshot
|
|
34
34
|
}, cliFlags);
|
|
35
35
|
});
|
|
36
36
|
}
|
package/src/cli/commands/llmstxt.js
CHANGED
|
@@ -15,10 +15,12 @@ export function register(program) {
|
|
|
15
15
|
const globals = cmd.parent.opts();
|
|
16
16
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
17
17
|
const tool = new GenerateLLMsTxtTool(getToolConfig('generate_llms_txt'));
|
|
18
|
+
// GenerateLLMsTxtSchema expects: url, format ('both'|'llms-txt'|'llms-full-txt'),
|
|
19
|
+
// analysisOptions.maxPages.
|
|
18
20
|
await runTool(tool, {
|
|
19
21
|
url,
|
|
20
|
-
|
|
21
|
-
|
|
22
|
+
format: opts.includeFull ? 'both' : 'llms-txt',
|
|
23
|
+
analysisOptions: { maxPages: parseInt(opts.maxPages, 10) }
|
|
22
24
|
}, cliFlags);
|
|
23
25
|
});
|
|
24
26
|
}
|
package/src/cli/commands/map.js
CHANGED
|
@@ -9,18 +9,18 @@ export function register(program) {
|
|
|
9
9
|
program
|
|
10
10
|
.command('map <url>')
|
|
11
11
|
.description('Generate a sitemap for a website')
|
|
12
|
-
.option('--
|
|
13
|
-
.option('--
|
|
14
|
-
.option('--format <fmt>', 'Output format: json or xml', 'json')
|
|
12
|
+
.option('--max-pages <n>', 'Maximum URLs to discover', '500')
|
|
13
|
+
.option('--no-sitemap', 'Skip parsing sitemap.xml')
|
|
15
14
|
.action(async (url, opts, cmd) => {
|
|
16
15
|
const globals = cmd.parent.opts();
|
|
17
16
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
18
17
|
const tool = new MapSiteTool(getToolConfig('map_site'));
|
|
18
|
+
// MapSiteSchema expects: url, max_urls, include_sitemap.
|
|
19
|
+
// (map_site has no crawl-depth or xml/json output toggle.)
|
|
19
20
|
await runTool(tool, {
|
|
20
21
|
url,
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
output_format: opts.format
|
|
22
|
+
max_urls: parseInt(opts.maxPages, 10),
|
|
23
|
+
include_sitemap: opts.sitemap
|
|
24
24
|
}, cliFlags);
|
|
25
25
|
});
|
|
26
26
|
}
|
package/src/cli/commands/monitor.js
CHANGED
|
@@ -17,14 +17,33 @@ export function register(program) {
|
|
|
17
17
|
const globals = cmd.parent.opts();
|
|
18
18
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
19
19
|
const tool = new TrackChangesTool(getToolConfig('track_changes'));
|
|
20
|
-
|
|
21
|
-
|
|
20
|
+
|
|
21
|
+
// TrackChangesSchema shape: operation 'monitor' (setInterval poller);
|
|
22
|
+
// interval is ms (min 60s); selector → trackingOptions.customSelectors;
|
|
23
|
+
// threshold (%) → significanceThresholds; webhook → notificationOptions.webhook.
|
|
24
|
+
const t = Math.min(Math.max(parseFloat(opts.threshold) / 100, 0), 1);
|
|
25
|
+
const params = {
|
|
22
26
|
url,
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
trackingOptions: {
|
|
28
|
+
...(opts.selector ? { customSelectors: [opts.selector] } : {}),
|
|
29
|
+
significanceThresholds: { minor: t, moderate: Math.max(0.3, t), major: Math.max(0.7, t) }
|
|
30
|
+
},
|
|
31
|
+
monitoringOptions: {
|
|
32
|
+
enabled: true,
|
|
33
|
+
interval: Math.max(parseInt(opts.interval, 10), 60) * 1000
|
|
34
|
+
},
|
|
35
|
+
...(opts.webhook ? { notificationOptions: { webhook: { enabled: true, url: opts.webhook } } } : {})
|
|
36
|
+
};
|
|
37
|
+
|
|
38
|
+
// setupMonitoring polls compareWithBaseline, which needs a baseline; create
|
|
39
|
+
// one from the current page first so the monitor watches for changes from now.
|
|
40
|
+
const wrapperTool = {
|
|
41
|
+
execute: async (p) => {
|
|
42
|
+
await tool.execute({ ...p, operation: 'create_baseline' });
|
|
43
|
+
return await tool.execute({ ...p, operation: 'monitor' });
|
|
44
|
+
}
|
|
45
|
+
};
|
|
46
|
+
// monitor runs continuously — do not auto-exit after the first result.
|
|
47
|
+
await runTool(wrapperTool, params, cliFlags, { exitOnSuccess: false });
|
|
29
48
|
});
|
|
30
49
|
}
|
package/src/cli/commands/research.js
CHANGED
|
@@ -16,11 +16,14 @@ export function register(program) {
|
|
|
16
16
|
const globals = cmd.parent.opts();
|
|
17
17
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
18
18
|
const tool = new DeepResearchTool(getToolConfig('deep_research'));
|
|
19
|
+
// DeepResearchSchema expects: topic, maxDepth (1-10), maxUrls, outputFormat.
|
|
20
|
+
const depthMap = { basic: 2, standard: 5, deep: 8 };
|
|
21
|
+
const formatMap = { summary: 'summary', detailed: 'comprehensive' };
|
|
19
22
|
await runTool(tool, {
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
topic,
|
|
24
|
+
maxDepth: depthMap[opts.depth] ?? 5,
|
|
25
|
+
maxUrls: parseInt(opts.maxUrls, 10),
|
|
26
|
+
outputFormat: formatMap[opts.outputFormat] ?? 'summary'
|
|
24
27
|
}, cliFlags);
|
|
25
28
|
});
|
|
26
29
|
}
|
package/src/cli/commands/track.js
CHANGED
|
@@ -15,10 +15,27 @@ export function register(program) {
|
|
|
15
15
|
const globals = cmd.parent.opts();
|
|
16
16
|
const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
|
|
17
17
|
const tool = new TrackChangesTool(getToolConfig('track_changes'));
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
18
|
+
|
|
19
|
+
// TrackChangesSchema shape: selector → trackingOptions.customSelectors,
|
|
20
|
+
// threshold (%) → trackingOptions.significanceThresholds (0-1, ordered).
|
|
21
|
+
const t = Math.min(Math.max(parseFloat(opts.threshold) / 100, 0), 1);
|
|
22
|
+
const trackingOptions = {
|
|
23
|
+
...(opts.selector ? { customSelectors: [opts.selector] } : {}),
|
|
24
|
+
significanceThresholds: { minor: t, moderate: Math.max(0.3, t), major: Math.max(0.7, t) }
|
|
25
|
+
};
|
|
26
|
+
|
|
27
|
+
// `compare` throws "No baseline found" on first run — bootstrap one, then
|
|
28
|
+
// the next invocation reports actual changes against it.
|
|
29
|
+
const params = { url, trackingOptions };
|
|
30
|
+
const wrapperTool = {
|
|
31
|
+
execute: async (p) => {
|
|
32
|
+
const res = await tool.execute({ ...p, operation: 'compare' });
|
|
33
|
+
if (res && res.success === false && /No baseline/i.test(res.error || '')) {
|
|
34
|
+
return await tool.execute({ ...p, operation: 'create_baseline' });
|
|
35
|
+
}
|
|
36
|
+
return res;
|
|
37
|
+
}
|
|
38
|
+
};
|
|
39
|
+
await runTool(wrapperTool, params, cliFlags);
|
|
23
40
|
});
|
|
24
41
|
}
|
package/src/cli/index.js
CHANGED
|
@@ -16,6 +16,19 @@ import { createRequire } from 'node:module';
|
|
|
16
16
|
import { fileURLToPath } from 'node:url';
|
|
17
17
|
import { dirname, join } from 'node:path';
|
|
18
18
|
import { readFileSync } from 'node:fs';
|
|
19
|
+
import { setGlobalDispatcher, EnvHttpProxyAgent } from 'undici';
|
|
20
|
+
|
|
21
|
+
// Node's global fetch() (undici) ignores HTTP(S)_PROXY by default. When a proxy
|
|
22
|
+
// is configured — e.g. inside a sandbox that only permits egress through it —
|
|
23
|
+
// route every fetch() through it so the CLI's API/scrape calls succeed without
|
|
24
|
+
// excluding the command from the sandbox. EnvHttpProxyAgent honors HTTPS_PROXY,
|
|
25
|
+
// HTTP_PROXY and NO_PROXY itself; this is a no-op when none are set.
|
|
26
|
+
if (process.env.HTTPS_PROXY || process.env.HTTP_PROXY || process.env.ALL_PROXY ||
|
|
27
|
+
process.env.https_proxy || process.env.http_proxy || process.env.all_proxy) {
|
|
28
|
+
try {
|
|
29
|
+
setGlobalDispatcher(new EnvHttpProxyAgent());
|
|
30
|
+
} catch { /* proxy agent unavailable — fall back to direct connections */ }
|
|
31
|
+
}
|
|
19
32
|
|
|
20
33
|
// Load package.json for version
|
|
21
34
|
const __filename = fileURLToPath(import.meta.url);
|
package/src/core/StealthBrowserManager.js
CHANGED
|
@@ -1376,7 +1376,7 @@ export class StealthBrowserManager {
|
|
|
1376
1376
|
});
|
|
1377
1377
|
|
|
1378
1378
|
if (challengeDetected) {
|
|
1379
|
-
console.
|
|
1379
|
+
console.error('CloudFlare challenge detected, attempting bypass...');
|
|
1380
1380
|
|
|
1381
1381
|
// Simulate human behavior during challenge
|
|
1382
1382
|
if (this.humanBehaviorSimulator) {
|
|
@@ -1437,7 +1437,7 @@ export class StealthBrowserManager {
|
|
|
1437
1437
|
});
|
|
1438
1438
|
|
|
1439
1439
|
if (recaptchaDetected) {
|
|
1440
|
-
console.
|
|
1440
|
+
console.error('reCAPTCHA detected, implementing human behavior...');
|
|
1441
1441
|
|
|
1442
1442
|
// Simulate human inspection of the reCAPTCHA
|
|
1443
1443
|
if (this.humanBehaviorSimulator) {
|
|
@@ -1491,7 +1491,7 @@ export class StealthBrowserManager {
|
|
|
1491
1491
|
this.proxyManager.currentProxy = proxies[this.proxyManager.proxyIndex];
|
|
1492
1492
|
this.proxyManager.lastRotation = now;
|
|
1493
1493
|
|
|
1494
|
-
console.
|
|
1494
|
+
console.error('Rotated to proxy:', this.proxyManager.currentProxy);
|
|
1495
1495
|
}
|
|
1496
1496
|
|
|
1497
1497
|
return this.proxyManager.currentProxy;
|
|
@@ -1523,6 +1523,43 @@ export class StealthBrowserManager {
|
|
|
1523
1523
|
return page;
|
|
1524
1524
|
}
|
|
1525
1525
|
|
|
1526
|
+
/**
|
|
1527
|
+
* One-shot stealth scrape: create a context + page, navigate to the URL,
|
|
1528
|
+
* extract content, and tear the context down. Convenience wrapper over the
|
|
1529
|
+
* operation-based API (createStealthContext → createStealthPage → goto).
|
|
1530
|
+
*
|
|
1531
|
+
* @param {Object} params
|
|
1532
|
+
* @param {string} params.url — URL to scrape
|
|
1533
|
+
* @param {string} [params.engine] — browser engine (forwarded to config; playwright by default)
|
|
1534
|
+
* @param {number} [params.wait_for] — extra wait after load, in ms
|
|
1535
|
+
* @param {boolean} [params.screenshot] — capture a base64 PNG screenshot
|
|
1536
|
+
* @param {Object} [params.stealthConfig] — stealth configuration overrides
|
|
1537
|
+
* @returns {Promise<{success:boolean, url:string, title:string, text:string, html:string, screenshot:?string}>}
|
|
1538
|
+
*/
|
|
1539
|
+
async scrapeWithStealth({ url, engine, wait_for = 0, screenshot = false, stealthConfig = {} } = {}) {
|
|
1540
|
+
if (!url) throw new Error('scrapeWithStealth requires a url');
|
|
1541
|
+
|
|
1542
|
+
const { contextId } = await this.createStealthContext({ ...stealthConfig, engine });
|
|
1543
|
+
try {
|
|
1544
|
+
const page = await this.createStealthPage(contextId);
|
|
1545
|
+
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
|
1546
|
+
if (wait_for > 0) await page.waitForTimeout(wait_for);
|
|
1547
|
+
|
|
1548
|
+
const [title, html, text] = await Promise.all([
|
|
1549
|
+
page.title().catch(() => ''),
|
|
1550
|
+
page.content().catch(() => ''),
|
|
1551
|
+
page.innerText('body').catch(() => '')
|
|
1552
|
+
]);
|
|
1553
|
+
const shot = screenshot
|
|
1554
|
+
? await page.screenshot({ encoding: 'base64', fullPage: false }).catch(() => null)
|
|
1555
|
+
: null;
|
|
1556
|
+
|
|
1557
|
+
return { success: true, url, title, text, html, screenshot: shot };
|
|
1558
|
+
} finally {
|
|
1559
|
+
await this.closeContext(contextId).catch(() => {});
|
|
1560
|
+
}
|
|
1561
|
+
}
|
|
1562
|
+
|
|
1526
1563
|
/**
|
|
1527
1564
|
* Apply page-level stealth measures
|
|
1528
1565
|
*/
|
package/src/core/WebhookDispatcher.js
CHANGED
|
@@ -74,7 +74,7 @@ export class WebhookDispatcher extends EventEmitter {
|
|
|
74
74
|
onRetry: (error, attempt, delay, context) => {
|
|
75
75
|
this.stats.retriedDeliveries++;
|
|
76
76
|
if (this.enableLogging) {
|
|
77
|
-
console.
|
|
77
|
+
console.error('Webhook retry ' + attempt + ' for ' + context.url + ' after ' + delay + 'ms: ' + error.message);
|
|
78
78
|
}
|
|
79
79
|
}
|
|
80
80
|
});
|
package/src/core/crawlers/BFSCrawler.js
CHANGED
|
@@ -142,13 +142,13 @@ export class BFSCrawler {
|
|
|
142
142
|
});
|
|
143
143
|
|
|
144
144
|
if (!filterDecision.allowed) {
|
|
145
|
-
console.
|
|
145
|
+
console.error(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
|
|
146
146
|
return;
|
|
147
147
|
}
|
|
148
148
|
|
|
149
149
|
// Backward compatibility: also check legacy patterns
|
|
150
150
|
if (!this.shouldCrawlUrl(normalizedUrl)) {
|
|
151
|
-
console.
|
|
151
|
+
console.error(`Legacy pattern blocks: ${normalizedUrl}`);
|
|
152
152
|
return;
|
|
153
153
|
}
|
|
154
154
|
|
|
@@ -156,7 +156,7 @@ export class BFSCrawler {
|
|
|
156
156
|
if (this.respectRobots && this.robotsChecker) {
|
|
157
157
|
const canFetch = await this.robotsChecker.canFetch(normalizedUrl);
|
|
158
158
|
if (!canFetch) {
|
|
159
|
-
console.
|
|
159
|
+
console.error(`Robots.txt blocks: ${normalizedUrl}`);
|
|
160
160
|
return;
|
|
161
161
|
}
|
|
162
162
|
}
|
package/src/tools/advanced/ScrapeWithActionsTool.js
CHANGED
|
@@ -253,7 +253,7 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
253
253
|
const startTime = Date.now();
|
|
254
254
|
|
|
255
255
|
if (this.enableLogging) {
|
|
256
|
-
console.
|
|
256
|
+
console.error(`Starting scrape session ${sessionId} with ${validated.actions.length} actions on ${validated.url}`);
|
|
257
257
|
}
|
|
258
258
|
|
|
259
259
|
// Check concurrent sessions limit
|
|
@@ -734,7 +734,7 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
734
734
|
|
|
735
735
|
log(level, message) {
|
|
736
736
|
if (this.enableLogging) {
|
|
737
|
-
console.
|
|
737
|
+
console.error(`[ScrapeWithActionsTool:${level.toUpperCase()}] ${message}`);
|
|
738
738
|
}
|
|
739
739
|
}
|
|
740
740
|
|
package/src/tools/extract/extractContent.js
CHANGED
|
@@ -138,7 +138,7 @@ export class ExtractContentTool {
|
|
|
138
138
|
const shouldUseJavaScript = options.requiresJavaScript || await this.shouldUseJavaScript(url);
|
|
139
139
|
|
|
140
140
|
if (shouldUseJavaScript) {
|
|
141
|
-
console.
|
|
141
|
+
console.error('Using browser rendering for JavaScript content...');
|
|
142
142
|
const browserResult = await this.browserProcessor.processURL({
|
|
143
143
|
url,
|
|
144
144
|
options: {
|
package/src/tools/extract/processDocument.js
CHANGED
|
@@ -250,7 +250,7 @@ export class ProcessDocumentTool {
|
|
|
250
250
|
const shouldUseJavaScript = options.requiresJavaScript || await this.shouldUseJavaScript(source);
|
|
251
251
|
|
|
252
252
|
if (shouldUseJavaScript) {
|
|
253
|
-
console.
|
|
253
|
+
console.error('Using browser rendering for JavaScript content...');
|
|
254
254
|
const browserResult = await this.browserProcessor.processURL({
|
|
255
255
|
url: source,
|
|
256
256
|
options: {
|