crawlforge-mcp-server 4.7.1 → 4.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -2
- package/package.json +2 -1
- package/server.js +56 -10
- package/src/cli/commands/init.js +13 -2
- package/src/cli/commands/install-skills.js +10 -1
- package/src/cli/commands/monitor.js +81 -0
- package/src/cli/commands/uninstall-skills.js +10 -1
- package/src/core/ActionExecutor.js +81 -15
- package/src/core/ElicitationHelper.js +18 -5
- package/src/core/LLMsTxtAnalyzer.js +2 -1
- package/src/core/MonitorScheduler.js +281 -0
- package/src/core/MonitorStore.js +79 -0
- package/src/core/ResearchOrchestrator.js +2 -1
- package/src/core/crawlers/BFSCrawler.js +2 -1
- package/src/resources/ResourceRegistry.js +3 -0
- package/src/skills/agent-skills/crawlforge-batch-automation/SKILL.md +126 -0
- package/src/skills/agent-skills/crawlforge-batch-automation/references/actions.md +127 -0
- package/src/skills/agent-skills/crawlforge-change-tracking/SKILL.md +116 -0
- package/src/skills/agent-skills/crawlforge-deep-research/SKILL.md +108 -0
- package/src/skills/agent-skills/crawlforge-deep-research/references/workflows.md +76 -0
- package/src/skills/agent-skills/crawlforge-getting-started/SKILL.md +89 -0
- package/src/skills/agent-skills/crawlforge-getting-started/references/cli.md +71 -0
- package/src/skills/agent-skills/crawlforge-getting-started/references/credits.md +75 -0
- package/src/skills/agent-skills/crawlforge-stealth-browsing/SKILL.md +106 -0
- package/src/skills/agent-skills/crawlforge-stealth-browsing/references/engine-selection.md +63 -0
- package/src/skills/agent-skills/crawlforge-structured-extraction/SKILL.md +121 -0
- package/src/skills/agent-skills/crawlforge-structured-extraction/references/templates.md +39 -0
- package/src/skills/agent-skills/crawlforge-web-scraping/SKILL.md +141 -0
- package/src/skills/agent-skills/crawlforge-web-scraping/references/tool-reference.md +95 -0
- package/src/skills/installer.js +186 -34
- package/src/tools/advanced/ScrapeWithActionsTool.js +7 -0
- package/src/tools/advanced/batchScrape/worker.js +8 -2
- package/src/tools/basic/_fetch.js +14 -1
- package/src/tools/crawl/_sessionContext.js +3 -1
- package/src/tools/extract/_fetchAndParse.js +2 -1
- package/src/tools/extract/extractContent.js +2 -1
- package/src/tools/extract/extractStructured.js +43 -0
- package/src/tools/extract/processDocument.js +2 -1
- package/src/tools/scrape/_brandingExtractor.js +378 -0
- package/src/tools/scrape/unifiedScrape.js +66 -6
- package/src/tools/templates/ScrapeTemplateTool.js +2 -1
- package/src/tools/tracking/trackChanges/differ.js +3 -1
- package/src/tools/tracking/trackChanges/index.js +74 -21
- package/src/tools/tracking/trackChanges/schema.js +7 -2
- package/src/utils/hostRateLimiter.js +46 -0
- package/src/utils/robotsChecker.js +2 -1
- package/src/utils/sitemapParser.js +2 -1
- package/src/utils/ssrfGuard.js +161 -0
- package/src/utils/ssrfProtection.js +6 -9
- package/src/skills/crawlforge-cli.md +0 -157
- package/src/skills/crawlforge-mcp.md +0 -80
- package/src/skills/crawlforge-research.md +0 -104
- package/src/skills/crawlforge-stealth.md +0 -98
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
import { z } from 'zod';
|
|
8
8
|
|
|
9
9
|
export const TrackChangesSchema = z.object({
|
|
10
|
-
url: z.string().url(),
|
|
10
|
+
url: z.string().url().optional(),
|
|
11
11
|
operation: z.enum([
|
|
12
12
|
'create_baseline',
|
|
13
13
|
'compare',
|
|
@@ -16,6 +16,7 @@ export const TrackChangesSchema = z.object({
|
|
|
16
16
|
'get_stats',
|
|
17
17
|
'create_scheduled_monitor',
|
|
18
18
|
'stop_scheduled_monitor',
|
|
19
|
+
'list_scheduled_monitors',
|
|
19
20
|
'get_dashboard',
|
|
20
21
|
'export_history',
|
|
21
22
|
'create_alert_rule',
|
|
@@ -100,7 +101,11 @@ export const TrackChangesSchema = z.object({
|
|
|
100
101
|
scheduledMonitorOptions: z.object({
|
|
101
102
|
schedule: z.string().optional(),
|
|
102
103
|
templateId: z.string().optional(),
|
|
103
|
-
enabled: z.boolean().default(true)
|
|
104
|
+
enabled: z.boolean().default(true),
|
|
105
|
+
interval: z.number().min(60000).optional(),
|
|
106
|
+
goal: z.string().optional(),
|
|
107
|
+
monitorId: z.string().optional(),
|
|
108
|
+
notificationThreshold: z.enum(['minor', 'moderate', 'major', 'critical']).optional()
|
|
104
109
|
}).optional(),
|
|
105
110
|
|
|
106
111
|
alertRuleOptions: z.object({
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared per-host outbound rate limiter (politeness / abuse protection).
|
|
3
|
+
*
|
|
4
|
+
* Throttles outbound scraping requests per target host so a single tool call
|
|
5
|
+
* (batch_scrape, map_site, the basic fetch path, etc.) cannot hammer one origin.
|
|
6
|
+
* Mirrors the per-domain limiter BFSCrawler already uses, driven by the shared
|
|
7
|
+
* config so all paths agree on a default.
|
|
8
|
+
*
|
|
9
|
+
* Backwards-compatible: default 10 req/s + 100 req/min per host (the existing
|
|
10
|
+
* effective behaviour), enabled by RATE_LIMIT_PER_DOMAIN (default true). Setting
|
|
11
|
+
* RATE_LIMIT_PER_DOMAIN=false disables the throttle entirely — there is no global
|
|
12
|
+
* cross-host cap, so broad multi-host crawls are never slowed by this.
|
|
13
|
+
*/
|
|
14
|
+
import { RateLimiter } from './rateLimiter.js';
|
|
15
|
+
import { config } from '../constants/config.js';
|
|
16
|
+
|
|
17
|
+
// Module-level singleton; stays null until the first throttled request.
let _limiter = null;

/**
 * Returns the shared per-host RateLimiter, building it on first use from the
 * `config.rateLimit` per-second / per-minute settings with per-domain
 * bucketing switched on.
 * @returns {RateLimiter}
 */
function limiter() {
  if (_limiter) return _limiter;

  const { requestsPerSecond, requestsPerMinute } = config.rateLimit;
  _limiter = new RateLimiter({
    requestsPerSecond,
    requestsPerMinute,
    perDomain: true,
  });
  return _limiter;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Wait (if necessary) until another request to this URL's host is allowed.
 *
 * Best-effort by design: this function never throws. Any failure — including
 * a missing `config.rateLimit` section or an error raised by the limiter —
 * is swallowed so that throttling can never block a legitimate fetch.
 *
 * @param {string} url - Target URL; passed as-is to the limiter's `checkLimit`.
 * @returns {Promise<void>}
 */
export async function throttleHost(url) {
  try {
    // The config read lives inside the try (and uses `?.`) so the
    // "never throws" contract also covers a partially populated config.
    if (config.rateLimit?.perDomain === false) return; // feature disabled
    await limiter().checkLimit(url);
  } catch {
    /* never block a fetch on a limiter error */
  }
}
|
|
42
|
+
|
|
43
|
+
/**
 * Test/diagnostic hook: discards the cached limiter so the next call to
 * `limiter()` constructs a fresh one from the current config.
 */
export function _resetHostRateLimiter() {
  _limiter = null;
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import robotsParser from 'robots-parser';
|
|
2
|
+
import { safeFetch } from './ssrfGuard.js';
|
|
2
3
|
|
|
3
4
|
export class RobotsChecker {
|
|
4
5
|
constructor(userAgent = 'CrawlForge/1.0') {
|
|
@@ -32,7 +33,7 @@ export class RobotsChecker {
|
|
|
32
33
|
const controller = new AbortController();
|
|
33
34
|
const timeoutId = setTimeout(() => controller.abort(), 5000);
|
|
34
35
|
|
|
35
|
-
const response = await
|
|
36
|
+
const response = await safeFetch(robotsUrl, {
|
|
36
37
|
signal: controller.signal,
|
|
37
38
|
headers: {
|
|
38
39
|
'User-Agent': this.userAgent
|
|
@@ -3,6 +3,7 @@ import zlib from 'zlib';
|
|
|
3
3
|
import { promisify } from 'util';
|
|
4
4
|
import { CacheManager } from '../core/cache/CacheManager.js';
|
|
5
5
|
import { normalizeUrl } from './urlNormalizer.js';
|
|
6
|
+
import { safeFetch } from './ssrfGuard.js';
|
|
6
7
|
|
|
7
8
|
const gunzip = promisify(zlib.gunzip);
|
|
8
9
|
|
|
@@ -632,7 +633,7 @@ export class SitemapParser {
|
|
|
632
633
|
const timeoutId = setTimeout(() => controller.abort(), this.timeout);
|
|
633
634
|
|
|
634
635
|
try {
|
|
635
|
-
const response = await
|
|
636
|
+
const response = await safeFetch(url, {
|
|
636
637
|
signal: controller.signal,
|
|
637
638
|
headers: {
|
|
638
639
|
'User-Agent': this.userAgent,
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SSRF guard for the live outbound fetch path.
|
|
3
|
+
*
|
|
4
|
+
* This wires the (previously unused) SSRF protections into the actual scraping
|
|
5
|
+
* fetch helpers. Enforcement happens at TCP connect time via a custom undici
|
|
6
|
+
* dispatcher `lookup`, so it covers the initial request, every redirect hop, and
|
|
7
|
+
* closes the DNS-rebinding window (the validated IP is the one connected to —
|
|
8
|
+
* there is no second, unchecked resolution).
|
|
9
|
+
*
|
|
10
|
+
* Two levels:
|
|
11
|
+
* - Stage 1 (default): blocks connections to loopback, link-local /
|
|
12
|
+
* cloud-metadata (169.254.0.0/16, incl. 169.254.169.254), and 0.0.0.0.
|
|
13
|
+
* These are never legitimate public-scrape targets, so impact is ~zero.
|
|
14
|
+
* - Stage 2 (SSRF_STRICT=true): full private-range enforcement (RFC1918, ULA,
|
|
15
|
+
* multicast, CGNAT, etc.) via the existing SSRFProtection range logic.
|
|
16
|
+
*
|
|
17
|
+
* Controls (backwards-compatible defaults):
|
|
18
|
+
* - SSRF_PROTECTION_ENABLED=false -> disable the guard entirely (kill switch).
|
|
19
|
+
* - ALLOWED_DOMAINS=a.com,b.com -> bypass the guard for trusted hosts
|
|
20
|
+
* (e.g. a local dev server at localhost). Matches host or any subdomain.
|
|
21
|
+
* - SSRF_STRICT=true -> Stage 2 full enforcement.
|
|
22
|
+
*/
|
|
23
|
+
import dns from 'node:dns';
|
|
24
|
+
import { Agent } from 'undici';
|
|
25
|
+
import { config } from '../constants/config.js';
|
|
26
|
+
import { SSRFProtection } from './ssrfProtection.js';
|
|
27
|
+
|
|
28
|
+
// Shared SSRFProtection instance, used here purely for its CIDR range helpers
// (isIPAllowed / isIPInRange) — no network state.
const _ssrf = new SSRFProtection();

// Stage-1 block list: ranges that no legitimate public scrape ever targets
// (loopback, link-local incl. cloud metadata, 0.0.0.0/8, and IPv6 equivalents).
const STAGE1_RANGES = [
  '127.0.0.0/8',
  '169.254.0.0/16',
  '0.0.0.0/8',
  '::1/128',
  'fe80::/10',
];

// Cloud-metadata / service-discovery hostnames rejected by name, before DNS.
const METADATA_HOSTS = new Set([
  'metadata.google.internal',
  'metadata.azure.com',
  'metadata',
]);
|
|
36
|
+
|
|
37
|
+
/**
 * Reports whether Stage-2 (strict) enforcement is requested.
 * Read from the environment on every call; only the exact string 'true' counts.
 * @returns {boolean}
 */
function strictMode() {
  const { SSRF_STRICT: flag } = process.env;
  return flag === 'true';
}
|
|
40
|
+
|
|
41
|
+
/**
 * Decides whether a resolved IP address must be refused in the current mode.
 *
 * - Strict mode: blocked unless `SSRFProtection.isIPAllowed` accepts it.
 * - Default mode: blocked when it is one of the literal loopback / unspecified
 *   addresses or falls inside any of the STAGE1_RANGES CIDR blocks.
 *
 * @param {string} ip - Address to test.
 * @returns {boolean} true when the address must not be connected to.
 */
export function ipBlocked(ip) {
  if (strictMode()) {
    // Full enforcement is delegated entirely to SSRFProtection.
    return !_ssrf.isIPAllowed(ip);
  }

  // Cheap exact matches first, before any CIDR math.
  const exactMatches = ['127.0.0.1', '::1', '0.0.0.0'];
  if (exactMatches.includes(ip)) return true;

  for (const cidr of STAGE1_RANGES) {
    if (_ssrf.isIPInRange(ip, cidr)) return true;
  }
  return false;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Connect-time DNS lookup handed to undici (`connect.lookup`).
 *
 * Resolves every address for `hostname` and fails with an `SSRF_BLOCKED`
 * error if ANY of them is blocked by `ipBlocked`. Otherwise the validated
 * address(es) are handed back, so the socket connects to exactly what was
 * checked (rebinding-safe).
 *
 * @param {string} hostname - Host being connected to.
 * @param {{ all?: boolean }} [opts] - Caller's lookup options; only `all` is
 *   consulted, to choose the callback shape.
 * @param {Function} callback - Node-style lookup callback:
 *   `(err, addresses)` when `opts.all` is set, else `(err, address, family)`.
 */
function ssrfLookup(hostname, opts, callback) {
  dns.lookup(hostname, { all: true, verbatim: true }, (err, addresses) => {
    if (err) return callback(err);

    // An empty result would otherwise crash on `addresses[0].address` inside
    // this DNS callback; surface it as an ordinary resolution failure instead.
    if (!Array.isArray(addresses) || addresses.length === 0) {
      return callback(
        Object.assign(new Error(`getaddrinfo ENOTFOUND ${hostname}`), {
          code: 'ENOTFOUND',
          hostname,
        })
      );
    }

    // One bad record poisons the whole answer: reject if any address is blocked.
    const blocked = addresses.find(({ address }) => ipBlocked(address));
    if (blocked) {
      return callback(
        Object.assign(
          new Error(`SSRF Protection: ${hostname} resolves to blocked address ${blocked.address}`),
          { code: 'SSRF_BLOCKED' }
        )
      );
    }

    if (opts && opts.all) return callback(null, addresses);
    const [first] = addresses;
    callback(null, first.address, first.family);
  });
}
|
|
78
|
+
|
|
79
|
+
// Module-level undici Agent, created on first use and then reused.
let _agent = null;

/**
 * Returns the shared Agent whose connections resolve hosts through
 * `ssrfLookup`, constructing it the first time it is needed.
 * @returns {Agent}
 */
function guardedDispatcher() {
  if (_agent) return _agent;

  _agent = new Agent({ connect: { lookup: ssrfLookup } });
  return _agent;
}
|
|
86
|
+
|
|
87
|
+
/**
 * Tests whether `host` is covered by the allowlist: it matches when it equals
 * an entry or is a subdomain of one (`api.example.com` matches `example.com`;
 * `badexample.com` does not).
 *
 * Matching is case-insensitive and ignores surrounding whitespace. Blank
 * entries never match.
 *
 * @param {string} host - Hostname to test.
 * @param {string[]|string|null|undefined} allowed - Allowlist entries, either
 *   as an array or as a comma-separated string (`"a.com,b.com"`). Any other
 *   value is treated as an empty list.
 * @returns {boolean}
 */
function isAllowlisted(host, allowed) {
  const needle = String(host ?? '').trim().toLowerCase();
  if (!needle) return false;

  let entries = [];
  if (typeof allowed === 'string') {
    // Tolerate the raw env-var form instead of throwing on `.some`.
    entries = allowed.split(',');
  } else if (Array.isArray(allowed)) {
    entries = allowed;
  }

  return entries.some((entry) => {
    const domain = String(entry).trim().toLowerCase();
    // The leading '.' keeps "badexample.com" from matching "example.com".
    return domain !== '' && (needle === domain || needle.endsWith(`.${domain}`));
  });
}
|
|
93
|
+
|
|
94
|
+
/**
 * Pre-flight check + dispatcher selection for an outbound scrape target.
 *
 * Returns `{ dispatcher }` to spread into fetch options. `dispatcher` is
 * undefined (an empty object is returned) when the guard is disabled, the URL
 * cannot be parsed, or the host is explicitly allowlisted.
 *
 * Pre-flight rejections (all thrown with `code: 'SSRF_BLOCKED'`):
 *   - any protocol other than http: / https:
 *   - a known cloud-metadata hostname (with or without a trailing dot)
 *   - an IP-literal host that `ipBlocked` refuses
 *
 * @param {string} url
 * @returns {{ dispatcher?: import('undici').Agent }}
 * @throws {Error} with `code === 'SSRF_BLOCKED'` for the violations above.
 */
export function ssrfGuard(url) {
  const sec = config.security?.ssrfProtection;
  if (!sec || sec.enabled === false) return {}; // kill switch -> default fetch behavior

  let u;
  try {
    u = new URL(url);
  } catch {
    return {}; // let fetch surface its own invalid-URL error
  }

  if (!['http:', 'https:'].includes(u.protocol)) {
    throw Object.assign(new Error(`SSRF Protection: protocol '${u.protocol}' is not allowed`), {
      code: 'SSRF_BLOCKED',
    });
  }

  const host = u.hostname.toLowerCase();
  if (isAllowlisted(host, sec.allowedDomains)) return {}; // explicit escape hatch

  // "metadata.google.internal." names the same host as the dot-less form, so
  // drop one trailing dot before consulting the set.
  const bareHost = host.endsWith('.') ? host.slice(0, -1) : host;
  if (METADATA_HOSTS.has(bareHost)) {
    throw Object.assign(new Error(`SSRF Protection: blocked metadata host '${host}'`), {
      code: 'SSRF_BLOCKED',
    });
  }

  // IP-literal hosts must be checked here: Node connects straight to an
  // address and does not call the custom `lookup`, so the connect-time check
  // never sees them. The URL parser has already normalised the host to
  // dotted-decimal IPv4 or bracketed IPv6.
  let literal = null;
  if (host.startsWith('[') && host.endsWith(']')) {
    literal = host.slice(1, -1);
  } else if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(host)) {
    literal = host;
  }
  if (literal !== null && ipBlocked(literal)) {
    throw Object.assign(new Error(`SSRF Protection: blocked address ${literal}`), {
      code: 'SSRF_BLOCKED',
    });
  }
  // NOTE(review): a redirect whose Location is a blocked IP literal would still
  // skip both checks — presumably needs `redirect: 'manual'` handling; confirm.

  return { dispatcher: guardedDispatcher() };
}
|
|
131
|
+
|
|
132
|
+
/**
 * Tells whether an error originated from the SSRF guard, either directly or
 * as the `cause` of a wrapping error (as fetch does for connect failures).
 * @param {unknown} err
 * @returns {boolean}
 */
export function isSsrfError(err) {
  const marker = 'SSRF_BLOCKED';
  if (err?.code === marker) return true;
  return err?.cause?.code === marker;
}
|
|
136
|
+
|
|
137
|
+
/**
 * Drop-in replacement for global `fetch` that applies the SSRF guard.
 *
 * Behaviour-preserving for allowed URLs: all options pass through unchanged and
 * the native Response is returned; only a guarded dispatcher is injected (it
 * overrides any `dispatcher` supplied in `options`).
 *
 * Blocked targets always surface as an Error whose message starts with
 * `SSRF Protection: ...` and whose `code` is `'SSRF_BLOCKED'`, whether the
 * rejection happened pre-flight or at connect time, so `isSsrfError()` holds
 * for both. Connect-time rejections keep the original fetch error as `cause`.
 *
 * @param {string|URL|Request} url - Target; a Request is guarded via its `.url`.
 * @param {RequestInit} [options]
 * @returns {Promise<Response>}
 * @throws {Error} with `code === 'SSRF_BLOCKED'` when the target is blocked;
 *   any other fetch error is rethrown untouched.
 */
export async function safeFetch(url, options = {}) {
  // A Request object stringifies to "[object Request]", which would fail URL
  // parsing and silently skip the guard — read its `.url` instead.
  const target = typeof url === 'string' ? url : (url?.url ?? String(url));
  const guard = ssrfGuard(target); // throws on pre-flight violations

  try {
    return await fetch(url, { ...options, ...guard });
  } catch (err) {
    if (isSsrfError(err)) {
      // Lift the guard's message to the top level, but keep `code` and the
      // original error so callers can still classify and debug it.
      throw Object.assign(new Error(err.cause?.message || err.message, { cause: err }), {
        code: 'SSRF_BLOCKED',
      });
    }
    throw err;
  }
}
|
|
159
|
+
|
|
160
|
+
// Internal helpers re-exported so unit tests can reach them.
export const __ssrfInternals = {
  ssrfLookup,
  isAllowlisted,
  STAGE1_RANGES,
};
|
|
@@ -414,14 +414,7 @@ export class SSRFProtection {
|
|
|
414
414
|
return false;
|
|
415
415
|
}
|
|
416
416
|
}
|
|
417
|
-
/**
|
|
418
417
|
|
|
419
|
-
/**
|
|
420
|
-
* Check for path traversal patterns in raw URL before parsing
|
|
421
|
-
* @param {string} url - Raw URL to check
|
|
422
|
-
* @returns {Object} - Result with violations array
|
|
423
|
-
*/
|
|
424
|
-
* Validate URL path for suspicious patterns
|
|
425
418
|
/**
|
|
426
419
|
* Check for path traversal patterns in raw URL before parsing
|
|
427
420
|
* @param {string} url - Raw URL to check
|
|
@@ -454,9 +447,13 @@ export class SSRFProtection {
|
|
|
454
447
|
|
|
455
448
|
return { violations };
|
|
456
449
|
}
|
|
457
|
-
|
|
450
|
+
|
|
451
|
+
/**
|
|
452
|
+
* Validate URL path for suspicious patterns
|
|
453
|
+
* @param {string} path
|
|
458
454
|
* @returns {Object}
|
|
459
|
-
*/
|
|
455
|
+
*/
|
|
456
|
+
validatePath(path) {
|
|
460
457
|
const suspiciousPatterns = [
|
|
461
458
|
/\.\.\//, // Directory traversal
|
|
462
459
|
/\/etc\//, // System files
|
|
@@ -1,157 +0,0 @@
|
|
|
1
|
-
# CrawlForge CLI Usage Guide
|
|
2
|
-
|
|
3
|
-
The `crawlforge` CLI exposes all 23 MCP tools as command-line subcommands.
|
|
4
|
-
|
|
5
|
-
## Installation
|
|
6
|
-
|
|
7
|
-
```bash
|
|
8
|
-
npm install -g crawlforge-mcp-server
|
|
9
|
-
# or run without installing:
|
|
10
|
-
npx crawlforge-mcp-server <command>
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
## Global Flags
|
|
14
|
-
|
|
15
|
-
All commands support these flags:
|
|
16
|
-
- `--json` — output compact JSON
|
|
17
|
-
- `--pretty` — output pretty-printed JSON
|
|
18
|
-
- `--quiet` — suppress output (exit code only)
|
|
19
|
-
- `--api-key <key>` — override CRAWLFORGE_API_KEY env var
|
|
20
|
-
- `--timeout <ms>` — global request timeout (default: 30000)
|
|
21
|
-
|
|
22
|
-
## Commands
|
|
23
|
-
|
|
24
|
-
### scrape — Fetch a URL
|
|
25
|
-
```bash
|
|
26
|
-
crawlforge scrape https://example.com
|
|
27
|
-
crawlforge scrape https://example.com --extract --format markdown
|
|
28
|
-
crawlforge scrape https://example.com --pretty
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
### search — Search the web
|
|
32
|
-
```bash
|
|
33
|
-
crawlforge search "MCP server tutorial" --limit 10
|
|
34
|
-
crawlforge search "nodejs scraping" --provider searxng --json
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
### crawl — Deep website crawl
|
|
38
|
-
```bash
|
|
39
|
-
crawlforge crawl https://docs.example.com --depth 3 --max-pages 200
|
|
40
|
-
crawlforge crawl https://example.com --no-robots --concurrency 20
|
|
41
|
-
```
|
|
42
|
-
|
|
43
|
-
### map — Generate sitemap
|
|
44
|
-
```bash
|
|
45
|
-
crawlforge map https://example.com --pretty
|
|
46
|
-
crawlforge map https://example.com --format xml > sitemap.xml
|
|
47
|
-
```
|
|
48
|
-
|
|
49
|
-
### extract — Structured data extraction
|
|
50
|
-
```bash
|
|
51
|
-
# Schema-based extraction
|
|
52
|
-
crawlforge extract https://example.com/product --schema product-schema.json
|
|
53
|
-
|
|
54
|
-
# LLM-guided extraction
|
|
55
|
-
crawlforge extract https://example.com/article --prompt "extract title, author, date, summary"
|
|
56
|
-
```
|
|
57
|
-
|
|
58
|
-
### track — Track content changes
|
|
59
|
-
```bash
|
|
60
|
-
crawlforge track https://example.com --threshold 10
|
|
61
|
-
crawlforge track https://example.com --selector ".main-content"
|
|
62
|
-
```
|
|
63
|
-
|
|
64
|
-
### analyze — Content analysis
|
|
65
|
-
```bash
|
|
66
|
-
crawlforge analyze https://example.com --depth full --pretty
|
|
67
|
-
```
|
|
68
|
-
|
|
69
|
-
### research — Deep research
|
|
70
|
-
```bash
|
|
71
|
-
crawlforge research "state of AI in 2025" --depth deep --max-urls 30
|
|
72
|
-
crawlforge research "competitor pricing" --output-format detailed --json
|
|
73
|
-
```
|
|
74
|
-
|
|
75
|
-
### stealth — Anti-bot scraping
|
|
76
|
-
```bash
|
|
77
|
-
crawlforge stealth https://protected-site.com
|
|
78
|
-
crawlforge stealth https://protected-site.com --engine camoufox --screenshot
|
|
79
|
-
```
|
|
80
|
-
|
|
81
|
-
### batch — Batch scrape from file
|
|
82
|
-
```bash
|
|
83
|
-
# Create a URLs file:
|
|
84
|
-
cat > urls.txt << EOF
|
|
85
|
-
https://example.com/page1
|
|
86
|
-
https://example.com/page2
|
|
87
|
-
https://example.com/page3
|
|
88
|
-
EOF
|
|
89
|
-
|
|
90
|
-
crawlforge batch urls.txt --format markdown --concurrency 10
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
### actions — Browser automation
|
|
94
|
-
```bash
|
|
95
|
-
# Create an actions script:
|
|
96
|
-
cat > login.json << EOF
|
|
97
|
-
[
|
|
98
|
-
{ "type": "click", "selector": "#login-btn" },
|
|
99
|
-
{ "type": "type", "selector": "#email", "text": "user@example.com" },
|
|
100
|
-
{ "type": "wait", "duration": 1000 }
|
|
101
|
-
]
|
|
102
|
-
EOF
|
|
103
|
-
|
|
104
|
-
crawlforge actions https://example.com --script login.json --screenshot
|
|
105
|
-
```
|
|
106
|
-
|
|
107
|
-
### localize — Geo-targeted fetch
|
|
108
|
-
```bash
|
|
109
|
-
crawlforge localize https://example.com --locale fr-FR --country FR
|
|
110
|
-
crawlforge localize https://shop.example.com --locale en-GB --currency GBP
|
|
111
|
-
```
|
|
112
|
-
|
|
113
|
-
### llmstxt — Generate llms.txt
|
|
114
|
-
```bash
|
|
115
|
-
crawlforge llmstxt https://example.com
|
|
116
|
-
crawlforge llmstxt https://example.com --include-full > llms.txt
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
### template — Pre-built site scrapers
|
|
120
|
-
```bash
|
|
121
|
-
crawlforge template github-repo https://github.com/owner/repo
|
|
122
|
-
crawlforge template amazon-product https://amazon.com/dp/B0XXXXX
|
|
123
|
-
crawlforge template npm-package https://npmjs.com/package/commander
|
|
124
|
-
crawlforge template --list # list all available templates
|
|
125
|
-
```
|
|
126
|
-
|
|
127
|
-
### monitor — Continuous change monitoring
|
|
128
|
-
```bash
|
|
129
|
-
crawlforge monitor https://example.com --interval 60 --webhook https://my-site.com/hook
|
|
130
|
-
crawlforge monitor https://example.com --selector ".price" --threshold 1
|
|
131
|
-
```
|
|
132
|
-
|
|
133
|
-
### install-skills — Install AI assistant skills
|
|
134
|
-
```bash
|
|
135
|
-
crawlforge install-skills --target claude-code
|
|
136
|
-
crawlforge install-skills --target cursor --force
|
|
137
|
-
crawlforge install-skills --target all --dry-run
|
|
138
|
-
```
|
|
139
|
-
|
|
140
|
-
### uninstall-skills — Remove AI assistant skills
|
|
141
|
-
```bash
|
|
142
|
-
crawlforge uninstall-skills --target claude-code
|
|
143
|
-
crawlforge uninstall-skills --target all
|
|
144
|
-
```
|
|
145
|
-
|
|
146
|
-
## Output Piping Examples
|
|
147
|
-
|
|
148
|
-
```bash
|
|
149
|
-
# Extract markdown and save to file
|
|
150
|
-
crawlforge scrape https://example.com --extract --format markdown > page.md
|
|
151
|
-
|
|
152
|
-
# Search and parse with jq
|
|
153
|
-
crawlforge search "nodejs MCP" --json | jq '.results[].url'
|
|
154
|
-
|
|
155
|
-
# Batch scrape and process results
|
|
156
|
-
crawlforge batch urls.txt --json | jq '.results | length'
|
|
157
|
-
```
|
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
# CrawlForge MCP Tools — When and How to Use
|
|
2
|
-
|
|
3
|
-
CrawlForge is a professional MCP server with 23 tools for web scraping, crawling, content extraction, research, and AI compliance.
|
|
4
|
-
|
|
5
|
-
## When to Use MCP Tools vs CLI
|
|
6
|
-
|
|
7
|
-
Use MCP tools when you need results inline within an AI assistant session.
|
|
8
|
-
Use the CLI (`crawlforge <command>`) for scripts, CI, and automation pipelines.
|
|
9
|
-
|
|
10
|
-
## All 23 Tools
|
|
11
|
-
|
|
12
|
-
### Basic Fetching (5 tools)
|
|
13
|
-
- **fetch_url** — Raw HTTP fetch; returns headers + body. Use for quick single-URL fetches.
|
|
14
|
-
- **extract_text** — Clean readable text from a page (strips HTML). Use for reading articles.
|
|
15
|
-
- **extract_links** — All links from a page with anchor text. Use for link analysis.
|
|
16
|
-
- **extract_metadata** — Title, description, OG tags, schema.org from a page.
|
|
17
|
-
- **scrape_structured** — CSS-selector based data extraction from a page.
|
|
18
|
-
|
|
19
|
-
### Search (1 tool)
|
|
20
|
-
- **search_web** — Search via CrawlForge API or SearXNG. Supports query expansion, ranking, dedup.
|
|
21
|
-
|
|
22
|
-
### Crawling (2 tools)
|
|
23
|
-
- **crawl_deep** — BFS crawl up to 1000 pages with configurable depth, content extraction, link analysis.
|
|
24
|
-
- **map_site** — Fast sitemap generation via sitemap.xml or crawl. Returns URL list with metadata.
|
|
25
|
-
|
|
26
|
-
### Content Extraction (7 tools)
|
|
27
|
-
- **extract_content** — Main content extraction with Readability, markdown output, image handling.
|
|
28
|
-
- **process_document** — PDF, DOCX, TXT processing with chunking and metadata.
|
|
29
|
-
- **summarize_content** — Abstractive summarization (via Ollama/API/sampling).
|
|
30
|
-
- **analyze_content** — Sentiment, entities, readability, keyword density, topic detection.
|
|
31
|
-
- **extract_structured** — JSON schema-driven extraction with LLM or CSS selectors.
|
|
32
|
-
- **extract_with_llm** — Natural language prompt-based extraction. Fallback: Ollama → API keys → sampling.
|
|
33
|
-
- **list_ollama_models** — List locally available Ollama models.
|
|
34
|
-
|
|
35
|
-
### Advanced (2 tools)
|
|
36
|
-
- **batch_scrape** — Scrape multiple URLs concurrently. Default output: markdown (RAG-ready).
|
|
37
|
-
- **scrape_with_actions** — Browser automation (click, type, scroll, wait) before scraping.
|
|
38
|
-
|
|
39
|
-
### Research (1 tool)
|
|
40
|
-
- **deep_research** — Multi-stage research: query expansion → parallel fetch → dedup → synthesis.
|
|
41
|
-
|
|
42
|
-
### Tracking (1 tool)
|
|
43
|
-
- **track_changes** — Snapshot URL and diff against baseline. Returns change percentage + diff.
|
|
44
|
-
|
|
45
|
-
### LLMs.txt (1 tool)
|
|
46
|
-
- **generate_llms_txt** — Generate llms.txt and llms-full.txt for AI compliance.
|
|
47
|
-
|
|
48
|
-
### Stealth (1 tool)
|
|
49
|
-
- **stealth_mode** — Anti-bot browser scraping. Engines: playwright (default) or camoufox.
|
|
50
|
-
|
|
51
|
-
### Localization (1 tool)
|
|
52
|
-
- **localization** — Fetch with locale/geo targeting, proxy routing, currency awareness.
|
|
53
|
-
|
|
54
|
-
### Templates (1 tool)
|
|
55
|
-
- **scrape_template** — Pre-built extractors for: amazon-product, linkedin-profile, github-repo, youtube-video, tweet, reddit-thread, hacker-news-front-page, producthunt-launch, stackoverflow-question, npm-package.
|
|
56
|
-
|
|
57
|
-
## Cost Reference (Credits)
|
|
58
|
-
- fetch_url, extract_text, extract_links, extract_metadata: 1 credit
|
|
59
|
-
- search_web, map_site: 2 credits
|
|
60
|
-
- extract_content, scrape_structured, analyze_content, summarize_content: 3 credits
|
|
61
|
-
- crawl_deep, batch_scrape, track_changes, generate_llms_txt: 5 credits
|
|
62
|
-
- extract_structured, extract_with_llm, stealth_mode, localization, scrape_with_actions: 5 credits
|
|
63
|
-
- deep_research: 10–50 credits (dynamic, triggers elicitation when >50)
|
|
64
|
-
|
|
65
|
-
## Example Tool Calls
|
|
66
|
-
|
|
67
|
-
Fetch a page:
|
|
68
|
-
```json
|
|
69
|
-
{ "tool": "fetch_url", "params": { "url": "https://example.com" } }
|
|
70
|
-
```
|
|
71
|
-
|
|
72
|
-
Search the web:
|
|
73
|
-
```json
|
|
74
|
-
{ "tool": "search_web", "params": { "query": "MCP server Node.js", "limit": 5 } }
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
Extract markdown from an article:
|
|
78
|
-
```json
|
|
79
|
-
{ "tool": "extract_content", "params": { "url": "https://example.com/article", "output_format": "markdown" } }
|
|
80
|
-
```
|
|
@@ -1,104 +0,0 @@
|
|
|
1
|
-
# CrawlForge Deep Research Workflow
|
|
2
|
-
|
|
3
|
-
## When to Use deep_research
|
|
4
|
-
|
|
5
|
-
Use `deep_research` for comprehensive topic research that requires multiple sources:
|
|
6
|
-
- Competitive analysis (compare multiple competitors)
|
|
7
|
-
- Technology landscape research
|
|
8
|
-
- Fact-gathering with citations
|
|
9
|
-
- Market research with multiple data points
|
|
10
|
-
- Any topic requiring 5+ web sources synthesized
|
|
11
|
-
|
|
12
|
-
Do NOT use for:
|
|
13
|
-
- Single URL content extraction → use `extract_content`
|
|
14
|
-
- Simple web searches → use `search_web`
|
|
15
|
-
- Known URLs you want to read → use `fetch_url` or `batch_scrape`
|
|
16
|
-
|
|
17
|
-
## How deep_research Works
|
|
18
|
-
|
|
19
|
-
1. **Query Expansion** — Generates 3–5 related queries from your topic
|
|
20
|
-
2. **Parallel Fetching** — Fetches up to `max_urls` sources simultaneously
|
|
21
|
-
3. **URL Deduplication** — Skips already-visited URLs within the session
|
|
22
|
-
4. **Content Extraction** — Extracts clean text from each source
|
|
23
|
-
5. **Synthesis** — If Ollama/API key available: returns synthesized report; otherwise returns raw evidence for the calling LLM to synthesize
|
|
24
|
-
|
|
25
|
-
## LLM Fallback Chain
|
|
26
|
-
|
|
27
|
-
```
|
|
28
|
-
Ollama (local, default) → OpenAI API key → Anthropic API key → MCP Sampling → Raw evidence
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
With no LLM configured, `deep_research` returns structured raw evidence that Claude or another LLM can synthesize.
|
|
32
|
-
|
|
33
|
-
## MCP Tool Usage
|
|
34
|
-
|
|
35
|
-
```json
|
|
36
|
-
// Standard research
|
|
37
|
-
{
|
|
38
|
-
"tool": "deep_research",
|
|
39
|
-
"params": {
|
|
40
|
-
"query": "React vs Vue vs Angular in 2025",
|
|
41
|
-
"depth": "standard",
|
|
42
|
-
"max_urls": 20
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
// Deep research with all sources
|
|
47
|
-
{
|
|
48
|
-
"tool": "deep_research",
|
|
49
|
-
"params": {
|
|
50
|
-
"query": "competitor pricing analysis for B2B SaaS",
|
|
51
|
-
"depth": "deep",
|
|
52
|
-
"max_urls": 50,
|
|
53
|
-
"output_format": "detailed"
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
```
|
|
57
|
-
|
|
58
|
-
Note: When `max_urls > 50`, the tool triggers an elicitation asking for confirmation before proceeding (cost guard).
|
|
59
|
-
|
|
60
|
-
## CLI Usage
|
|
61
|
-
|
|
62
|
-
```bash
|
|
63
|
-
# Standard research
|
|
64
|
-
crawlforge research "React vs Vue in 2025" --depth standard
|
|
65
|
-
|
|
66
|
-
# Deep research with JSON output
|
|
67
|
-
crawlforge research "B2B SaaS pricing trends" --depth deep --max-urls 30 --json
|
|
68
|
-
|
|
69
|
-
# Save research report to file
|
|
70
|
-
crawlforge research "competitor analysis" --pretty > research-report.json
|
|
71
|
-
```
|
|
72
|
-
|
|
73
|
-
## Depth Levels
|
|
74
|
-
|
|
75
|
-
| Depth | URLs Analyzed | Use Case | Approx. Credits |
|
|
76
|
-
|-------|--------------|----------|-----------------|
|
|
77
|
-
| basic | 5–10 | Quick overview | 10–15 |
|
|
78
|
-
| standard | 15–25 | General research | 15–30 |
|
|
79
|
-
| deep | 30–75 | Comprehensive analysis | 30–75+ |
|
|
80
|
-
|
|
81
|
-
## Cost Management
|
|
82
|
-
|
|
83
|
-
- `deep_research` costs 10 base credits + 1 per URL analyzed
|
|
84
|
-
- Elicitation fires when projected cost > 50 credits
|
|
85
|
-
- Use `max_urls` to cap costs: `max_urls: 10` ≈ 20 credits max
|
|
86
|
-
- Token budget auto-limits LLM synthesis costs (default: 200,000 chars)
|
|
87
|
-
|
|
88
|
-
## Accessing Research Results as Resources
|
|
89
|
-
|
|
90
|
-
Completed research sessions are available as MCP Resources:
|
|
91
|
-
```
|
|
92
|
-
crawlforge://research/{sessionId}
|
|
93
|
-
```
|
|
94
|
-
|
|
95
|
-
List via `resources/list` — no need to re-run the research.
|
|
96
|
-
|
|
97
|
-
## Combining with Other Tools
|
|
98
|
-
|
|
99
|
-
For targeted competitive research:
|
|
100
|
-
```
|
|
101
|
-
1. search_web "competitor X pricing" → get URLs
|
|
102
|
-
2. batch_scrape [competitor URLs] → get content in parallel
|
|
103
|
-
3. deep_research "competitor X vs us" → synthesized analysis
|
|
104
|
-
```
|