crawlforge-mcp-server 4.7.2 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/CLAUDE.md +2 -2
  2. package/package.json +2 -1
  3. package/server.js +42 -9
  4. package/src/cli/commands/init.js +13 -2
  5. package/src/cli/commands/install-skills.js +10 -1
  6. package/src/cli/commands/monitor.js +81 -0
  7. package/src/cli/commands/uninstall-skills.js +10 -1
  8. package/src/core/ActionExecutor.js +51 -9
  9. package/src/core/ElicitationHelper.js +18 -5
  10. package/src/core/LLMsTxtAnalyzer.js +2 -1
  11. package/src/core/MonitorScheduler.js +281 -0
  12. package/src/core/MonitorStore.js +79 -0
  13. package/src/core/ResearchOrchestrator.js +2 -1
  14. package/src/core/crawlers/BFSCrawler.js +2 -1
  15. package/src/skills/agent-skills/crawlforge-batch-automation/SKILL.md +126 -0
  16. package/src/skills/agent-skills/crawlforge-batch-automation/references/actions.md +127 -0
  17. package/src/skills/agent-skills/crawlforge-change-tracking/SKILL.md +116 -0
  18. package/src/skills/agent-skills/crawlforge-deep-research/SKILL.md +108 -0
  19. package/src/skills/agent-skills/crawlforge-deep-research/references/workflows.md +76 -0
  20. package/src/skills/agent-skills/crawlforge-getting-started/SKILL.md +89 -0
  21. package/src/skills/agent-skills/crawlforge-getting-started/references/cli.md +71 -0
  22. package/src/skills/agent-skills/crawlforge-getting-started/references/credits.md +75 -0
  23. package/src/skills/agent-skills/crawlforge-stealth-browsing/SKILL.md +106 -0
  24. package/src/skills/agent-skills/crawlforge-stealth-browsing/references/engine-selection.md +63 -0
  25. package/src/skills/agent-skills/crawlforge-structured-extraction/SKILL.md +121 -0
  26. package/src/skills/agent-skills/crawlforge-structured-extraction/references/templates.md +39 -0
  27. package/src/skills/agent-skills/crawlforge-web-scraping/SKILL.md +141 -0
  28. package/src/skills/agent-skills/crawlforge-web-scraping/references/tool-reference.md +95 -0
  29. package/src/skills/installer.js +186 -34
  30. package/src/tools/advanced/batchScrape/worker.js +8 -2
  31. package/src/tools/basic/_fetch.js +14 -1
  32. package/src/tools/crawl/_sessionContext.js +3 -1
  33. package/src/tools/extract/_fetchAndParse.js +2 -1
  34. package/src/tools/extract/extractContent.js +2 -1
  35. package/src/tools/extract/processDocument.js +2 -1
  36. package/src/tools/scrape/_brandingExtractor.js +378 -0
  37. package/src/tools/scrape/unifiedScrape.js +66 -6
  38. package/src/tools/templates/ScrapeTemplateTool.js +2 -1
  39. package/src/tools/tracking/trackChanges/differ.js +3 -1
  40. package/src/tools/tracking/trackChanges/index.js +74 -21
  41. package/src/tools/tracking/trackChanges/schema.js +7 -2
  42. package/src/utils/hostRateLimiter.js +46 -0
  43. package/src/utils/robotsChecker.js +2 -1
  44. package/src/utils/sitemapParser.js +2 -1
  45. package/src/utils/ssrfGuard.js +161 -0
  46. package/src/utils/ssrfProtection.js +6 -9
  47. package/src/skills/crawlforge-cli.md +0 -157
  48. package/src/skills/crawlforge-mcp.md +0 -80
  49. package/src/skills/crawlforge-research.md +0 -104
  50. package/src/skills/crawlforge-stealth.md +0 -98
@@ -7,7 +7,7 @@
7
7
  import { z } from 'zod';
8
8
 
9
9
  export const TrackChangesSchema = z.object({
10
- url: z.string().url(),
10
+ url: z.string().url().optional(),
11
11
  operation: z.enum([
12
12
  'create_baseline',
13
13
  'compare',
@@ -16,6 +16,7 @@ export const TrackChangesSchema = z.object({
16
16
  'get_stats',
17
17
  'create_scheduled_monitor',
18
18
  'stop_scheduled_monitor',
19
+ 'list_scheduled_monitors',
19
20
  'get_dashboard',
20
21
  'export_history',
21
22
  'create_alert_rule',
@@ -100,7 +101,11 @@ export const TrackChangesSchema = z.object({
100
101
  scheduledMonitorOptions: z.object({
101
102
  schedule: z.string().optional(),
102
103
  templateId: z.string().optional(),
103
- enabled: z.boolean().default(true)
104
+ enabled: z.boolean().default(true),
105
+ interval: z.number().min(60000).optional(),
106
+ goal: z.string().optional(),
107
+ monitorId: z.string().optional(),
108
+ notificationThreshold: z.enum(['minor', 'moderate', 'major', 'critical']).optional()
104
109
  }).optional(),
105
110
 
106
111
  alertRuleOptions: z.object({
@@ -0,0 +1,46 @@
1
+ /**
2
+ * Shared per-host outbound rate limiter (politeness / abuse protection).
3
+ *
4
+ * Throttles outbound scraping requests per target host so a single tool call
5
+ * (batch_scrape, map_site, the basic fetch path, etc.) cannot hammer one origin.
6
+ * Mirrors the per-domain limiter BFSCrawler already uses, driven by the shared
7
+ * config so all paths agree on a default.
8
+ *
9
+ * Backwards-compatible: default 10 req/s + 100 req/min per host (the existing
10
+ * effective behaviour), enabled by RATE_LIMIT_PER_DOMAIN (default true). Setting
11
+ * RATE_LIMIT_PER_DOMAIN=false disables the throttle entirely — there is no global
12
+ * cross-host cap, so broad multi-host crawls are never slowed by this.
13
+ */
14
+ import { RateLimiter } from './rateLimiter.js';
15
+ import { config } from '../constants/config.js';
16
+
17
let _limiter = null;

/**
 * Return the module-wide RateLimiter, creating it on first use.
 * The per-second / per-minute limits are taken from the shared config and the
 * limiter is always constructed with `perDomain: true`.
 */
function limiter() {
  if (_limiter) return _limiter;

  const { requestsPerSecond, requestsPerMinute } = config.rateLimit;
  _limiter = new RateLimiter({ requestsPerSecond, requestsPerMinute, perDomain: true });
  return _limiter;
}
28
+
29
/**
 * Wait (if necessary) until another request to this URL's host is allowed.
 *
 * Never throws — a limiter failure must not block a legitimate fetch. The
 * config read lives inside the `try` as well, so a missing or partial
 * `config.rateLimit` section cannot break that guarantee.
 *
 * @param {string} url - Target URL; passed as-is to the limiter's `checkLimit`.
 * @returns {Promise<void>}
 */
export async function throttleHost(url) {
  try {
    if (config.rateLimit?.perDomain === false) return; // feature disabled
    await limiter().checkLimit(url);
  } catch {
    /* deliberate best-effort: never block a fetch on a limiter/config error */
  }
}
42
+
43
/**
 * Test/diagnostic hook: discard the cached limiter so the next `limiter()`
 * call constructs a fresh one.
 */
export function _resetHostRateLimiter() {
  _limiter = null;
}
@@ -1,4 +1,5 @@
1
1
  import robotsParser from 'robots-parser';
2
+ import { safeFetch } from './ssrfGuard.js';
2
3
 
3
4
  export class RobotsChecker {
4
5
  constructor(userAgent = 'CrawlForge/1.0') {
@@ -32,7 +33,7 @@ export class RobotsChecker {
32
33
  const controller = new AbortController();
33
34
  const timeoutId = setTimeout(() => controller.abort(), 5000);
34
35
 
35
- const response = await fetch(robotsUrl, {
36
+ const response = await safeFetch(robotsUrl, {
36
37
  signal: controller.signal,
37
38
  headers: {
38
39
  'User-Agent': this.userAgent
@@ -3,6 +3,7 @@ import zlib from 'zlib';
3
3
  import { promisify } from 'util';
4
4
  import { CacheManager } from '../core/cache/CacheManager.js';
5
5
  import { normalizeUrl } from './urlNormalizer.js';
6
+ import { safeFetch } from './ssrfGuard.js';
6
7
 
7
8
  const gunzip = promisify(zlib.gunzip);
8
9
 
@@ -632,7 +633,7 @@ export class SitemapParser {
632
633
  const timeoutId = setTimeout(() => controller.abort(), this.timeout);
633
634
 
634
635
  try {
635
- const response = await fetch(url, {
636
+ const response = await safeFetch(url, {
636
637
  signal: controller.signal,
637
638
  headers: {
638
639
  'User-Agent': this.userAgent,
@@ -0,0 +1,161 @@
1
+ /**
2
+ * SSRF guard for the live outbound fetch path.
3
+ *
4
+ * This wires the (previously unused) SSRF protections into the actual scraping
5
+ * fetch helpers. Enforcement happens at TCP connect time via a custom undici
6
+ * dispatcher `lookup`, so it covers the initial request, every redirect hop, and
7
+ * closes the DNS-rebinding window (the validated IP is the one connected to —
8
+ * there is no second, unchecked resolution).
9
+ *
10
+ * Two levels:
11
+ * - Stage 1 (default): blocks connections to loopback, link-local /
12
+ * cloud-metadata (169.254.0.0/16, incl. 169.254.169.254), and 0.0.0.0.
13
+ * These are never legitimate public-scrape targets, so impact is ~zero.
14
+ * - Stage 2 (SSRF_STRICT=true): full private-range enforcement (RFC1918, ULA,
15
+ * multicast, CGNAT, etc.) via the existing SSRFProtection range logic.
16
+ *
17
+ * Controls (backwards-compatible defaults):
18
+ * - SSRF_PROTECTION_ENABLED=false -> disable the guard entirely (kill switch).
19
+ * - ALLOWED_DOMAINS=a.com,b.com -> bypass the guard for trusted hosts
20
+ * (e.g. a local dev server at localhost). Matches host or any subdomain.
21
+ * - SSRF_STRICT=true -> Stage 2 full enforcement.
22
+ */
23
+ import dns from 'node:dns';
24
+ import { Agent } from 'undici';
25
+ import { config } from '../constants/config.js';
26
+ import { SSRFProtection } from './ssrfProtection.js';
27
+
28
// SSRFProtection instance used purely for its CIDR range math — no network state.
const _ssrf = new SSRFProtection();

// Stage-1 (default) block list: ranges no legitimate public scrape ever targets.
const STAGE1_RANGES = [
  '127.0.0.0/8',
  '169.254.0.0/16',
  '0.0.0.0/8',
  '::1/128',
  'fe80::/10',
];

// Cloud-metadata / service-discovery hostnames rejected by name, before any DNS.
const METADATA_HOSTS = new Set([
  'metadata.google.internal',
  'metadata.azure.com',
  'metadata',
]);
36
+
37
/**
 * Whether Stage-2 (full private-range) enforcement is requested.
 *
 * Reads SSRF_STRICT on every call. The value is trimmed and compared
 * case-insensitively so that `True` / `TRUE` / ` true ` do not silently leave
 * strict mode off.
 *
 * @returns {boolean}
 */
function strictMode() {
  return String(process.env.SSRF_STRICT ?? '').trim().toLowerCase() === 'true';
}
40
+
41
/**
 * Convert an IPv4-mapped IPv6 address to its dotted IPv4 form.
 * Accepts both `::ffff:a.b.c.d` and the hex spelling `::ffff:hhhh:hhhh`
 * (the form the WHATWG URL parser serializes); any other input is returned
 * unchanged.
 * @param {string} ip
 * @returns {string}
 */
function unmapIPv4(ip) {
  const text = String(ip);
  const dotted = /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/i.exec(text);
  if (dotted) return dotted[1];

  const hex = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/i.exec(text);
  if (!hex) return ip;

  const high = Number.parseInt(hex[1], 16);
  const low = Number.parseInt(hex[2], 16);
  return `${high >> 8}.${high & 0xff}.${low >> 8}.${low & 0xff}`;
}

/**
 * Whether a resolved IP must be blocked for the current mode.
 *
 * IPv4-mapped IPv6 addresses are unwrapped first so that `::ffff:127.0.0.1`
 * is judged as `127.0.0.1` rather than slipping past the IPv4 ranges.
 *
 * @param {string} ip - Textual IPv4 or IPv6 address (no brackets).
 * @returns {boolean} true when a connection to `ip` must be refused.
 */
export function ipBlocked(ip) {
  const addr = unmapIPv4(ip);

  if (strictMode()) {
    // Full enforcement: anything not explicitly allowed by SSRFProtection.
    return !_ssrf.isIPAllowed(addr);
  }

  // Fast path for the common literals; `::` is the IPv6 counterpart of 0.0.0.0.
  if (addr === '127.0.0.1' || addr === '::1' || addr === '0.0.0.0' || addr === '::') {
    return true;
  }
  return STAGE1_RANGES.some((range) => _ssrf.isIPInRange(addr, range));
}
54
+
55
/**
 * undici connect-time lookup: resolves the host, rejects if ANY resolved address
 * is blocked, otherwise hands the caller the validated address(es) — so the
 * socket connects to exactly what was checked.
 *
 * Follows the `dns.lookup(hostname[, options], callback)` calling convention:
 * the options argument may be omitted, and a caller-supplied `family` / `hints`
 * is forwarded to the resolver instead of being dropped.
 *
 * @param {string} hostname
 * @param {object|Function} opts - Lookup options, or the callback when omitted.
 * @param {Function} [callback] - `(err, address, family)` or, with `opts.all`,
 *   `(err, addresses)`.
 */
function ssrfLookup(hostname, opts, callback) {
  const done = typeof opts === 'function' ? opts : callback;
  const requested = opts && typeof opts === 'object' ? opts : {};

  // Always resolve every address so each one can be validated.
  const lookupOptions = { all: true, verbatim: true };
  if (requested.family != null) lookupOptions.family = requested.family;
  if (requested.hints != null) lookupOptions.hints = requested.hints;

  dns.lookup(hostname, lookupOptions, (err, addresses) => {
    if (err) return done(err);

    // Defensive: an empty result would otherwise crash on `addresses[0]` below.
    if (!Array.isArray(addresses) || addresses.length === 0) {
      return done(
        Object.assign(new Error(`getaddrinfo ENOTFOUND ${hostname}`), {
          code: 'ENOTFOUND',
          hostname,
        })
      );
    }

    const blocked = addresses.find(({ address }) => ipBlocked(address));
    if (blocked) {
      return done(
        Object.assign(
          new Error(`SSRF Protection: ${hostname} resolves to blocked address ${blocked.address}`),
          { code: 'SSRF_BLOCKED' }
        )
      );
    }

    if (requested.all) return done(null, addresses);
    const [first] = addresses;
    return done(null, first.address, first.family);
  });
}
78
+
79
let _agent = null;

/**
 * Return the shared undici Agent whose connect step resolves hosts through
 * `ssrfLookup`. The Agent is created on the first call and reused afterwards.
 */
function guardedDispatcher() {
  if (_agent) return _agent;

  const connect = { lookup: ssrfLookup };
  _agent = new Agent({ connect });
  return _agent;
}
86
+
87
/**
 * Whether `host` equals, or is a subdomain of, any entry in `allowed`.
 *
 * Matching is case-insensitive and ignores a trailing root dot on either side.
 * `allowed` may be an array (or other iterable) of domains or a raw
 * comma-separated string; blank entries never match. A missing host or a
 * missing / unusable list yields `false` rather than throwing.
 *
 * @param {string} host
 * @param {string[]|string|null|undefined} allowed
 * @returns {boolean}
 */
function isAllowlisted(host, allowed) {
  const normalize = (value) => String(value).trim().toLowerCase().replace(/\.$/, '');

  const target = normalize(host ?? '');
  if (target === '') return false;

  const entries = typeof allowed === 'string' ? allowed.split(',') : Array.from(allowed ?? []);
  return entries.some((entry) => {
    const domain = normalize(entry);
    return domain !== '' && (target === domain || target.endsWith(`.${domain}`));
  });
}
93
+
94
/**
 * Pre-flight check + dispatcher selection for an outbound scrape target.
 * Returns `{ dispatcher }` to spread into fetch options. `dispatcher` is
 * undefined when the guard is disabled or the host is explicitly allowlisted.
 *
 * Pre-flight rejections (all thrown with `code: 'SSRF_BLOCKED'`):
 *  - any protocol other than http: / https:
 *  - a literal metadata hostname (a trailing root dot is ignored)
 *  - an IP-literal host that `ipBlocked` rejects. This has to happen here:
 *    Node's `net` connects directly to IP-literal hosts without calling the
 *    custom `lookup`, so the connect-time check never sees them.
 *
 * NOTE(review): this pre-flight only sees the initial URL. A redirect to an
 * IP-literal target is followed inside fetch and would likewise skip `lookup`
 * — confirm whether callers need manual redirect handling.
 *
 * @param {string} url
 * @returns {{ dispatcher?: import('undici').Agent }}
 * @throws {Error} with `code === 'SSRF_BLOCKED'` for the violations above.
 */
export function ssrfGuard(url) {
  const sec = config.security?.ssrfProtection;
  if (!sec || sec.enabled === false) return {}; // kill switch -> default fetch behavior

  let u;
  try {
    u = new URL(url);
  } catch {
    return {}; // let fetch surface its own invalid-URL error
  }

  if (!['http:', 'https:'].includes(u.protocol)) {
    throw Object.assign(new Error(`SSRF Protection: protocol '${u.protocol}' is not allowed`), {
      code: 'SSRF_BLOCKED',
    });
  }

  // "host." and "host" name the same target; compare without the root dot.
  const host = u.hostname.toLowerCase().replace(/\.$/, '');
  if (isAllowlisted(host, sec.allowedDomains)) return {}; // explicit escape hatch

  if (METADATA_HOSTS.has(host)) {
    throw Object.assign(new Error(`SSRF Protection: blocked metadata host '${host}'`), {
      code: 'SSRF_BLOCKED',
    });
  }

  // URL.hostname brackets IPv6 literals and normalizes every IPv4 spelling
  // (decimal, hex, short forms) to dotted-decimal, so these two tests suffice.
  let ipLiteral = null;
  if (host.startsWith('[') && host.endsWith(']')) {
    ipLiteral = host.slice(1, -1);
  } else if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(host)) {
    ipLiteral = host;
  }
  if (ipLiteral !== null && ipBlocked(ipLiteral)) {
    throw Object.assign(new Error(`SSRF Protection: blocked address ${ipLiteral}`), {
      code: 'SSRF_BLOCKED',
    });
  }

  return { dispatcher: guardedDispatcher() };
}
131
+
132
/**
 * True if an error, or any error in its `cause` chain, came from the SSRF guard
 * (i.e. carries `code === 'SSRF_BLOCKED'`).
 *
 * The chain walk is depth-limited so a self-referencing `cause` cannot loop
 * forever.
 *
 * @param {unknown} err
 * @returns {boolean}
 */
export function isSsrfError(err) {
  const MAX_DEPTH = 8;
  let current = err;
  for (let depth = 0; current != null && depth < MAX_DEPTH; depth += 1) {
    if (current.code === 'SSRF_BLOCKED') return true;
    current = current.cause;
  }
  return false;
}
136
+
137
/**
 * Drop-in replacement for global `fetch` that applies the SSRF guard.
 * Behaviour-preserving for allowed URLs: all options pass through unchanged and
 * the native Response is returned; only a guarded dispatcher is injected (it
 * overrides any `dispatcher` supplied in `options`).
 *
 * For blocked targets it throws a clear `SSRF Protection: ...` error, either
 * pre-flight from `ssrfGuard` or re-raised from a connect-time rejection. The
 * re-raised error keeps `code: 'SSRF_BLOCKED'` and the original error as
 * `cause`, so `isSsrfError()` recognises it and the stack is not lost.
 *
 * NOTE(review): the dispatcher comes from the `undici` package while `fetch`
 * is the runtime global — confirm the two are version-compatible.
 *
 * @param {string} url
 * @param {RequestInit} [options]
 * @returns {Promise<Response>}
 * @throws {Error} with `code === 'SSRF_BLOCKED'` when the target is blocked;
 *   any other fetch error is rethrown untouched.
 */
export async function safeFetch(url, options = {}) {
  const guard = ssrfGuard(url); // throws on pre-flight violations
  try {
    return await fetch(url, { ...options, ...guard });
  } catch (err) {
    if (!isSsrfError(err)) throw err;

    // Surface the guard's own message instead of the generic "fetch failed".
    const message = err.cause?.message || err.message;
    throw Object.assign(new Error(message, { cause: err }), { code: 'SSRF_BLOCKED' });
  }
}
159
+
160
// Internal helpers re-exported so unit tests can reach them.
export const __ssrfInternals = {
  ssrfLookup,
  isAllowlisted,
  STAGE1_RANGES,
};
@@ -414,14 +414,7 @@ export class SSRFProtection {
414
414
  return false;
415
415
  }
416
416
  }
417
- /**
418
417
 
419
- /**
420
- * Check for path traversal patterns in raw URL before parsing
421
- * @param {string} url - Raw URL to check
422
- * @returns {Object} - Result with violations array
423
- */
424
- * Validate URL path for suspicious patterns
425
418
  /**
426
419
  * Check for path traversal patterns in raw URL before parsing
427
420
  * @param {string} url - Raw URL to check
@@ -454,9 +447,13 @@ export class SSRFProtection {
454
447
 
455
448
  return { violations };
456
449
  }
457
- * @param {string} path
450
+
451
+ /**
452
+ * Validate URL path for suspicious patterns
453
+ * @param {string} path
458
454
  * @returns {Object}
459
- */ validatePath(path) {
455
+ */
456
+ validatePath(path) {
460
457
  const suspiciousPatterns = [
461
458
  /\.\.\//, // Directory traversal
462
459
  /\/etc\//, // System files
@@ -1,157 +0,0 @@
1
- # CrawlForge CLI Usage Guide
2
-
3
- The `crawlforge` CLI exposes all 23 MCP tools as command-line subcommands.
4
-
5
- ## Installation
6
-
7
- ```bash
8
- npm install -g crawlforge-mcp-server
9
- # or run without installing:
10
- npx crawlforge-mcp-server <command>
11
- ```
12
-
13
- ## Global Flags
14
-
15
- All commands support these flags:
16
- - `--json` — output compact JSON
17
- - `--pretty` — output pretty-printed JSON
18
- - `--quiet` — suppress output (exit code only)
19
- - `--api-key <key>` — override CRAWLFORGE_API_KEY env var
20
- - `--timeout <ms>` — global request timeout (default: 30000)
21
-
22
- ## Commands
23
-
24
- ### scrape — Fetch a URL
25
- ```bash
26
- crawlforge scrape https://example.com
27
- crawlforge scrape https://example.com --extract --format markdown
28
- crawlforge scrape https://example.com --pretty
29
- ```
30
-
31
- ### search — Search the web
32
- ```bash
33
- crawlforge search "MCP server tutorial" --limit 10
34
- crawlforge search "nodejs scraping" --provider searxng --json
35
- ```
36
-
37
- ### crawl — Deep website crawl
38
- ```bash
39
- crawlforge crawl https://docs.example.com --depth 3 --max-pages 200
40
- crawlforge crawl https://example.com --no-robots --concurrency 20
41
- ```
42
-
43
- ### map — Generate sitemap
44
- ```bash
45
- crawlforge map https://example.com --pretty
46
- crawlforge map https://example.com --format xml > sitemap.xml
47
- ```
48
-
49
- ### extract — Structured data extraction
50
- ```bash
51
- # Schema-based extraction
52
- crawlforge extract https://example.com/product --schema product-schema.json
53
-
54
- # LLM-guided extraction
55
- crawlforge extract https://example.com/article --prompt "extract title, author, date, summary"
56
- ```
57
-
58
- ### track — Track content changes
59
- ```bash
60
- crawlforge track https://example.com --threshold 10
61
- crawlforge track https://example.com --selector ".main-content"
62
- ```
63
-
64
- ### analyze — Content analysis
65
- ```bash
66
- crawlforge analyze https://example.com --depth full --pretty
67
- ```
68
-
69
- ### research — Deep research
70
- ```bash
71
- crawlforge research "state of AI in 2025" --depth deep --max-urls 30
72
- crawlforge research "competitor pricing" --output-format detailed --json
73
- ```
74
-
75
- ### stealth — Anti-bot scraping
76
- ```bash
77
- crawlforge stealth https://protected-site.com
78
- crawlforge stealth https://protected-site.com --engine camoufox --screenshot
79
- ```
80
-
81
- ### batch — Batch scrape from file
82
- ```bash
83
- # Create a URLs file:
84
- cat > urls.txt << EOF
85
- https://example.com/page1
86
- https://example.com/page2
87
- https://example.com/page3
88
- EOF
89
-
90
- crawlforge batch urls.txt --format markdown --concurrency 10
91
- ```
92
-
93
- ### actions — Browser automation
94
- ```bash
95
- # Create an actions script:
96
- cat > login.json << EOF
97
- [
98
- { "type": "click", "selector": "#login-btn" },
99
- { "type": "type", "selector": "#email", "text": "user@example.com" },
100
- { "type": "wait", "duration": 1000 }
101
- ]
102
- EOF
103
-
104
- crawlforge actions https://example.com --script login.json --screenshot
105
- ```
106
-
107
- ### localize — Geo-targeted fetch
108
- ```bash
109
- crawlforge localize https://example.com --locale fr-FR --country FR
110
- crawlforge localize https://shop.example.com --locale en-GB --currency GBP
111
- ```
112
-
113
- ### llmstxt — Generate llms.txt
114
- ```bash
115
- crawlforge llmstxt https://example.com
116
- crawlforge llmstxt https://example.com --include-full > llms.txt
117
- ```
118
-
119
- ### template — Pre-built site scrapers
120
- ```bash
121
- crawlforge template github-repo https://github.com/owner/repo
122
- crawlforge template amazon-product https://amazon.com/dp/B0XXXXX
123
- crawlforge template npm-package https://npmjs.com/package/commander
124
- crawlforge template --list # list all available templates
125
- ```
126
-
127
- ### monitor — Continuous change monitoring
128
- ```bash
129
- crawlforge monitor https://example.com --interval 60 --webhook https://my-site.com/hook
130
- crawlforge monitor https://example.com --selector ".price" --threshold 1
131
- ```
132
-
133
- ### install-skills — Install AI assistant skills
134
- ```bash
135
- crawlforge install-skills --target claude-code
136
- crawlforge install-skills --target cursor --force
137
- crawlforge install-skills --target all --dry-run
138
- ```
139
-
140
- ### uninstall-skills — Remove AI assistant skills
141
- ```bash
142
- crawlforge uninstall-skills --target claude-code
143
- crawlforge uninstall-skills --target all
144
- ```
145
-
146
- ## Output Piping Examples
147
-
148
- ```bash
149
- # Extract markdown and save to file
150
- crawlforge scrape https://example.com --extract --format markdown > page.md
151
-
152
- # Search and parse with jq
153
- crawlforge search "nodejs MCP" --json | jq '.results[].url'
154
-
155
- # Batch scrape and process results
156
- crawlforge batch urls.txt --json | jq '.results | length'
157
- ```
@@ -1,80 +0,0 @@
1
- # CrawlForge MCP Tools — When and How to Use
2
-
3
- CrawlForge is a professional MCP server with 23 tools for web scraping, crawling, content extraction, research, and AI compliance.
4
-
5
- ## When to Use MCP Tools vs CLI
6
-
7
- Use MCP tools when you need results inline within an AI assistant session.
8
- Use the CLI (`crawlforge <command>`) for scripts, CI, and automation pipelines.
9
-
10
- ## All 23 Tools
11
-
12
- ### Basic Fetching (5 tools)
13
- - **fetch_url** — Raw HTTP fetch; returns headers + body. Use for quick single-URL fetches.
14
- - **extract_text** — Clean readable text from a page (strips HTML). Use for reading articles.
15
- - **extract_links** — All links from a page with anchor text. Use for link analysis.
16
- - **extract_metadata** — Title, description, OG tags, schema.org from a page.
17
- - **scrape_structured** — CSS-selector based data extraction from a page.
18
-
19
- ### Search (1 tool)
20
- - **search_web** — Search via CrawlForge API or SearXNG. Supports query expansion, ranking, dedup.
21
-
22
- ### Crawling (2 tools)
23
- - **crawl_deep** — BFS crawl up to 1000 pages with configurable depth, content extraction, link analysis.
24
- - **map_site** — Fast sitemap generation via sitemap.xml or crawl. Returns URL list with metadata.
25
-
26
- ### Content Extraction (7 tools)
27
- - **extract_content** — Main content extraction with Readability, markdown output, image handling.
28
- - **process_document** — PDF, DOCX, TXT processing with chunking and metadata.
29
- - **summarize_content** — Abstractive summarization (via Ollama/API/sampling).
30
- - **analyze_content** — Sentiment, entities, readability, keyword density, topic detection.
31
- - **extract_structured** — JSON schema-driven extraction with LLM or CSS selectors.
32
- - **extract_with_llm** — Natural language prompt-based extraction. Fallback: Ollama → API keys → sampling.
33
- - **list_ollama_models** — List locally available Ollama models.
34
-
35
- ### Advanced (2 tools)
36
- - **batch_scrape** — Scrape multiple URLs concurrently. Default output: markdown (RAG-ready).
37
- - **scrape_with_actions** — Browser automation (click, type, scroll, wait) before scraping.
38
-
39
- ### Research (1 tool)
40
- - **deep_research** — Multi-stage research: query expansion → parallel fetch → dedup → synthesis.
41
-
42
- ### Tracking (1 tool)
43
- - **track_changes** — Snapshot URL and diff against baseline. Returns change percentage + diff.
44
-
45
- ### LLMs.txt (1 tool)
46
- - **generate_llms_txt** — Generate llms.txt and llms-full.txt for AI compliance.
47
-
48
- ### Stealth (1 tool)
49
- - **stealth_mode** — Anti-bot browser scraping. Engines: playwright (default) or camoufox.
50
-
51
- ### Localization (1 tool)
52
- - **localization** — Fetch with locale/geo targeting, proxy routing, currency awareness.
53
-
54
- ### Templates (1 tool)
55
- - **scrape_template** — Pre-built extractors for: amazon-product, linkedin-profile, github-repo, youtube-video, tweet, reddit-thread, hacker-news-front-page, producthunt-launch, stackoverflow-question, npm-package.
56
-
57
- ## Cost Reference (Credits)
58
- - fetch_url, extract_text, extract_links, extract_metadata: 1 credit
59
- - search_web, map_site: 2 credits
60
- - extract_content, scrape_structured, analyze_content, summarize_content: 3 credits
61
- - crawl_deep, batch_scrape, track_changes, generate_llms_txt: 5 credits
62
- - extract_structured, extract_with_llm, stealth_mode, localization, scrape_with_actions: 5 credits
63
- - deep_research: 10–50 credits (dynamic, triggers elicitation when >50)
64
-
65
- ## Example Tool Calls
66
-
67
- Fetch a page:
68
- ```json
69
- { "tool": "fetch_url", "params": { "url": "https://example.com" } }
70
- ```
71
-
72
- Search the web:
73
- ```json
74
- { "tool": "search_web", "params": { "query": "MCP server Node.js", "limit": 5 } }
75
- ```
76
-
77
- Extract markdown from an article:
78
- ```json
79
- { "tool": "extract_content", "params": { "url": "https://example.com/article", "output_format": "markdown" } }
80
- ```
@@ -1,104 +0,0 @@
1
- # CrawlForge Deep Research Workflow
2
-
3
- ## When to Use deep_research
4
-
5
- Use `deep_research` for comprehensive topic research that requires multiple sources:
6
- - Competitive analysis (compare multiple competitors)
7
- - Technology landscape research
8
- - Fact-gathering with citations
9
- - Market research with multiple data points
10
- - Any topic requiring 5+ web sources synthesized
11
-
12
- Do NOT use for:
13
- - Single URL content extraction → use `extract_content`
14
- - Simple web searches → use `search_web`
15
- - Known URLs you want to read → use `fetch_url` or `batch_scrape`
16
-
17
- ## How deep_research Works
18
-
19
- 1. **Query Expansion** — Generates 3–5 related queries from your topic
20
- 2. **Parallel Fetching** — Fetches up to `max_urls` sources simultaneously
21
- 3. **URL Deduplication** — Skips already-visited URLs within the session
22
- 4. **Content Extraction** — Extracts clean text from each source
23
- 5. **Synthesis** — If Ollama/API key available: returns synthesized report; otherwise returns raw evidence for the calling LLM to synthesize
24
-
25
- ## LLM Fallback Chain
26
-
27
- ```
28
- Ollama (local, default) → OpenAI API key → Anthropic API key → MCP Sampling → Raw evidence
29
- ```
30
-
31
- With no LLM configured, `deep_research` returns structured raw evidence that Claude or another LLM can synthesize.
32
-
33
- ## MCP Tool Usage
34
-
35
- ```json
36
- // Standard research
37
- {
38
- "tool": "deep_research",
39
- "params": {
40
- "query": "React vs Vue vs Angular in 2025",
41
- "depth": "standard",
42
- "max_urls": 20
43
- }
44
- }
45
-
46
- // Deep research with all sources
47
- {
48
- "tool": "deep_research",
49
- "params": {
50
- "query": "competitor pricing analysis for B2B SaaS",
51
- "depth": "deep",
52
- "max_urls": 50,
53
- "output_format": "detailed"
54
- }
55
- }
56
- ```
57
-
58
- Note: When `max_urls > 50`, the tool triggers an elicitation asking for confirmation before proceeding (cost guard).
59
-
60
- ## CLI Usage
61
-
62
- ```bash
63
- # Standard research
64
- crawlforge research "React vs Vue in 2025" --depth standard
65
-
66
- # Deep research with JSON output
67
- crawlforge research "B2B SaaS pricing trends" --depth deep --max-urls 30 --json
68
-
69
- # Save research report to file
70
- crawlforge research "competitor analysis" --pretty > research-report.json
71
- ```
72
-
73
- ## Depth Levels
74
-
75
- | Depth | URLs Analyzed | Use Case | Approx. Credits |
76
- |-------|--------------|----------|-----------------|
77
- | basic | 5–10 | Quick overview | 10–15 |
78
- | standard | 15–25 | General research | 15–30 |
79
- | deep | 30–75 | Comprehensive analysis | 30–75+ |
80
-
81
- ## Cost Management
82
-
83
- - `deep_research` costs 10 base credits + 1 per URL analyzed
84
- - Elicitation fires when projected cost > 50 credits
85
- - Use `max_urls` to cap costs: `max_urls: 10` ≈ 20 credits max
86
- - Token budget auto-limits LLM synthesis costs (default: 200,000 chars)
87
-
88
- ## Accessing Research Results as Resources
89
-
90
- Completed research sessions are available as MCP Resources:
91
- ```
92
- crawlforge://research/{sessionId}
93
- ```
94
-
95
- List via `resources/list` — no need to re-run the research.
96
-
97
- ## Combining with Other Tools
98
-
99
- For targeted competitive research:
100
- ```
101
- 1. search_web "competitor X pricing" → get URLs
102
- 2. batch_scrape [competitor URLs] → get content in parallel
103
- 3. deep_research "competitor X vs us" → synthesized analysis
104
- ```