crawlforge-mcp-server 3.4.0 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/README.md +28 -2
  2. package/package.json +6 -4
  3. package/server.js +166 -32
  4. package/src/cli/commands/actions.js +36 -0
  5. package/src/cli/commands/analyze.js +19 -0
  6. package/src/cli/commands/batch.js +45 -0
  7. package/src/cli/commands/crawl.js +30 -0
  8. package/src/cli/commands/extract.js +45 -0
  9. package/src/cli/commands/install-skills.js +46 -0
  10. package/src/cli/commands/llmstxt.js +24 -0
  11. package/src/cli/commands/localize.js +29 -0
  12. package/src/cli/commands/map.js +26 -0
  13. package/src/cli/commands/monitor.js +29 -0
  14. package/src/cli/commands/research.js +26 -0
  15. package/src/cli/commands/scrape.js +37 -0
  16. package/src/cli/commands/search.js +28 -0
  17. package/src/cli/commands/stealth.js +29 -0
  18. package/src/cli/commands/template.js +26 -0
  19. package/src/cli/commands/track.js +24 -0
  20. package/src/cli/commands/uninstall-skills.js +35 -0
  21. package/src/cli/formatter.js +57 -0
  22. package/src/cli/index.js +94 -0
  23. package/src/cli/lib/runTool.js +40 -0
  24. package/src/core/ActionExecutor.js +8 -6
  25. package/src/core/AuthManager.js +103 -3
  26. package/src/core/ChangeTracker.js +34 -0
  27. package/src/core/ElicitationHelper.js +112 -0
  28. package/src/core/JobManager.js +36 -2
  29. package/src/core/LocalizationManager.js +19 -5
  30. package/src/core/PerformanceManager.js +53 -17
  31. package/src/core/ResearchOrchestrator.js +40 -5
  32. package/src/core/SamplingClient.js +191 -0
  33. package/src/core/StealthBrowserManager.js +248 -2
  34. package/src/core/WebhookDispatcher.js +18 -10
  35. package/src/prompts/PromptRegistry.js +199 -0
  36. package/src/resources/ResourceRegistry.js +273 -0
  37. package/src/server/transports/streamableHttp.js +6 -6
  38. package/src/server/withAuth.js +25 -0
  39. package/src/skills/crawlforge-cli.md +157 -0
  40. package/src/skills/crawlforge-mcp.md +80 -0
  41. package/src/skills/crawlforge-research.md +104 -0
  42. package/src/skills/crawlforge-stealth.md +98 -0
  43. package/src/skills/installer.js +141 -0
  44. package/src/tools/advanced/batchScrape/index.js +30 -0
  45. package/src/tools/advanced/batchScrape/schema.js +1 -1
  46. package/src/tools/basic/extractText.js +19 -8
  47. package/src/tools/crawl/crawlDeep.js +27 -0
  48. package/src/tools/extract/extractContent.js +5 -17
  49. package/src/tools/extract/extractStructured.js +8 -0
  50. package/src/tools/extract/extractWithLlm.js +35 -25
  51. package/src/tools/extract/listOllamaModels.js +66 -0
  52. package/src/tools/extract/processDocument.js +7 -1
  53. package/src/tools/extract/summarizeContent.js +17 -0
  54. package/src/tools/research/deepResearch.js +34 -0
  55. package/src/tools/templates/ScrapeTemplateTool.js +68 -0
  56. package/src/tools/templates/TemplateRegistry.js +311 -0
  57. package/src/utils/Logger.js +15 -0
  58. package/src/utils/htmlToMarkdown.js +54 -0
  59. package/src/utils/secretMask.js +86 -0
@@ -0,0 +1,80 @@
1
+ # CrawlForge MCP Tools — When and How to Use
2
+
3
+ CrawlForge is a professional MCP server with 23 tools for web scraping, crawling, content extraction, research, and AI compliance.
4
+
5
+ ## When to Use MCP Tools vs CLI
6
+
7
+ Use MCP tools when you need results inline within an AI assistant session.
8
+ Use the CLI (`crawlforge <command>`) for scripts, CI, and automation pipelines.
9
+
10
+ ## All 23 Tools
11
+
12
+ ### Basic Fetching (5 tools)
13
+ - **fetch_url** — Raw HTTP fetch; returns headers + body. Use for quick single-URL fetches.
14
+ - **extract_text** — Clean readable text from a page (strips HTML). Use for reading articles.
15
+ - **extract_links** — All links from a page with anchor text. Use for link analysis.
16
+ - **extract_metadata** — Title, description, OG tags, schema.org from a page.
17
+ - **scrape_structured** — CSS-selector based data extraction from a page.
18
+
19
+ ### Search (1 tool)
20
+ - **search_web** — Search via CrawlForge API or SearXNG. Supports query expansion, ranking, dedup.
21
+
22
+ ### Crawling (2 tools)
23
+ - **crawl_deep** — BFS crawl up to 1000 pages with configurable depth, content extraction, link analysis.
24
+ - **map_site** — Fast sitemap generation via sitemap.xml or crawl. Returns URL list with metadata.
25
+
26
+ ### Content Extraction (7 tools)
27
+ - **extract_content** — Main content extraction with Readability, markdown output, image handling.
28
+ - **process_document** — PDF, DOCX, TXT processing with chunking and metadata.
29
+ - **summarize_content** — Abstractive summarization (via Ollama/API/sampling).
30
+ - **analyze_content** — Sentiment, entities, readability, keyword density, topic detection.
31
+ - **extract_structured** — JSON schema-driven extraction with LLM or CSS selectors.
32
+ - **extract_with_llm** — Natural language prompt-based extraction. Fallback: Ollama → API keys → sampling.
33
+ - **list_ollama_models** — List locally available Ollama models.
34
+
35
+ ### Advanced (2 tools)
36
+ - **batch_scrape** — Scrape multiple URLs concurrently. Default output: json; pass `formats: ["markdown"]` for RAG-ready markdown.
37
+ - **scrape_with_actions** — Browser automation (click, type, scroll, wait) before scraping.
38
+
39
+ ### Research (1 tool)
40
+ - **deep_research** — Multi-stage research: query expansion → parallel fetch → dedup → synthesis.
41
+
42
+ ### Tracking (1 tool)
43
+ - **track_changes** — Snapshot URL and diff against baseline. Returns change percentage + diff.
44
+
45
+ ### LLMs.txt (1 tool)
46
+ - **generate_llms_txt** — Generate llms.txt and llms-full.txt for AI compliance.
47
+
48
+ ### Stealth (1 tool)
49
+ - **stealth_mode** — Anti-bot browser scraping. Engines: playwright (default) or camoufox.
50
+
51
+ ### Localization (1 tool)
52
+ - **localization** — Fetch with locale/geo targeting, proxy routing, currency awareness.
53
+
54
+ ### Templates (1 tool)
55
+ - **scrape_template** — Pre-built extractors for: amazon-product, linkedin-profile, github-repo, youtube-video, tweet, reddit-thread, hacker-news-front-page, producthunt-launch, stackoverflow-question, npm-package.
56
+
57
+ ## Cost Reference (Credits)
58
+ - fetch_url, extract_text, extract_links, extract_metadata: 1 credit
59
+ - search_web, map_site: 2 credits
60
+ - extract_content, scrape_structured, analyze_content, summarize_content: 3 credits
61
+ - crawl_deep, batch_scrape, track_changes, generate_llms_txt: 5 credits
62
+ - extract_structured, extract_with_llm, stealth_mode, localization, scrape_with_actions: 5 credits
63
+ - deep_research: 10–50 credits (dynamic; triggers a confirmation elicitation when the projected cost exceeds 50 credits)
64
+
65
+ ## Example Tool Calls
66
+
67
+ Fetch a page:
68
+ ```json
69
+ { "tool": "fetch_url", "params": { "url": "https://example.com" } }
70
+ ```
71
+
72
+ Search the web:
73
+ ```json
74
+ { "tool": "search_web", "params": { "query": "MCP server Node.js", "limit": 5 } }
75
+ ```
76
+
77
+ Extract markdown from an article:
78
+ ```json
79
+ { "tool": "extract_content", "params": { "url": "https://example.com/article", "output_format": "markdown" } }
80
+ ```
@@ -0,0 +1,104 @@
1
+ # CrawlForge Deep Research Workflow
2
+
3
+ ## When to Use deep_research
4
+
5
+ Use `deep_research` for comprehensive topic research that requires multiple sources:
6
+ - Competitive analysis (compare multiple competitors)
7
+ - Technology landscape research
8
+ - Fact-gathering with citations
9
+ - Market research with multiple data points
10
+ - Any topic requiring 5+ web sources synthesized
11
+
12
+ Do NOT use for:
13
+ - Single URL content extraction → use `extract_content`
14
+ - Simple web searches → use `search_web`
15
+ - Known URLs you want to read → use `fetch_url` or `batch_scrape`
16
+
17
+ ## How deep_research Works
18
+
19
+ 1. **Query Expansion** — Generates 3–5 related queries from your topic
20
+ 2. **Parallel Fetching** — Fetches up to `max_urls` sources simultaneously
21
+ 3. **URL Deduplication** — Skips already-visited URLs within the session
22
+ 4. **Content Extraction** — Extracts clean text from each source
23
+ 5. **Synthesis** — If Ollama/API key available: returns synthesized report; otherwise returns raw evidence for the calling LLM to synthesize
24
+
25
+ ## LLM Fallback Chain
26
+
27
+ ```
28
+ Ollama (local, default) → OpenAI API key → Anthropic API key → MCP Sampling → Raw evidence
29
+ ```
30
+
31
+ With no LLM configured, `deep_research` returns structured raw evidence that Claude or another LLM can synthesize.
32
+
33
+ ## MCP Tool Usage
34
+
35
+ ```json
36
+ // Standard research
37
+ {
38
+ "tool": "deep_research",
39
+ "params": {
40
+ "query": "React vs Vue vs Angular in 2025",
41
+ "depth": "standard",
42
+ "max_urls": 20
43
+ }
44
+ }
45
+
46
+ // Deep research with all sources
47
+ {
48
+ "tool": "deep_research",
49
+ "params": {
50
+ "query": "competitor pricing analysis for B2B SaaS",
51
+ "depth": "deep",
52
+ "max_urls": 50,
53
+ "output_format": "detailed"
54
+ }
55
+ }
56
+ ```
57
+
58
+ Note: When `max_urls > 50`, the tool triggers an elicitation asking for confirmation before proceeding (cost guard).
59
+
60
+ ## CLI Usage
61
+
62
+ ```bash
63
+ # Standard research
64
+ crawlforge research "React vs Vue in 2025" --depth standard
65
+
66
+ # Deep research with JSON output
67
+ crawlforge research "B2B SaaS pricing trends" --depth deep --max-urls 30 --json
68
+
69
+ # Save research report to file
70
+ crawlforge research "competitor analysis" --pretty > research-report.json
71
+ ```
72
+
73
+ ## Depth Levels
74
+
75
+ | Depth | URLs Analyzed | Use Case | Approx. Credits |
76
+ |-------|--------------|----------|-----------------|
77
+ | basic | 5–10 | Quick overview | 10–15 |
78
+ | standard | 15–25 | General research | 15–30 |
79
+ | deep | 30–75 | Comprehensive analysis | 30–75+ |
80
+
81
+ ## Cost Management
82
+
83
+ - `deep_research` costs 10 base credits + 1 per URL analyzed
84
+ - Elicitation fires when projected cost > 50 credits
85
+ - Use `max_urls` to cap costs: `max_urls: 10` ≈ 20 credits max
86
+ - Token budget auto-limits LLM synthesis costs (default: 200,000 chars)
87
+
88
+ ## Accessing Research Results as Resources
89
+
90
+ Completed research sessions are available as MCP Resources:
91
+ ```
92
+ crawlforge://research/{sessionId}
93
+ ```
94
+
95
+ List via `resources/list` — no need to re-run the research.
96
+
97
+ ## Combining with Other Tools
98
+
99
+ For targeted competitive research:
100
+ ```
101
+ 1. search_web "competitor X pricing" → get URLs
102
+ 2. batch_scrape [competitor URLs] → get content in parallel
103
+ 3. deep_research "competitor X vs us" → synthesized analysis
104
+ ```
@@ -0,0 +1,98 @@
1
+ # CrawlForge Stealth Mode Guide
2
+
3
+ ## When to Use stealth_mode
4
+
5
+ Use `stealth_mode` when a site responds with bot-detection errors, 403 responses, or CAPTCHAs, or when it serves JavaScript-rendered content that `fetch_url` and `extract_content` cannot access.
6
+
7
+ Signs you need stealth mode:
8
+ - Site returns 403 or 429 on regular fetch
9
+ - Content is empty or shows "please enable JavaScript"
10
+ - Site uses Cloudflare, DataDome, PerimeterX, or similar bot protection
11
+
12
+ ## Engines
13
+
14
+ ### playwright (default)
15
+ - Chromium-based with stealth patches
16
+ - Masks webdriver fingerprints, User-Agent, navigator properties
17
+ - Good for most sites with basic bot detection
18
+ - Lower resource usage
19
+
20
+ ### camoufox
21
+ - Firefox-based with native anti-detection
22
+ - No patches applied — uses Firefox's native properties
23
+ - Scores higher on CreepJS and DataDome than patched Chromium
24
+ - Use for sites with advanced fingerprinting (financial, e-commerce)
25
+
26
+ ## MCP Tool Usage
27
+
28
+ ```json
29
+ // Basic stealth scrape
30
+ {
31
+ "tool": "stealth_mode",
32
+ "params": {
33
+ "url": "https://protected-site.com",
34
+ "engine": "playwright"
35
+ }
36
+ }
37
+
38
+ // Advanced: Camoufox engine with screenshot
39
+ {
40
+ "tool": "stealth_mode",
41
+ "params": {
42
+ "url": "https://heavily-protected-site.com",
43
+ "engine": "camoufox",
44
+ "wait_for": 3000,
45
+ "screenshot": true
46
+ }
47
+ }
48
+ ```
49
+
50
+ ## CLI Usage
51
+
52
+ ```bash
53
+ # Default engine (playwright)
54
+ crawlforge stealth https://protected-site.com
55
+
56
+ # Camoufox for advanced bot detection bypass
57
+ crawlforge stealth https://protected-site.com --engine camoufox
58
+
59
+ # Wait for JS-heavy page to render, capture screenshot
60
+ crawlforge stealth https://spa-site.com --wait 3000 --screenshot
61
+
62
+ # Output as JSON
63
+ crawlforge stealth https://example.com --json
64
+ ```
65
+
66
+ ## Engine Selection Guide
67
+
68
+ | Scenario | Recommended Engine |
69
+ |----------|-------------------|
70
+ | General JS-rendered sites | playwright |
71
+ | Cloudflare-protected sites | camoufox |
72
+ | Sites with DataDome | camoufox |
73
+ | Sites with PerimeterX | camoufox |
74
+ | Financial/trading sites | camoufox |
75
+ | Speed-critical scraping | playwright |
76
+ | Basic bot detection bypass | playwright |
77
+
78
+ ## Environment Variable
79
+
80
+ Force engine globally:
81
+ ```bash
82
+ export CRAWLFORGE_STEALTH_ENGINE=camoufox
83
+ ```
84
+
85
+ ## Combining with Other Tools
86
+
87
+ After extracting raw HTML via stealth_mode, pipe to analyze_content or extract_structured:
88
+ ```json
89
+ // Step 1: get HTML via stealth
90
+ { "tool": "stealth_mode", "params": { "url": "https://example.com" } }
91
+
92
+ // Step 2: extract structured data from the result
93
+ { "tool": "extract_structured", "params": { "url": "https://example.com", "schema": {...} } }
94
+ ```
95
+
96
+ ## Credits
97
+ - `stealth_mode`: 5 credits per call
98
+ - Additional costs for screenshots (1 extra credit per screenshot)
@@ -0,0 +1,141 @@
1
+ /**
2
+ * installer.js — Skills installer for CrawlForge.
3
+ * Installs skill markdown files into Claude Code, Cursor, or VS Code.
4
+ *
5
+ * Targets:
6
+ * claude-code — ~/.claude/skills/crawlforge-*.md (one file per skill)
7
+ * cursor — .cursor/rules/crawlforge.mdc (concatenated)
8
+ * vscode — .github/instructions/crawlforge.instructions.md (concatenated)
9
+ *
10
+ * Idempotent: skips if already installed (use --force to overwrite).
11
+ */
12
+
13
+ import { readFileSync, writeFileSync, mkdirSync, existsSync, unlinkSync } from 'node:fs';
14
+ import { join, dirname } from 'node:path';
15
+ import { homedir } from 'node:os';
16
+ import { fileURLToPath } from 'node:url';
17
+
18
+ const __filename = fileURLToPath(import.meta.url);
19
+ const __dirname = dirname(__filename);
20
+
21
+ // Skill files shipped with the package
22
+ const SKILL_FILES = [
23
+ 'crawlforge-mcp.md',
24
+ 'crawlforge-cli.md',
25
+ 'crawlforge-stealth.md',
26
+ 'crawlforge-research.md'
27
+ ];
28
+
29
+ const SKILL_DIR = __dirname; // src/skills/
30
+
31
/**
 * Load the UTF-8 text of a skill file that sits in SKILL_DIR.
 *
 * @param {string} name - File name of the skill, joined onto SKILL_DIR.
 * @returns {string} The file contents.
 * @throws {Error} When no file exists at the resolved path.
 */
function readSkillFile(name) {
  const skillPath = join(SKILL_DIR, name);
  if (existsSync(skillPath)) {
    return readFileSync(skillPath, 'utf8');
  }
  throw new Error(`Skill file not found: ${skillPath}`);
}
36
+
37
/**
 * Build one document from every entry in SKILL_FILES, in list order,
 * with a markdown horizontal rule between consecutive skills.
 *
 * @returns {string} The combined skill text.
 */
function concatenateSkills() {
  const separator = '\n\n---\n\n';
  const sections = [];
  for (const fileName of SKILL_FILES) {
    sections.push(readSkillFile(fileName));
  }
  return sections.join(separator);
}
40
+
41
/**
 * Record and (unless dry-running) write a single skill artifact.
 *
 * An existing destination is left untouched and reported under `skipped`
 * unless `force` is set. The check runs for dry runs as well, so a dry run
 * reports the same installed/skipped split a real run would.
 *
 * @param {{ installed: string[], skipped: string[], paths: string[] }} results - Accumulator, mutated in place.
 * @param {string} dest - Path of the file to write.
 * @param {() => string} getContent - Produces the file body; only invoked when a write actually happens.
 * @param {{ force: boolean, dryRun: boolean }} opts - Overwrite / preview switches.
 */
function installSkillFile(results, dest, getContent, { force, dryRun }) {
  results.paths.push(dest);
  if (existsSync(dest) && !force) {
    results.skipped.push(dest);
    return;
  }
  if (!dryRun) {
    // Create the parent directory first, then resolve the content and write it.
    mkdirSync(dirname(dest), { recursive: true });
    writeFileSync(dest, getContent(), 'utf8');
  }
  results.installed.push(dest);
}

/**
 * Install skills into the given target.
 *
 * - `claude-code`: one file per skill under `~/.claude/skills/`.
 * - `cursor`: all skills concatenated into `<cwd>/.cursor/rules/crawlforge.mdc`.
 * - `vscode`: all skills concatenated into
 *   `<cwd>/.github/instructions/crawlforge.instructions.md`.
 *
 * Existing files are skipped unless `force` is true. With `dryRun` nothing is
 * written, but `installed` / `skipped` still reflect what a real run would do.
 *
 * @param {{ target: 'claude-code'|'cursor'|'vscode'|'all', force?: boolean, dryRun?: boolean, cwd?: string }} opts
 * @returns {Promise<{ installed: string[], skipped: string[], paths: string[] }>}
 * @throws {Error} If `target` is not one of the supported values.
 */
export async function install({ target = 'all', force = false, dryRun = false, cwd = process.cwd() } = {}) {
  const targets = target === 'all' ? ['claude-code', 'cursor', 'vscode'] : [target];
  const results = { installed: [], skipped: [], paths: [] };
  const opts = { force, dryRun };

  for (const t of targets) {
    if (t === 'claude-code') {
      const skillsDir = join(homedir(), '.claude', 'skills');
      for (const fname of SKILL_FILES) {
        installSkillFile(results, join(skillsDir, fname), () => readSkillFile(fname), opts);
      }
    } else if (t === 'cursor') {
      const dest = join(cwd, '.cursor', 'rules', 'crawlforge.mdc');
      installSkillFile(results, dest, () => concatenateSkills(), opts);
    } else if (t === 'vscode') {
      const dest = join(cwd, '.github', 'instructions', 'crawlforge.instructions.md');
      installSkillFile(results, dest, () => concatenateSkills(), opts);
    } else {
      throw new Error(`Unknown target: ${t}. Valid targets: claude-code, cursor, vscode, all`);
    }
  }

  return results;
}
99
+
100
+ /**
101
+ * Uninstall skills from the given target.
102
+ * @param {{ target: 'claude-code'|'cursor'|'vscode'|'all', cwd?: string }} opts
103
+ * @returns {{ removed: string[], notFound: string[] }}
104
+ */
105
+ export async function uninstall({ target = 'all', cwd = process.cwd() } = {}) {
106
+ const targets = target === 'all' ? ['claude-code', 'cursor', 'vscode'] : [target];
107
+ const results = { removed: [], notFound: [] };
108
+
109
+ for (const t of targets) {
110
+ if (t === 'claude-code') {
111
+ const skillsDir = join(homedir(), '.claude', 'skills');
112
+ for (const fname of SKILL_FILES) {
113
+ const dest = join(skillsDir, fname);
114
+ if (existsSync(dest)) {
115
+ unlinkSync(dest);
116
+ results.removed.push(dest);
117
+ } else {
118
+ results.notFound.push(dest);
119
+ }
120
+ }
121
+ } else if (t === 'cursor') {
122
+ const dest = join(cwd, '.cursor', 'rules', 'crawlforge.mdc');
123
+ if (existsSync(dest)) {
124
+ unlinkSync(dest);
125
+ results.removed.push(dest);
126
+ } else {
127
+ results.notFound.push(dest);
128
+ }
129
+ } else if (t === 'vscode') {
130
+ const dest = join(cwd, '.github', 'instructions', 'crawlforge.instructions.md');
131
+ if (existsSync(dest)) {
132
+ unlinkSync(dest);
133
+ results.removed.push(dest);
134
+ } else {
135
+ results.notFound.push(dest);
136
+ }
137
+ }
138
+ }
139
+
140
+ return results;
141
+ }
@@ -15,6 +15,7 @@
15
15
  */
16
16
 
17
17
  import { EventEmitter } from 'events';
18
+ import { ElicitationHelper } from '../../../core/ElicitationHelper.js'; // D1.4
18
19
  import JobManager from '../../../core/JobManager.js';
19
20
  import WebhookDispatcher from '../../../core/WebhookDispatcher.js';
20
21
  import { BatchScrapeSchema } from './schema.js';
@@ -53,6 +54,8 @@ export class BatchScrapeTool extends EventEmitter {
53
54
 
54
55
  this.activeBatches = new Map();
55
56
  this.batchResults = new Map();
57
+ // D1.4: Elicitation helper (set mcpServer after instantiation if desired)
58
+ this._elicitation = new ElicitationHelper({});
56
59
 
57
60
  this.stats = {
58
61
  totalBatches: 0,
@@ -68,6 +71,14 @@ export class BatchScrapeTool extends EventEmitter {
68
71
  this._initializeJobExecutors();
69
72
  }
70
73
 
74
+ /**
75
+ * D1.4: Set the MCP server instance for elicitation support.
76
+ * @param {object} mcpServer
77
+ */
78
+ setMcpServer(mcpServer) {
79
+ this._elicitation = new ElicitationHelper({ mcpServer });
80
+ }
81
+
71
82
  async execute(params) {
72
83
  try {
73
84
  const validated = BatchScrapeSchema.parse(params);
@@ -84,6 +95,25 @@ export class BatchScrapeTool extends EventEmitter {
84
95
  webhookConfig = this._registerWebhook(validated.webhook, batchId);
85
96
  }
86
97
 
98
+ // D1.4: Elicitation — warn when batch is large in sync mode
99
+ if (validated.mode === 'sync' && urlConfigs.length > 25) {
100
+ const proceed = await this._elicitation.confirm(
101
+ `batch_scrape (sync mode) will fetch ${urlConfigs.length} URLs synchronously. This may take a while and consume significant credits.`,
102
+ {
103
+ url_count: urlConfigs.length,
104
+ mode: 'sync',
105
+ suggestion: 'Consider using mode:"async" for large batches.',
106
+ }
107
+ );
108
+ if (!proceed) {
109
+ return {
110
+ batchId, mode: 'sync', success: false,
111
+ error: 'Batch scrape cancelled by user (elicitation declined).',
112
+ totalUrls: urlConfigs.length,
113
+ };
114
+ }
115
+ }
116
+
87
117
  if (validated.mode === 'sync') {
88
118
  return await this._processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime);
89
119
  } else {
@@ -14,7 +14,7 @@ export const UrlConfigSchema = z.object({
14
14
 
15
15
  export const BatchScrapeSchema = z.object({
16
16
  urls: z.array(z.union([z.string().url(), UrlConfigSchema])).min(1).max(50),
17
- formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
17
+ formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']), // 4.2.1: aligned with server.js MCP registration default; markdown remains opt-in via formats:['markdown'] for RAG workflows
18
18
  mode: z.enum(['sync', 'async']).default('sync'),
19
19
  webhook: z.object({
20
20
  url: z.string().url(),
@@ -1,15 +1,17 @@
1
1
  /**
2
2
  * extract_text — Extract clean text content from HTML.
3
3
  * Extracted from server.js inline handler.
4
+ * D3.1: Added output_format:"markdown" option backed by Turndown.
4
5
  */
5
6
 
6
7
  import { load } from 'cheerio';
7
8
  import { fetchWithTimeout } from './_fetch.js';
9
+ import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js';
8
10
 
9
11
  /**
10
- * @param {{ url: string, remove_scripts?: boolean, remove_styles?: boolean }} params
12
+ * @param {{ url: string, remove_scripts?: boolean, remove_styles?: boolean, output_format?: "text"|"markdown" }} params
11
13
  */
12
- export async function extractTextHandler({ url, remove_scripts, remove_styles }) {
14
+ export async function extractTextHandler({ url, remove_scripts, remove_styles, output_format }) {
13
15
  try {
14
16
  const response = await fetchWithTimeout(url);
15
17
  if (!response.ok) {
@@ -26,15 +28,24 @@ export async function extractTextHandler({ url, remove_scripts, remove_styles })
26
28
 
27
29
  const text = $('body').text().replace(/\s+/g, ' ').trim();
28
30
 
31
+ const result = {
32
+ word_count: text.split(/\s+/).filter(w => w.length > 0).length,
33
+ char_count: text.length,
34
+ url: response.url
35
+ };
36
+
37
+ if (output_format === 'markdown') {
38
+ result.markdown = htmlToMarkdown($.html('body'));
39
+ result.output_format = 'markdown';
40
+ } else {
41
+ result.text = text;
42
+ result.output_format = 'text';
43
+ }
44
+
29
45
  return {
30
46
  content: [{
31
47
  type: 'text',
32
- text: JSON.stringify({
33
- text,
34
- word_count: text.split(/\s+/).filter(w => w.length > 0).length,
35
- char_count: text.length,
36
- url: response.url
37
- }, null, 2)
48
+ text: JSON.stringify(result, null, 2)
38
49
  }]
39
50
  };
40
51
  } catch (error) {
@@ -1,4 +1,5 @@
1
1
  import { z } from 'zod';
2
+ import { ElicitationHelper } from '../../core/ElicitationHelper.js'; // D1.4
2
3
  import { BFSCrawler } from '../../core/crawlers/BFSCrawler.js';
3
4
  import { DomainFilter } from '../../utils/domainFilter.js';
4
5
  import { CacheManager } from '../../core/cache/CacheManager.js';
@@ -87,6 +88,13 @@ export class CrawlDeepTool {
87
88
  this.timeout = timeout;
88
89
  // Per-session result cache: avoids redundant crawls of the same root URL
89
90
  this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
91
+ // D1.4: Elicitation helper
92
+ this._elicitation = new ElicitationHelper({});
93
+ }
94
+
95
+ /** D1.4: Wire MCP server for elicitation. Call from server.js after instantiation. */
96
+ setMcpServer(mcpServer) {
97
+ this._elicitation = new ElicitationHelper({ mcpServer });
90
98
  }
91
99
 
92
100
  async execute(params) {
@@ -100,6 +108,25 @@ export class CrawlDeepTool {
100
108
  if (cached) return cached;
101
109
  }
102
110
 
111
+ // D1.4: Elicitation — warn when max_pages is very high
112
+ if (validated.max_pages > 500) {
113
+ const proceed = await this._elicitation.confirm(
114
+ `crawl_deep will crawl up to ${validated.max_pages} pages from ${validated.url}. Large crawls consume many credits.`,
115
+ {
116
+ url: validated.url,
117
+ max_pages: validated.max_pages,
118
+ max_depth: validated.max_depth,
119
+ }
120
+ );
121
+ if (!proceed) {
122
+ return {
123
+ success: false,
124
+ error: 'Crawl cancelled by user (elicitation declined).',
125
+ url: validated.url,
126
+ };
127
+ }
128
+ }
129
+
103
130
  // Create domain filter if configuration provided
104
131
  let domainFilter = null;
105
132
  if (validated.import_filter_config) {
@@ -7,6 +7,7 @@ import { z } from 'zod';
7
7
  import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
8
8
  import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
9
9
  import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
10
+ import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
10
11
 
11
12
  const ExtractContentSchema = z.object({
12
13
  url: z.string().url(),
@@ -294,25 +295,12 @@ export class ExtractContentTool {
294
295
  }
295
296
 
296
297
  /**
297
- * Convert HTML content to Markdown
298
- * @param {string} html - HTML content
299
- * @returns {string} - Markdown content
298
+ * Convert HTML content to Markdown using Turndown (D3.1).
299
+ * @param {string} html
300
+ * @returns {string}
300
301
  */
301
302
  convertToMarkdown(html) {
302
- // Simple HTML to Markdown conversion
303
- return html
304
- .replace(/<h([1-6])[^>]*>(.*?)<\/h[1-6]>/gi, (match, level, text) => {
305
- const hashes = '#'.repeat(parseInt(level));
306
- return `\n${hashes} ${text}\n`;
307
- })
308
- .replace(/<p[^>]*>(.*?)<\/p>/gi, '\n$1\n')
309
- .replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**')
310
- .replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*')
311
- .replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)')
312
- .replace(/<br[^>]*>/gi, '\n')
313
- .replace(/<[^>]+>/g, '') // Remove remaining HTML tags
314
- .replace(/\n{3,}/g, '\n\n') // Normalize line breaks
315
- .trim();
303
+ return htmlToMarkdown(html);
316
304
  }
317
305
 
318
306
  /**
@@ -5,6 +5,7 @@
5
5
  */
6
6
 
7
7
  import { z } from 'zod';
8
+ import { ElicitationHelper } from '../../core/ElicitationHelper.js'; // D1.4
8
9
  import { load } from 'cheerio';
9
10
  import { LLMManager } from '../../core/llm/LLMManager.js';
10
11
  import { fetchAndParse } from './_fetchAndParse.js';
@@ -30,6 +31,13 @@ export class ExtractStructuredTool {
30
31
  this.llmManager = null;
31
32
  this.llmConfig = options.llmConfig || {};
32
33
  this.userAgent = 'Mozilla/5.0 (compatible; CrawlForge-MCP/3.0; ExtractStructured)';
34
+ // D1.4: Elicitation helper
35
+ this._elicitation = new ElicitationHelper({});
36
+ }
37
+
38
+ /** D1.4: Wire MCP server for elicitation. */
39
+ setMcpServer(mcpServer) {
40
+ this._elicitation = new ElicitationHelper({ mcpServer });
33
41
  }
34
42
 
35
43
  /**