crawlforge-mcp-server 3.5.1 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -4
- package/server.js +138 -26
- package/src/cli/commands/actions.js +36 -0
- package/src/cli/commands/analyze.js +19 -0
- package/src/cli/commands/batch.js +45 -0
- package/src/cli/commands/crawl.js +30 -0
- package/src/cli/commands/extract.js +45 -0
- package/src/cli/commands/install-skills.js +46 -0
- package/src/cli/commands/llmstxt.js +24 -0
- package/src/cli/commands/localize.js +29 -0
- package/src/cli/commands/map.js +26 -0
- package/src/cli/commands/monitor.js +29 -0
- package/src/cli/commands/research.js +26 -0
- package/src/cli/commands/scrape.js +37 -0
- package/src/cli/commands/search.js +28 -0
- package/src/cli/commands/stealth.js +29 -0
- package/src/cli/commands/template.js +26 -0
- package/src/cli/commands/track.js +24 -0
- package/src/cli/commands/uninstall-skills.js +35 -0
- package/src/cli/formatter.js +57 -0
- package/src/cli/index.js +94 -0
- package/src/cli/lib/runTool.js +40 -0
- package/src/core/ActionExecutor.js +8 -6
- package/src/core/AuthManager.js +103 -3
- package/src/core/ChangeTracker.js +34 -0
- package/src/core/ElicitationHelper.js +112 -0
- package/src/core/JobManager.js +36 -2
- package/src/core/LocalizationManager.js +19 -5
- package/src/core/PerformanceManager.js +53 -17
- package/src/core/ResearchOrchestrator.js +40 -5
- package/src/core/SamplingClient.js +191 -0
- package/src/core/StealthBrowserManager.js +248 -2
- package/src/core/WebhookDispatcher.js +18 -10
- package/src/prompts/PromptRegistry.js +199 -0
- package/src/resources/ResourceRegistry.js +273 -0
- package/src/server/withAuth.js +25 -0
- package/src/skills/crawlforge-cli.md +157 -0
- package/src/skills/crawlforge-mcp.md +80 -0
- package/src/skills/crawlforge-research.md +104 -0
- package/src/skills/crawlforge-stealth.md +98 -0
- package/src/skills/installer.js +141 -0
- package/src/tools/advanced/batchScrape/index.js +30 -0
- package/src/tools/advanced/batchScrape/schema.js +1 -1
- package/src/tools/basic/extractText.js +19 -8
- package/src/tools/crawl/crawlDeep.js +27 -0
- package/src/tools/extract/extractContent.js +5 -17
- package/src/tools/extract/extractStructured.js +8 -0
- package/src/tools/extract/extractWithLlm.js +25 -5
- package/src/tools/extract/processDocument.js +7 -1
- package/src/tools/extract/summarizeContent.js +17 -0
- package/src/tools/research/deepResearch.js +34 -0
- package/src/tools/templates/ScrapeTemplateTool.js +68 -0
- package/src/tools/templates/TemplateRegistry.js +311 -0
- package/src/utils/Logger.js +15 -0
- package/src/utils/htmlToMarkdown.js +54 -0
- package/src/utils/secretMask.js +86 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# CrawlForge MCP Tools — When and How to Use
|
|
2
|
+
|
|
3
|
+
CrawlForge is a professional MCP server with 23 tools for web scraping, crawling, content extraction, research, and AI compliance.
|
|
4
|
+
|
|
5
|
+
## When to Use MCP Tools vs CLI
|
|
6
|
+
|
|
7
|
+
Use MCP tools when you need results inline within an AI assistant session.
|
|
8
|
+
Use the CLI (`crawlforge <command>`) for scripts, CI, and automation pipelines.
|
|
9
|
+
|
|
10
|
+
## All 23 Tools
|
|
11
|
+
|
|
12
|
+
### Basic Fetching (5 tools)
|
|
13
|
+
- **fetch_url** — Raw HTTP fetch; returns headers + body. Use for quick single-URL fetches.
|
|
14
|
+
- **extract_text** — Clean readable text from a page (strips HTML). Use for reading articles.
|
|
15
|
+
- **extract_links** — All links from a page with anchor text. Use for link analysis.
|
|
16
|
+
- **extract_metadata** — Title, description, OG tags, schema.org from a page.
|
|
17
|
+
- **scrape_structured** — CSS-selector based data extraction from a page.
|
|
18
|
+
|
|
19
|
+
### Search (1 tool)
|
|
20
|
+
- **search_web** — Search via CrawlForge API or SearXNG. Supports query expansion, ranking, dedup.
|
|
21
|
+
|
|
22
|
+
### Crawling (2 tools)
|
|
23
|
+
- **crawl_deep** — BFS crawl up to 1000 pages with configurable depth, content extraction, link analysis.
|
|
24
|
+
- **map_site** — Fast sitemap generation via sitemap.xml or crawl. Returns URL list with metadata.
|
|
25
|
+
|
|
26
|
+
### Content Extraction (7 tools)
|
|
27
|
+
- **extract_content** — Main content extraction with Readability, markdown output, image handling.
|
|
28
|
+
- **process_document** — PDF, DOCX, TXT processing with chunking and metadata.
|
|
29
|
+
- **summarize_content** — Abstractive summarization (via Ollama/API/sampling).
|
|
30
|
+
- **analyze_content** — Sentiment, entities, readability, keyword density, topic detection.
|
|
31
|
+
- **extract_structured** — JSON schema-driven extraction with LLM or CSS selectors.
|
|
32
|
+
- **extract_with_llm** — Natural language prompt-based extraction. Fallback: Ollama → API keys → sampling.
|
|
33
|
+
- **list_ollama_models** — List locally available Ollama models.
|
|
34
|
+
|
|
35
|
+
### Advanced (2 tools)
|
|
36
|
+
- **batch_scrape** — Scrape multiple URLs concurrently. Default output format: json; pass `formats: ["markdown"]` for RAG-ready markdown.
|
|
37
|
+
- **scrape_with_actions** — Browser automation (click, type, scroll, wait) before scraping.
|
|
38
|
+
|
|
39
|
+
### Research (1 tool)
|
|
40
|
+
- **deep_research** — Multi-stage research: query expansion → parallel fetch → dedup → synthesis.
|
|
41
|
+
|
|
42
|
+
### Tracking (1 tool)
|
|
43
|
+
- **track_changes** — Snapshot URL and diff against baseline. Returns change percentage + diff.
|
|
44
|
+
|
|
45
|
+
### LLMs.txt (1 tool)
|
|
46
|
+
- **generate_llms_txt** — Generate llms.txt and llms-full.txt for AI compliance.
|
|
47
|
+
|
|
48
|
+
### Stealth (1 tool)
|
|
49
|
+
- **stealth_mode** — Anti-bot browser scraping. Engines: playwright (default) or camoufox.
|
|
50
|
+
|
|
51
|
+
### Localization (1 tool)
|
|
52
|
+
- **localization** — Fetch with locale/geo targeting, proxy routing, currency awareness.
|
|
53
|
+
|
|
54
|
+
### Templates (1 tool)
|
|
55
|
+
- **scrape_template** — Pre-built extractors for: amazon-product, linkedin-profile, github-repo, youtube-video, tweet, reddit-thread, hacker-news-front-page, producthunt-launch, stackoverflow-question, npm-package.
|
|
56
|
+
|
|
57
|
+
## Cost Reference (Credits)
|
|
58
|
+
- fetch_url, extract_text, extract_links, extract_metadata: 1 credit
|
|
59
|
+
- search_web, map_site: 2 credits
|
|
60
|
+
- extract_content, scrape_structured, analyze_content, summarize_content: 3 credits
|
|
61
|
+
- crawl_deep, batch_scrape, track_changes, generate_llms_txt: 5 credits
|
|
62
|
+
- extract_structured, extract_with_llm, stealth_mode, localization, scrape_with_actions: 5 credits
|
|
63
|
+
- deep_research: 10 base credits + 1 per URL analyzed (dynamic; triggers elicitation when the projected cost exceeds 50 credits)
|
|
64
|
+
|
|
65
|
+
## Example Tool Calls
|
|
66
|
+
|
|
67
|
+
Fetch a page:
|
|
68
|
+
```json
|
|
69
|
+
{ "tool": "fetch_url", "params": { "url": "https://example.com" } }
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Search the web:
|
|
73
|
+
```json
|
|
74
|
+
{ "tool": "search_web", "params": { "query": "MCP server Node.js", "limit": 5 } }
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Extract markdown from an article:
|
|
78
|
+
```json
|
|
79
|
+
{ "tool": "extract_content", "params": { "url": "https://example.com/article", "output_format": "markdown" } }
|
|
80
|
+
```
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# CrawlForge Deep Research Workflow
|
|
2
|
+
|
|
3
|
+
## When to Use deep_research
|
|
4
|
+
|
|
5
|
+
Use `deep_research` for comprehensive topic research that requires multiple sources:
|
|
6
|
+
- Competitive analysis (compare multiple competitors)
|
|
7
|
+
- Technology landscape research
|
|
8
|
+
- Fact-gathering with citations
|
|
9
|
+
- Market research with multiple data points
|
|
10
|
+
- Any topic requiring 5+ web sources synthesized
|
|
11
|
+
|
|
12
|
+
Do NOT use for:
|
|
13
|
+
- Single URL content extraction → use `extract_content`
|
|
14
|
+
- Simple web searches → use `search_web`
|
|
15
|
+
- Known URLs you want to read → use `fetch_url` or `batch_scrape`
|
|
16
|
+
|
|
17
|
+
## How deep_research Works
|
|
18
|
+
|
|
19
|
+
1. **Query Expansion** — Generates 3–5 related queries from your topic
|
|
20
|
+
2. **Parallel Fetching** — Fetches up to `max_urls` sources simultaneously
|
|
21
|
+
3. **URL Deduplication** — Skips already-visited URLs within the session
|
|
22
|
+
4. **Content Extraction** — Extracts clean text from each source
|
|
23
|
+
5. **Synthesis** — If Ollama/API key available: returns synthesized report; otherwise returns raw evidence for the calling LLM to synthesize
|
|
24
|
+
|
|
25
|
+
## LLM Fallback Chain
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
Ollama (local, default) → OpenAI API key → Anthropic API key → MCP Sampling → Raw evidence
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
With no LLM configured, `deep_research` returns structured raw evidence that Claude or another LLM can synthesize.
|
|
32
|
+
|
|
33
|
+
## MCP Tool Usage
|
|
34
|
+
|
|
35
|
+
```jsonc
|
|
36
|
+
// Standard research
|
|
37
|
+
{
|
|
38
|
+
"tool": "deep_research",
|
|
39
|
+
"params": {
|
|
40
|
+
"query": "React vs Vue vs Angular in 2025",
|
|
41
|
+
"depth": "standard",
|
|
42
|
+
"max_urls": 20
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// Deep research with all sources
|
|
47
|
+
{
|
|
48
|
+
"tool": "deep_research",
|
|
49
|
+
"params": {
|
|
50
|
+
"query": "competitor pricing analysis for B2B SaaS",
|
|
51
|
+
"depth": "deep",
|
|
52
|
+
"max_urls": 50,
|
|
53
|
+
"output_format": "detailed"
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Note: When `max_urls > 50`, the tool triggers an elicitation asking for confirmation before proceeding (cost guard).
|
|
59
|
+
|
|
60
|
+
## CLI Usage
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# Standard research
|
|
64
|
+
crawlforge research "React vs Vue in 2025" --depth standard
|
|
65
|
+
|
|
66
|
+
# Deep research with JSON output
|
|
67
|
+
crawlforge research "B2B SaaS pricing trends" --depth deep --max-urls 30 --json
|
|
68
|
+
|
|
69
|
+
# Save research report to file
|
|
70
|
+
crawlforge research "competitor analysis" --pretty > research-report.json
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Depth Levels
|
|
74
|
+
|
|
75
|
+
| Depth | URLs Analyzed | Use Case | Approx. Credits |
|
|
76
|
+
|-------|--------------|----------|-----------------|
|
|
77
|
+
| basic | 5–10 | Quick overview | 10–15 |
|
|
78
|
+
| standard | 15–25 | General research | 15–30 |
|
|
79
|
+
| deep | 30–75 | Comprehensive analysis | 30–75+ |
|
|
80
|
+
|
|
81
|
+
## Cost Management
|
|
82
|
+
|
|
83
|
+
- `deep_research` costs 10 base credits + 1 per URL analyzed
|
|
84
|
+
- Elicitation fires when projected cost > 50 credits
|
|
85
|
+
- Use `max_urls` to cap costs: `max_urls: 10` ≈ 20 credits max
|
|
86
|
+
- Token budget auto-limits LLM synthesis costs (default: 200,000 chars)
|
|
87
|
+
|
|
88
|
+
## Accessing Research Results as Resources
|
|
89
|
+
|
|
90
|
+
Completed research sessions are available as MCP Resources:
|
|
91
|
+
```
|
|
92
|
+
crawlforge://research/{sessionId}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
List via `resources/list` — no need to re-run the research.
|
|
96
|
+
|
|
97
|
+
## Combining with Other Tools
|
|
98
|
+
|
|
99
|
+
For targeted competitive research:
|
|
100
|
+
```
|
|
101
|
+
1. search_web "competitor X pricing" → get URLs
|
|
102
|
+
2. batch_scrape [competitor URLs] → get content in parallel
|
|
103
|
+
3. deep_research "competitor X vs us" → synthesized analysis
|
|
104
|
+
```
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# CrawlForge Stealth Mode Guide
|
|
2
|
+
|
|
3
|
+
## When to Use stealth_mode
|
|
4
|
+
|
|
5
|
+
Use `stealth_mode` when a site returns bot-detection errors, 403 responses, CAPTCHAs, or JavaScript-rendered content that `fetch_url` and `extract_content` cannot access.
|
|
6
|
+
|
|
7
|
+
Signs you need stealth mode:
|
|
8
|
+
- Site returns 403 or 429 on regular fetch
|
|
9
|
+
- Content is empty or shows "please enable JavaScript"
|
|
10
|
+
- Site uses Cloudflare, DataDome, PerimeterX, or similar bot protection
|
|
11
|
+
|
|
12
|
+
## Engines
|
|
13
|
+
|
|
14
|
+
### playwright (default)
|
|
15
|
+
- Chromium-based with stealth patches
|
|
16
|
+
- Masks webdriver fingerprints, User-Agent, navigator properties
|
|
17
|
+
- Good for most sites with basic bot detection
|
|
18
|
+
- Lower resource usage
|
|
19
|
+
|
|
20
|
+
### camoufox
|
|
21
|
+
- Firefox-based with native anti-detection
|
|
22
|
+
- No patches applied — uses Firefox's native properties
|
|
23
|
+
- Scores higher on CreepJS and DataDome than patched Chromium
|
|
24
|
+
- Use for sites with advanced fingerprinting (financial, e-commerce)
|
|
25
|
+
|
|
26
|
+
## MCP Tool Usage
|
|
27
|
+
|
|
28
|
+
```jsonc
|
|
29
|
+
// Basic stealth scrape
|
|
30
|
+
{
|
|
31
|
+
"tool": "stealth_mode",
|
|
32
|
+
"params": {
|
|
33
|
+
"url": "https://protected-site.com",
|
|
34
|
+
"engine": "playwright"
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
// Advanced: Camoufox engine with screenshot
|
|
39
|
+
{
|
|
40
|
+
"tool": "stealth_mode",
|
|
41
|
+
"params": {
|
|
42
|
+
"url": "https://heavily-protected-site.com",
|
|
43
|
+
"engine": "camoufox",
|
|
44
|
+
"wait_for": 3000,
|
|
45
|
+
"screenshot": true
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## CLI Usage
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Default engine (playwright)
|
|
54
|
+
crawlforge stealth https://protected-site.com
|
|
55
|
+
|
|
56
|
+
# Camoufox for advanced bot detection bypass
|
|
57
|
+
crawlforge stealth https://protected-site.com --engine camoufox
|
|
58
|
+
|
|
59
|
+
# Wait for JS-heavy page to render, capture screenshot
|
|
60
|
+
crawlforge stealth https://spa-site.com --wait 3000 --screenshot
|
|
61
|
+
|
|
62
|
+
# Output as JSON
|
|
63
|
+
crawlforge stealth https://example.com --json
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Engine Selection Guide
|
|
67
|
+
|
|
68
|
+
| Scenario | Recommended Engine |
|
|
69
|
+
|----------|-------------------|
|
|
70
|
+
| General JS-rendered sites | playwright |
|
|
71
|
+
| Cloudflare-protected sites | camoufox |
|
|
72
|
+
| Sites with DataDome | camoufox |
|
|
73
|
+
| Sites with PerimeterX | camoufox |
|
|
74
|
+
| Financial/trading sites | camoufox |
|
|
75
|
+
| Speed-critical scraping | playwright |
|
|
76
|
+
| Basic bot detection bypass | playwright |
|
|
77
|
+
|
|
78
|
+
## Environment Variable
|
|
79
|
+
|
|
80
|
+
Force engine globally:
|
|
81
|
+
```bash
|
|
82
|
+
export CRAWLFORGE_STEALTH_ENGINE=camoufox
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Combining with Other Tools
|
|
86
|
+
|
|
87
|
+
After extracting raw HTML via stealth_mode, pipe to analyze_content or extract_structured:
|
|
88
|
+
```jsonc
|
|
89
|
+
// Step 1: get HTML via stealth
|
|
90
|
+
{ "tool": "stealth_mode", "params": { "url": "https://example.com" } }
|
|
91
|
+
|
|
92
|
+
// Step 2: extract structured data from the result
|
|
93
|
+
{ "tool": "extract_structured", "params": { "url": "https://example.com", "schema": {...} } }
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Credits
|
|
97
|
+
- `stealth_mode`: 5 credits per call
|
|
98
|
+
- Additional costs for screenshots (1 extra credit per screenshot)
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* installer.js — Skills installer for CrawlForge.
|
|
3
|
+
* Installs skill markdown files into Claude Code, Cursor, or VS Code.
|
|
4
|
+
*
|
|
5
|
+
* Targets:
|
|
6
|
+
* claude-code — ~/.claude/skills/crawlforge-*.md (one file per skill)
|
|
7
|
+
* cursor — .cursor/rules/crawlforge.mdc (concatenated)
|
|
8
|
+
* vscode — .github/instructions/crawlforge.instructions.md (concatenated)
|
|
9
|
+
*
|
|
10
|
+
* Idempotent: skips if already installed (use --force to overwrite).
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { readFileSync, writeFileSync, mkdirSync, existsSync, unlinkSync } from 'node:fs';
|
|
14
|
+
import { join, dirname } from 'node:path';
|
|
15
|
+
import { homedir } from 'node:os';
|
|
16
|
+
import { fileURLToPath } from 'node:url';
|
|
17
|
+
|
|
18
|
+
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/** Skill files shipped with the package, in the order they are concatenated. */
const SKILL_FILES = [
  'crawlforge-mcp.md',
  'crawlforge-cli.md',
  'crawlforge-stealth.md',
  'crawlforge-research.md'
];

/** Directory that holds the skill markdown files: this module's own directory (src/skills/). */
const SKILL_DIR = __dirname;

/** Concrete targets accepted by install() and uninstall(); 'all' expands to this list. */
const VALID_TARGETS = Object.freeze(['claude-code', 'cursor', 'vscode']);

/**
 * Read one shipped skill file as UTF-8 text.
 * @param {string} name - File name relative to SKILL_DIR.
 * @returns {string} The file contents.
 * @throws {Error} If the file does not exist in SKILL_DIR.
 */
function readSkillFile(name) {
  const p = join(SKILL_DIR, name);
  if (!existsSync(p)) throw new Error(`Skill file not found: ${p}`);
  return readFileSync(p, 'utf8');
}

/**
 * Join every shipped skill file into one document, separated by a markdown rule.
 * @returns {string} The concatenated skills.
 * @throws {Error} If any skill file is missing.
 */
function concatenateSkills() {
  return SKILL_FILES.map((f) => readSkillFile(f)).join('\n\n---\n\n');
}

/**
 * Expand the user-supplied target into a list of concrete targets and validate it.
 * Shared by install() and uninstall() so both reject unknown targets the same way.
 * @param {string} target - 'claude-code', 'cursor', 'vscode' or 'all'.
 * @returns {string[]} Concrete target names.
 * @throws {Error} If the target is not recognised.
 */
function resolveTargets(target) {
  if (target === 'all') return [...VALID_TARGETS];
  if (!VALID_TARGETS.includes(target)) {
    throw new Error(`Unknown target: ${target}. Valid targets: claude-code, cursor, vscode, all`);
  }
  return [target];
}

/**
 * List the files a target owns, each paired with a lazy producer of its content.
 *   claude-code — one file per skill under ~/.claude/skills/
 *   cursor      — a single concatenated file at <cwd>/.cursor/rules/crawlforge.mdc
 *   vscode      — a single concatenated file at <cwd>/.github/instructions/crawlforge.instructions.md
 * Content is produced lazily so uninstall() and dry runs never read the skill files.
 * @param {string} target - A concrete, already validated target name.
 * @param {string} cwd - Project directory used for the cursor and vscode targets.
 * @returns {{ dest: string, render: () => string }[]}
 */
function planTarget(target, cwd) {
  if (target === 'claude-code') {
    const skillsDir = join(homedir(), '.claude', 'skills');
    return SKILL_FILES.map((fname) => ({
      dest: join(skillsDir, fname),
      render: () => readSkillFile(fname)
    }));
  }
  const dest = target === 'cursor'
    ? join(cwd, '.cursor', 'rules', 'crawlforge.mdc')
    : join(cwd, '.github', 'instructions', 'crawlforge.instructions.md');
  return [{ dest, render: concatenateSkills }];
}

/**
 * Install skills into the given target.
 *
 * Existing files are left alone and reported in `skipped` unless `force` is set.
 * With `dryRun` nothing is written, but paths are classified exactly as a real run
 * would classify them, so the result is an accurate preview.
 *
 * @param {{ target: 'claude-code'|'cursor'|'vscode'|'all', force?: boolean, dryRun?: boolean, cwd?: string }} opts
 * @returns {{ installed: string[], skipped: string[], paths: string[] }}
 * @throws {Error} If the target is unknown or a shipped skill file is missing.
 */
export async function install({ target = 'all', force = false, dryRun = false, cwd = process.cwd() } = {}) {
  const results = { installed: [], skipped: [], paths: [] };

  for (const t of resolveTargets(target)) {
    for (const { dest, render } of planTarget(t, cwd)) {
      results.paths.push(dest);

      if (existsSync(dest) && !force) {
        results.skipped.push(dest);
        continue;
      }

      if (!dryRun) {
        // Read the source first so a missing skill file does not leave an empty directory behind.
        const content = render();
        mkdirSync(dirname(dest), { recursive: true });
        writeFileSync(dest, content, 'utf8');
      }
      results.installed.push(dest);
    }
  }

  return results;
}

/**
 * Uninstall skills from the given target.
 *
 * Only the files themselves are removed; directories are left in place.
 *
 * @param {{ target: 'claude-code'|'cursor'|'vscode'|'all', cwd?: string }} opts
 * @returns {{ removed: string[], notFound: string[] }}
 * @throws {Error} If the target is unknown.
 */
export async function uninstall({ target = 'all', cwd = process.cwd() } = {}) {
  const results = { removed: [], notFound: [] };

  for (const t of resolveTargets(target)) {
    for (const { dest } of planTarget(t, cwd)) {
      if (existsSync(dest)) {
        unlinkSync(dest);
        results.removed.push(dest);
      } else {
        results.notFound.push(dest);
      }
    }
  }

  return results;
}
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
17
|
import { EventEmitter } from 'events';
|
|
18
|
+
import { ElicitationHelper } from '../../../core/ElicitationHelper.js'; // D1.4
|
|
18
19
|
import JobManager from '../../../core/JobManager.js';
|
|
19
20
|
import WebhookDispatcher from '../../../core/WebhookDispatcher.js';
|
|
20
21
|
import { BatchScrapeSchema } from './schema.js';
|
|
@@ -53,6 +54,8 @@ export class BatchScrapeTool extends EventEmitter {
|
|
|
53
54
|
|
|
54
55
|
this.activeBatches = new Map();
|
|
55
56
|
this.batchResults = new Map();
|
|
57
|
+
// D1.4: Elicitation helper (set mcpServer after instantiation if desired)
|
|
58
|
+
this._elicitation = new ElicitationHelper({});
|
|
56
59
|
|
|
57
60
|
this.stats = {
|
|
58
61
|
totalBatches: 0,
|
|
@@ -68,6 +71,14 @@ export class BatchScrapeTool extends EventEmitter {
|
|
|
68
71
|
this._initializeJobExecutors();
|
|
69
72
|
}
|
|
70
73
|
|
|
74
|
+
/**
|
|
75
|
+
* D1.4: Set the MCP server instance for elicitation support.
|
|
76
|
+
* @param {object} mcpServer
|
|
77
|
+
*/
|
|
78
|
+
setMcpServer(mcpServer) {
|
|
79
|
+
this._elicitation = new ElicitationHelper({ mcpServer });
|
|
80
|
+
}
|
|
81
|
+
|
|
71
82
|
async execute(params) {
|
|
72
83
|
try {
|
|
73
84
|
const validated = BatchScrapeSchema.parse(params);
|
|
@@ -84,6 +95,25 @@ export class BatchScrapeTool extends EventEmitter {
|
|
|
84
95
|
webhookConfig = this._registerWebhook(validated.webhook, batchId);
|
|
85
96
|
}
|
|
86
97
|
|
|
98
|
+
// D1.4: Elicitation — warn when batch is large in sync mode
|
|
99
|
+
if (validated.mode === 'sync' && urlConfigs.length > 25) {
|
|
100
|
+
const proceed = await this._elicitation.confirm(
|
|
101
|
+
`batch_scrape (sync mode) will fetch ${urlConfigs.length} URLs synchronously. This may take a while and consume significant credits.`,
|
|
102
|
+
{
|
|
103
|
+
url_count: urlConfigs.length,
|
|
104
|
+
mode: 'sync',
|
|
105
|
+
suggestion: 'Consider using mode:"async" for large batches.',
|
|
106
|
+
}
|
|
107
|
+
);
|
|
108
|
+
if (!proceed) {
|
|
109
|
+
return {
|
|
110
|
+
batchId, mode: 'sync', success: false,
|
|
111
|
+
error: 'Batch scrape cancelled by user (elicitation declined).',
|
|
112
|
+
totalUrls: urlConfigs.length,
|
|
113
|
+
};
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
87
117
|
if (validated.mode === 'sync') {
|
|
88
118
|
return await this._processBatchSync(batchId, urlConfigs, validated, webhookConfig, startTime);
|
|
89
119
|
} else {
|
|
@@ -14,7 +14,7 @@ export const UrlConfigSchema = z.object({
|
|
|
14
14
|
|
|
15
15
|
export const BatchScrapeSchema = z.object({
|
|
16
16
|
urls: z.array(z.union([z.string().url(), UrlConfigSchema])).min(1).max(50),
|
|
17
|
-
formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
|
|
17
|
+
formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']), // 4.2.1: aligned with server.js MCP registration default; markdown remains opt-in via formats:['markdown'] for RAG workflows
|
|
18
18
|
mode: z.enum(['sync', 'async']).default('sync'),
|
|
19
19
|
webhook: z.object({
|
|
20
20
|
url: z.string().url(),
|
|
@@ -1,15 +1,17 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* extract_text — Extract clean text content from HTML.
|
|
3
3
|
* Extracted from server.js inline handler.
|
|
4
|
+
* D3.1: Added output_format:"markdown" option backed by Turndown.
|
|
4
5
|
*/
|
|
5
6
|
|
|
6
7
|
import { load } from 'cheerio';
|
|
7
8
|
import { fetchWithTimeout } from './_fetch.js';
|
|
9
|
+
import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js';
|
|
8
10
|
|
|
9
11
|
/**
|
|
10
|
-
* @param {{ url: string, remove_scripts?: boolean, remove_styles?: boolean }} params
|
|
12
|
+
* @param {{ url: string, remove_scripts?: boolean, remove_styles?: boolean, output_format?: "text"|"markdown" }} params
|
|
11
13
|
*/
|
|
12
|
-
export async function extractTextHandler({ url, remove_scripts, remove_styles }) {
|
|
14
|
+
export async function extractTextHandler({ url, remove_scripts, remove_styles, output_format }) {
|
|
13
15
|
try {
|
|
14
16
|
const response = await fetchWithTimeout(url);
|
|
15
17
|
if (!response.ok) {
|
|
@@ -26,15 +28,24 @@ export async function extractTextHandler({ url, remove_scripts, remove_styles })
|
|
|
26
28
|
|
|
27
29
|
const text = $('body').text().replace(/\s+/g, ' ').trim();
|
|
28
30
|
|
|
31
|
+
const result = {
|
|
32
|
+
word_count: text.split(/\s+/).filter(w => w.length > 0).length,
|
|
33
|
+
char_count: text.length,
|
|
34
|
+
url: response.url
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
if (output_format === 'markdown') {
|
|
38
|
+
result.markdown = htmlToMarkdown($.html('body'));
|
|
39
|
+
result.output_format = 'markdown';
|
|
40
|
+
} else {
|
|
41
|
+
result.text = text;
|
|
42
|
+
result.output_format = 'text';
|
|
43
|
+
}
|
|
44
|
+
|
|
29
45
|
return {
|
|
30
46
|
content: [{
|
|
31
47
|
type: 'text',
|
|
32
|
-
text: JSON.stringify(
|
|
33
|
-
text,
|
|
34
|
-
word_count: text.split(/\s+/).filter(w => w.length > 0).length,
|
|
35
|
-
char_count: text.length,
|
|
36
|
-
url: response.url
|
|
37
|
-
}, null, 2)
|
|
48
|
+
text: JSON.stringify(result, null, 2)
|
|
38
49
|
}]
|
|
39
50
|
};
|
|
40
51
|
} catch (error) {
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { z } from 'zod';
|
|
2
|
+
import { ElicitationHelper } from '../../core/ElicitationHelper.js'; // D1.4
|
|
2
3
|
import { BFSCrawler } from '../../core/crawlers/BFSCrawler.js';
|
|
3
4
|
import { DomainFilter } from '../../utils/domainFilter.js';
|
|
4
5
|
import { CacheManager } from '../../core/cache/CacheManager.js';
|
|
@@ -87,6 +88,13 @@ export class CrawlDeepTool {
|
|
|
87
88
|
this.timeout = timeout;
|
|
88
89
|
// Per-session result cache: avoids redundant crawls of the same root URL
|
|
89
90
|
this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
|
|
91
|
+
// D1.4: Elicitation helper
|
|
92
|
+
this._elicitation = new ElicitationHelper({});
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/** D1.4: Wire MCP server for elicitation. Call from server.js after instantiation. */
|
|
96
|
+
setMcpServer(mcpServer) {
|
|
97
|
+
this._elicitation = new ElicitationHelper({ mcpServer });
|
|
90
98
|
}
|
|
91
99
|
|
|
92
100
|
async execute(params) {
|
|
@@ -100,6 +108,25 @@ export class CrawlDeepTool {
|
|
|
100
108
|
if (cached) return cached;
|
|
101
109
|
}
|
|
102
110
|
|
|
111
|
+
// D1.4: Elicitation — warn when max_pages is very high
|
|
112
|
+
if (validated.max_pages > 500) {
|
|
113
|
+
const proceed = await this._elicitation.confirm(
|
|
114
|
+
`crawl_deep will crawl up to ${validated.max_pages} pages from ${validated.url}. Large crawls consume many credits.`,
|
|
115
|
+
{
|
|
116
|
+
url: validated.url,
|
|
117
|
+
max_pages: validated.max_pages,
|
|
118
|
+
max_depth: validated.max_depth,
|
|
119
|
+
}
|
|
120
|
+
);
|
|
121
|
+
if (!proceed) {
|
|
122
|
+
return {
|
|
123
|
+
success: false,
|
|
124
|
+
error: 'Crawl cancelled by user (elicitation declined).',
|
|
125
|
+
url: validated.url,
|
|
126
|
+
};
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
103
130
|
// Create domain filter if configuration provided
|
|
104
131
|
let domainFilter = null;
|
|
105
132
|
if (validated.import_filter_config) {
|
|
@@ -7,6 +7,7 @@ import { z } from 'zod';
|
|
|
7
7
|
import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
|
|
8
8
|
import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
|
|
9
9
|
import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
|
|
10
|
+
import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
|
|
10
11
|
|
|
11
12
|
const ExtractContentSchema = z.object({
|
|
12
13
|
url: z.string().url(),
|
|
@@ -294,25 +295,12 @@ export class ExtractContentTool {
|
|
|
294
295
|
}
|
|
295
296
|
|
|
296
297
|
/**
|
|
297
|
-
* Convert HTML content to Markdown
|
|
298
|
-
* @param {string} html
|
|
299
|
-
* @returns {string}
|
|
298
|
+
* Convert HTML content to Markdown using Turndown (D3.1).
|
|
299
|
+
* @param {string} html
|
|
300
|
+
* @returns {string}
|
|
300
301
|
*/
|
|
301
302
|
convertToMarkdown(html) {
|
|
302
|
-
|
|
303
|
-
return html
|
|
304
|
-
.replace(/<h([1-6])[^>]*>(.*?)<\/h[1-6]>/gi, (match, level, text) => {
|
|
305
|
-
const hashes = '#'.repeat(parseInt(level));
|
|
306
|
-
return `\n${hashes} ${text}\n`;
|
|
307
|
-
})
|
|
308
|
-
.replace(/<p[^>]*>(.*?)<\/p>/gi, '\n$1\n')
|
|
309
|
-
.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**')
|
|
310
|
-
.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*')
|
|
311
|
-
.replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)')
|
|
312
|
-
.replace(/<br[^>]*>/gi, '\n')
|
|
313
|
-
.replace(/<[^>]+>/g, '') // Remove remaining HTML tags
|
|
314
|
-
.replace(/\n{3,}/g, '\n\n') // Normalize line breaks
|
|
315
|
-
.trim();
|
|
303
|
+
return htmlToMarkdown(html);
|
|
316
304
|
}
|
|
317
305
|
|
|
318
306
|
/**
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
7
|
import { z } from 'zod';
|
|
8
|
+
import { ElicitationHelper } from '../../core/ElicitationHelper.js'; // D1.4
|
|
8
9
|
import { load } from 'cheerio';
|
|
9
10
|
import { LLMManager } from '../../core/llm/LLMManager.js';
|
|
10
11
|
import { fetchAndParse } from './_fetchAndParse.js';
|
|
@@ -30,6 +31,13 @@ export class ExtractStructuredTool {
|
|
|
30
31
|
this.llmManager = null;
|
|
31
32
|
this.llmConfig = options.llmConfig || {};
|
|
32
33
|
this.userAgent = 'Mozilla/5.0 (compatible; CrawlForge-MCP/3.0; ExtractStructured)';
|
|
34
|
+
// D1.4: Elicitation helper
|
|
35
|
+
this._elicitation = new ElicitationHelper({});
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/** D1.4: Wire MCP server for elicitation. */
|
|
39
|
+
setMcpServer(mcpServer) {
|
|
40
|
+
this._elicitation = new ElicitationHelper({ mcpServer });
|
|
33
41
|
}
|
|
34
42
|
|
|
35
43
|
/**
|