crawlforge-mcp-server 4.2.12 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/CLAUDE.md +19 -7
  2. package/README.md +11 -3
  3. package/package.json +3 -2
  4. package/server.js +195 -22
  5. package/src/cli/commands/init.js +107 -0
  6. package/src/cli/index.js +2 -0
  7. package/src/constants/config.js +5 -0
  8. package/src/core/ActionExecutor.js +13 -1
  9. package/src/core/AgentOrchestrator.js +300 -0
  10. package/src/core/AuthManager.js +21 -1
  11. package/src/core/ChangeTracker.js +8 -5
  12. package/src/core/LLMsTxtAnalyzer.js +71 -47
  13. package/src/core/LocalizationManager.js +7 -4
  14. package/src/core/ResearchOrchestrator.js +10 -6
  15. package/src/core/StealthBrowserManager.js +52 -13
  16. package/src/core/analysis/ContentAnalyzer.js +2 -2
  17. package/src/core/crawlers/BFSCrawler.js +23 -12
  18. package/src/core/processing/ContentProcessor.js +19 -3
  19. package/src/core/processing/PDFProcessor.js +72 -23
  20. package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
  21. package/src/tools/advanced/batchScrape/index.js +3 -1
  22. package/src/tools/advanced/batchScrape/reporter.js +5 -1
  23. package/src/tools/advanced/batchScrape/worker.js +6 -1
  24. package/src/tools/agent/agent.js +71 -0
  25. package/src/tools/basic/_fetch.js +78 -5
  26. package/src/tools/basic/extractLinks.js +1 -1
  27. package/src/tools/basic/extractMetadata.js +65 -1
  28. package/src/tools/basic/extractText.js +73 -5
  29. package/src/tools/basic/scrapeStructured.js +48 -10
  30. package/src/tools/crawl/crawlDeep.js +13 -5
  31. package/src/tools/crawl/mapSite.js +53 -52
  32. package/src/tools/extract/analyzeContent.js +11 -6
  33. package/src/tools/extract/extractContent.js +23 -5
  34. package/src/tools/extract/extractStructured.js +65 -16
  35. package/src/tools/extract/extractWithLlm.js +192 -11
  36. package/src/tools/extract/listOllamaModels.js +19 -8
  37. package/src/tools/extract/processDocument.js +10 -4
  38. package/src/tools/extract/summarizeContent.js +58 -1
  39. package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
  40. package/src/tools/research/deepResearch.js +43 -4
  41. package/src/tools/scrape/unifiedScrape.js +314 -0
  42. package/src/tools/search/providers/searxng.js +2 -2
  43. package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
  44. package/src/tools/search/ranking/ResultRanker.js +13 -4
  45. package/src/tools/search/searchWeb.js +5 -5
  46. package/src/tools/templates/TemplateRegistry.js +3 -2
  47. package/src/tools/tracking/trackChanges/differ.js +33 -1
  48. package/src/utils/htmlToMarkdown.js +5 -1
package/CLAUDE.md CHANGED
@@ -60,9 +60,9 @@ These guidelines are working if: fewer unnecessary changes in diffs, fewer rewri
60
60
 
61
61
  ## Project Overview
62
62
 
63
- CrawlForge MCP Server - A professional MCP (Model Context Protocol) server providing 23 web scraping, crawling, and content processing tools (5 inline + 18 advanced).
63
+ CrawlForge MCP Server - A professional MCP (Model Context Protocol) server providing 26 web scraping, crawling, and content processing tools (5 inline + 21 advanced).
64
64
 
65
- **Current Version:** 4.2.4
65
+ **Current Version:** 4.6.0
66
66
 
67
67
  ## Development Commands
68
68
 
@@ -92,8 +92,10 @@ npm run dev
92
92
  # Test MCP protocol compliance
93
93
  npm test
94
94
 
95
- # Unit tests (262 tests, no live network)
95
+ # Unit tests (400+ tests across tests/unit/, no live network)
96
96
  npm run test:unit
97
+ # Phase D regressions live in tests/unit/phaseD-regressions.test.js (agent hard stops, unified scrape, map_site ranking)
98
+ # Run a single test file: node --test tests/unit/phaseD-regressions.test.js
97
99
  # Note: add --test-force-exit if the run appears to hang at the end — importing
98
100
  # StealthBrowserManager (d2-reliability.test.js) leaves a Playwright handle that
99
101
  # otherwise delays process exit ~100s. Tests themselves pass either way.
@@ -109,7 +111,9 @@ node test-real-world.js # Test real-world usage scenarios
109
111
  node tests/integration/mcp-protocol-compliance.test.js
110
112
 
111
113
  # CLI (v4.1.0+, requires global install or npx)
112
- crawlforge --help # Show all 15 subcommands
114
+ crawlforge --help # Show all subcommands
115
+ crawlforge init # API-key detection + skill install + idempotent MCP-stanza merge (v4.6.0)
116
+ crawlforge init --all --yes # Merge MCP config into Claude Code / Desktop / Cursor non-interactively
113
117
  crawlforge scrape https://example.com
114
118
  crawlforge batch --urls urls.txt --format markdown
115
119
  crawlforge install-skills --target claude-code
@@ -140,6 +144,7 @@ npm run docker:prod # Run production container
140
144
  - **WebhookDispatcher**: Event notification system for job completion callbacks
141
145
  - **ActionExecutor**: Browser automation engine (Playwright-based)
142
146
  - **ResearchOrchestrator**: Multi-stage research with query expansion and synthesis
147
+ - **AgentOrchestrator**: Powers the `agent` tool — NL prompt → autonomous PLAN→GATHER→ACT→DECIDE→SHAPE loop with three orchestrator-enforced hard stops (maxSteps≤10, maxUrls≤20, wall-clock) never delegated to the LLM; degraded no-LLM-key path (D2, v4.6.0)
143
148
  - **StealthBrowserManager**: Stealth mode scraping with anti-detection; Camoufox (Firefox) engine added in v4.0.0
144
149
  - **LocalizationManager**: Multi-language content and localization
145
150
  - **ChangeTracker**: Content change tracking over time
@@ -155,7 +160,9 @@ npm run docker:prod # Run production container
155
160
  Tools are organized in subdirectories by category:
156
161
 
157
162
  - `advanced/` - BatchScrapeTool, ScrapeWithActionsTool
163
+ - `agent/` - agent (AgentOrchestrator-driven autonomous tool, v4.6.0)
158
164
  - `basic/` - fetchUrl, extractText, extractLinks, extractMetadata, scrapeStructured
165
+ - `scrape/` - unifiedScrape (single-fetch multi-format `scrape` tool, v4.6.0)
159
166
  - `crawl/` - crawlDeep, mapSite
160
167
  - `extract/` - analyzeContent, extractContent, extractStructured, extractWithLlm, listOllamaModels, processDocument, summarizeContent
161
168
  - `research/` - deepResearch
@@ -164,13 +171,18 @@ Tools are organized in subdirectories by category:
164
171
  - `tracking/` - trackChanges
165
172
  - `llmstxt/` - generateLLMsTxt
166
173
 
167
- ### Available MCP Tools (23 total)
174
+ ### Available MCP Tools (26 total)
168
175
 
169
176
  **Basic Tools (server.js inline, 5):**
170
177
  fetch_url, extract_text, extract_links, extract_metadata, scrape_structured
171
178
 
172
- **Advanced Tools (18):**
173
- search_web, crawl_deep, map_site, extract_content, process_document, summarize_content, analyze_content, extract_structured, extract_with_llm, list_ollama_models, batch_scrape, scrape_with_actions, deep_research, track_changes, generate_llms_txt, stealth_mode, localization, scrape_template
179
+ **Advanced Tools (21):**
180
+ search_web, crawl_deep, map_site, extract_content, process_document, summarize_content, analyze_content, extract_structured, extract_with_llm, list_ollama_models, batch_scrape, get_batch_results, scrape_with_actions, deep_research, track_changes, generate_llms_txt, stealth_mode, localization, scrape_template, scrape, agent
181
+
182
+ **v4.6.0 additions (Phase D):**
183
+ - `scrape` — single fetch + one cheerio load dispatching a `formats` array (markdown/html/rawHtml/text/links/metadata/screenshot/json-schema) + `onlyMainContent`; partial-success via per-format `warnings[]`. Cost: 2.
184
+ - `agent` — NL prompt → autonomous research/extract, no URLs required (see AgentOrchestrator above). Cost: 8.
185
+ - `map_site` gained an optional `search=` param that ranks discovered URLs (`ranked_urls:[{url,score}]`); default output unchanged.
174
186
 
175
187
  ### MCP Server Entry Point
176
188
 
package/README.md CHANGED
@@ -9,7 +9,7 @@ Professional web scraping and content extraction server implementing the Model C
9
9
 
10
10
  ## 🎯 Features
11
11
 
12
- - **23 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis, local-LLM extraction (Ollama)
12
+ - **26 Professional Tools**: Web scraping, deep research, an autonomous `agent`, a unified multi-format `scrape`, stealth browsing, content analysis, local-LLM extraction (Ollama)
13
13
  - **Free Tier**: 1,000 credits to get started instantly
14
14
  - **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
15
15
  - **Enterprise Ready**: Scale up with paid plans for production use
@@ -37,6 +37,8 @@ This will:
37
37
 
38
38
  **Don't have an API key?** Get one free at [https://www.crawlforge.dev/signup](https://www.crawlforge.dev/signup)
39
39
 
40
+ > **One-step setup (v4.6.0+):** `crawlforge init` detects your API key, installs the agent skill, and idempotently merges the MCP config stanza into Claude Code, Claude Desktop, and Cursor. Use `crawlforge init --all --yes` to configure every detected client non-interactively.
41
+
40
42
  ### 3. Configure Your IDE (if not auto-configured)
41
43
 
42
44
  <details>
@@ -107,8 +109,10 @@ Restart Cursor to activate.
107
109
  - `extract_text` - Extract clean text from web pages
108
110
  - `extract_links` - Get all links from a page
109
111
  - `extract_metadata` - Extract page metadata
112
+ - `scrape_template` - Structured data from well-known sites (Amazon, GitHub, LinkedIn, YouTube, Reddit, Hacker News, npm, and more) without writing selectors
110
113
 
111
114
  ### Advanced Tools (2-3 credits)
115
+ - `scrape` - **Unified single-fetch, multi-format extraction.** Pass a `formats` array (markdown/html/rawHtml/text/links/metadata/screenshot/json-schema) plus `onlyMainContent`; one fetch serves every requested format with per-format partial-success warnings
112
116
  - `scrape_structured` - Extract structured data with CSS selectors
113
117
  - `search_web` - Search the web using Google Search API
114
118
  - `summarize_content` - Generate intelligent summaries
@@ -117,10 +121,12 @@ Restart Cursor to activate.
117
121
  - `extract_with_llm` - Natural-language extraction. **Defaults to a local Ollama model — no API key, no API costs.** Pass `provider: "openai" | "anthropic"` with the matching key for cloud models.
118
122
  - `list_ollama_models` - List the Ollama models installed locally (free; helps you pick a `model` for `extract_with_llm`)
119
123
  - `track_changes` - Monitor content changes over time
124
+ - `get_batch_results` - Retrieve paginated results for a `batch_scrape` job by `batchId`
120
125
 
121
126
  ### Premium Tools (5-10 credits)
127
+ - `agent` - **Autonomous research/extraction from a natural-language prompt — no URLs required.** Plans, gathers, and shapes an answer under hard safety stops (max steps/URLs/wall-clock enforced by the orchestrator, never the LLM)
122
128
  - `crawl_deep` - Deep crawl entire websites
123
- - `map_site` - Discover and map website structure
129
+ - `map_site` - Discover and map website structure (optional `search=` ranks the discovered URLs)
124
130
  - `batch_scrape` - Process multiple URLs simultaneously
125
131
  - `deep_research` - Multi-stage research with source verification
126
132
  - `stealth_mode` - Anti-detection browser management
@@ -132,6 +138,8 @@ Restart Cursor to activate.
132
138
  - `generate_llms_txt` - Generate AI interaction guidelines
133
139
  - `localization` - Multi-language and geo-location management
134
140
 
141
+ For the full canonical capabilities reference (all tools, CLI commands, stealth engines, research workflow), see [SKILL.md](SKILL.md).
142
+
135
143
  ## 💳 Pricing
136
144
 
137
145
  | Plan | Credits/Month | Best For |
@@ -142,7 +150,7 @@ Restart Cursor to activate.
142
150
  | **Enterprise** | 250,000 | Large scale operations |
143
151
 
144
152
  **All plans include:**
145
- - Access to all 23 tools
153
+ - Access to all 26 tools
146
154
  - Credits never expire and roll over month-to-month
147
155
  - API access and webhook notifications
148
156
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "crawlforge-mcp-server",
3
- "version": "4.2.12",
3
+ "version": "4.6.0",
4
4
  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
5
5
  "main": "server.js",
6
6
  "bin": {
@@ -21,7 +21,7 @@
21
21
  "test:tools": "node test-tools.js",
22
22
  "test:real-world": "node test-real-world.js",
23
23
  "test:all": "bash run-all-tests.sh",
24
- "postinstall": "echo '\n🎉 CrawlForge MCP Server installed!\n\nRun \"npx crawlforge-setup\" to configure your API key and get started.\n'",
24
+ "postinstall": "echo '\nCrawlForge MCP Server installed!\n\nQuick start: run \"npx crawlforge init\" to configure your API key, install skills, and register the MCP server with your AI clients.\nOr run \"npx crawlforge-setup\" to configure your API key only.\n'",
25
25
  "docker:build": "docker build -t crawlforge .",
26
26
  "docker:dev": "docker-compose up crawlforge-dev",
27
27
  "docker:prod": "docker-compose up crawlforge-prod"
@@ -113,6 +113,7 @@
113
113
  "playwright": "^1.54.2",
114
114
  "robots-parser": "^3.0.1",
115
115
  "turndown": "^7.2.4",
116
+ "turndown-plugin-gfm": "^1.0.2",
116
117
  "undici": "^7.24.0",
117
118
  "winston": "^3.11.0",
118
119
  "zod": "^3.23.8"
package/server.js CHANGED
@@ -24,6 +24,8 @@ import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
24
24
  import { TrackChangesTool } from "./src/tools/tracking/trackChanges/index.js";
25
25
  import { GenerateLLMsTxtTool } from "./src/tools/llmstxt/generateLLMsTxt.js";
26
26
  import { ScrapeTemplateTool } from "./src/tools/templates/ScrapeTemplateTool.js"; // D3.3
27
+ import { UnifiedScrapeTool } from "./src/tools/scrape/unifiedScrape.js"; // D4 D1
28
+ import { AgentTool } from "./src/tools/agent/agent.js"; // D4 D2
27
29
  import { StealthBrowserManager } from "./src/core/StealthBrowserManager.js";
28
30
  import { LocalizationManager } from "./src/core/LocalizationManager.js";
29
31
  import { memoryMonitor } from "./src/utils/MemoryMonitor.js";
@@ -96,8 +98,8 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
96
98
  // Create the server
97
99
  const server = new McpServer({
98
100
  name: "crawlforge",
99
- version: "4.2.6",
100
- description: "Production-ready MCP server with 23 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
101
+ version: "4.5.0",
102
+ description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
101
103
  homepage: "https://www.crawlforge.dev",
102
104
  icon: "https://www.crawlforge.dev/icon.png"
103
105
  });
@@ -111,7 +113,7 @@ server.prompt("getting-started", {
111
113
  role: "user",
112
114
  content: {
113
115
  type: "text",
114
- text: "You have access to CrawlForge MCP with 23 web scraping tools. Key tools:\n\n" +
116
+ text: "You have access to CrawlForge MCP with 26 web scraping tools. Key tools:\n\n" +
115
117
  "- fetch_url: Fetch raw HTML/content from any URL\n" +
116
118
  "- extract_text: Extract clean text from a webpage\n" +
117
119
  "- extract_content: Smart content extraction with readability\n" +
@@ -161,6 +163,8 @@ const deepResearchTool = new DeepResearchTool();
161
163
  const trackChangesTool = new TrackChangesTool();
162
164
  const generateLLMsTxtTool = new GenerateLLMsTxtTool();
163
165
  const scrapeTemplateTool = new ScrapeTemplateTool(); // D3.3
166
+ const unifiedScrapeTool = new UnifiedScrapeTool(); // D4 D1
167
+ const agentTool = new AgentTool(); // D4 D2
164
168
  const stealthBrowserManager = new StealthBrowserManager();
165
169
  const localizationManager = new LocalizationManager();
166
170
 
@@ -181,6 +185,7 @@ deepResearchTool.setMcpServer(server);
181
185
  batchScrapeTool.setMcpServer(server);
182
186
  crawlDeepTool.setMcpServer(server);
183
187
  extractStructuredTool.setMcpServer(server);
188
+ agentTool.setMcpServer(server); // D4 D2: SamplingClient + Elicitation
184
189
  AuthManager.setElicitation(elicitation);
185
190
 
186
191
  // ─── D1.1 Resource Templates (MCP Resources) ─────────────────────────────────
@@ -299,7 +304,8 @@ server.registerTool("scrape_structured", {
299
304
  annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
300
305
  inputSchema: {
301
306
  url: z.string().url().describe("The URL to scrape"),
302
- selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors")
307
+ selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors. Append @attr to extract an attribute instead of text (e.g. \"a.link@href\", \"img@src\")"),
308
+ max_results: z.number().int().min(1).optional().describe("Maximum number of matches to return per field when a selector matches multiple elements")
303
309
  }
304
310
  }, withAuth("scrape_structured", scrapeStructuredHandler));
305
311
 
@@ -315,14 +321,50 @@ server.registerTool("search_web", {
315
321
  safe_search: z.boolean().optional().describe("Enable safe search filtering"),
316
322
  time_range: z.enum(["day", "week", "month", "year", "all"]).optional().describe("Filter results by time range"),
317
323
  site: z.string().optional().describe("Limit results to a specific domain"),
318
- file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')")
324
+ file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')"),
325
+ provider: z.enum(["crawlforge", "searxng"]).optional().describe("Search backend to use"),
326
+ expand_query: z.boolean().optional().describe("Expand the query with synonyms/stemming/etc."),
327
+ expansion_options: z.object({
328
+ enableSynonyms: z.boolean().optional(),
329
+ enableSpellCheck: z.boolean().optional(),
330
+ enableStemming: z.boolean().optional(),
331
+ enablePhraseDetection: z.boolean().optional(),
332
+ enableBooleanOperators: z.boolean().optional(),
333
+ maxExpansions: z.number().min(1).max(10).optional()
334
+ }).optional().describe("Query-expansion tuning"),
335
+ enable_ranking: z.boolean().optional().describe("Re-rank results (BM25 + signals)"),
336
+ ranking_weights: z.object({
337
+ bm25: z.number().min(0).max(1).optional(),
338
+ semantic: z.number().min(0).max(1).optional(),
339
+ authority: z.number().min(0).max(1).optional(),
340
+ freshness: z.number().min(0).max(1).optional()
341
+ }).optional().describe("Relative weights for ranking signals"),
342
+ enable_deduplication: z.boolean().optional().describe("Remove near-duplicate results"),
343
+ deduplication_thresholds: z.object({
344
+ url: z.number().min(0).max(1).optional(),
345
+ title: z.number().min(0).max(1).optional(),
346
+ content: z.number().min(0).max(1).optional(),
347
+ combined: z.number().min(0).max(1).optional()
348
+ }).optional().describe("Similarity thresholds for dedup"),
349
+ include_ranking_details: z.boolean().optional().describe("Include per-result ranking breakdown"),
350
+ include_deduplication_details: z.boolean().optional().describe("Include dedup decision details"),
351
+ localization: z.object({
352
+ countryCode: z.string().length(2).optional(),
353
+ language: z.string().optional(),
354
+ timezone: z.string().optional(),
355
+ enableGeoTargeting: z.boolean().optional(),
356
+ customLocation: z.object({
357
+ latitude: z.number().min(-90).max(90),
358
+ longitude: z.number().min(-180).max(180)
359
+ }).optional()
360
+ }).optional().describe("Geo/locale targeting for results")
319
361
  }
320
- }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
362
+ }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type, provider, expand_query, expansion_options, enable_ranking, ranking_weights, enable_deduplication, deduplication_thresholds, include_ranking_details, include_deduplication_details, localization }) => {
321
363
  try {
322
364
  if (!query) {
323
365
  return { content: [{ type: "text", text: "Query parameter is required" }], isError: true };
324
366
  }
325
- const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type });
367
+ const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type, provider, expand_query, expansion_options, enable_ranking, ranking_weights, enable_deduplication, deduplication_thresholds, include_ranking_details, include_deduplication_details, localization });
326
368
  return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
327
369
  } catch (error) {
328
370
  return { content: [{ type: "text", text: `Search failed: ${error.message}` }], isError: true };
@@ -342,14 +384,38 @@ server.registerTool("crawl_deep", {
342
384
  follow_external: z.boolean().optional().describe("Follow links to external domains"),
343
385
  respect_robots: z.boolean().optional().describe("Respect robots.txt directives"),
344
386
  extract_content: z.boolean().optional().describe("Extract page content during crawl"),
345
- concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests")
387
+ content_max_length: z.number().min(1).max(100000).optional().describe("Maximum characters of page content to include per page (default 500); sets a truncated flag when trimmed"),
388
+ concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests"),
389
+ enable_link_analysis: z.boolean().optional().describe("Compute PageRank/link-graph analysis over crawled pages"),
390
+ link_analysis_options: z.object({
391
+ dampingFactor: z.number().min(0).max(1).optional(),
392
+ maxIterations: z.number().min(1).max(1000).optional(),
393
+ enableCaching: z.boolean().optional()
394
+ }).optional().describe("PageRank tuning options"),
395
+ domain_filter: z.object({
396
+ whitelist: z.array(z.any()).optional(),
397
+ blacklist: z.array(z.any()).optional(),
398
+ domain_rules: z.record(z.any()).optional()
399
+ }).optional().describe("Per-domain allow/deny lists and crawl rules"),
400
+ import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config"),
401
+ session: z.object({
402
+ enabled: z.boolean(),
403
+ persistCookies: z.boolean().optional(),
404
+ headers: z.record(z.string()).optional(),
405
+ initialRequest: z.object({
406
+ url: z.string().url(),
407
+ method: z.string().optional(),
408
+ headers: z.record(z.string()).optional(),
409
+ body: z.string().optional()
410
+ }).optional()
411
+ }).optional().describe("Shared cookie-jar/session for login-then-crawl workflows")
346
412
  }
347
- }, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
413
+ }, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, content_max_length, concurrency, enable_link_analysis, link_analysis_options, domain_filter, import_filter_config, session }) => {
348
414
  try {
349
415
  if (!url) {
350
416
  return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
351
417
  }
352
- const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency });
418
+ const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, content_max_length, concurrency, enable_link_analysis, link_analysis_options, domain_filter, import_filter_config, session });
353
419
  return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
354
420
  } catch (error) {
355
421
  return { content: [{ type: "text", text: `Crawl failed: ${error.message}` }], isError: true };
@@ -365,14 +431,22 @@ server.registerTool("map_site", {
365
431
  include_sitemap: z.boolean().optional().describe("Include sitemap.xml data in results"),
366
432
  max_urls: z.number().min(1).max(10000).optional().describe("Maximum number of URLs to discover"),
367
433
  group_by_path: z.boolean().optional().describe("Group URLs by path segments"),
368
- include_metadata: z.boolean().optional().describe("Include page metadata for each URL")
434
+ include_metadata: z.boolean().optional().describe("Include page metadata for each URL"),
435
+ domain_filter: z.object({
436
+ whitelist: z.array(z.string()).optional(),
437
+ blacklist: z.array(z.string()).optional(),
438
+ include_patterns: z.array(z.string()).optional(),
439
+ exclude_patterns: z.array(z.string()).optional()
440
+ }).optional().describe("Per-domain allow/deny lists and URL include/exclude patterns"),
441
+ import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config"),
442
+ search: z.string().optional().describe("When set, rank discovered URLs by relevance to this string and emit ranked_urls:[{url,score}]")
369
443
  }
370
- }, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
444
+ }, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config, search }) => {
371
445
  try {
372
446
  if (!url) {
373
447
  return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
374
448
  }
375
- const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata });
449
+ const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config, search });
376
450
  return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
377
451
  } catch (error) {
378
452
  return { content: [{ type: "text", text: `Site mapping failed: ${error.message}` }], isError: true };
@@ -406,7 +480,9 @@ server.registerTool("process_document", {
406
480
  inputSchema: {
407
481
  source: z.string().describe("Document source - URL or file path"),
408
482
  sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional().describe("Type of document source"),
409
- options: z.object({}).optional().describe("Additional processing options")
483
+ // C3: passthrough so granular options (maxPages, pageRange:{start,end},
484
+ // extractText, outputFormat, etc.) reach the tool instead of being stripped.
485
+ options: z.object({}).passthrough().optional().describe("Additional processing options (maxPages, pageRange:{start,end}, extractText, extractMetadata, password, outputFormat, ...)")
410
486
  }
411
487
  }, withAuth("process_document", async ({ source, sourceType, options }) => {
412
488
  try {
@@ -572,6 +648,27 @@ server.registerTool("batch_scrape", {
572
648
  }
573
649
  }));
574
650
 
651
+ // Tool: get_batch_results — C3: retrieve paginated results for a completed batch
652
+ server.registerTool("get_batch_results", {
653
+ description: "Retrieve paginated results for a completed or in-progress batch_scrape job. Use the batchId returned by batch_scrape. Example: get_batch_results({batchId: \"batch_1234567890_abc\", page: 2, pageSize: 25})",
654
+ annotations: { title: "Get Batch Results", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
655
+ inputSchema: {
656
+ batchId: z.string().describe("The batch ID returned by batch_scrape"),
657
+ page: z.number().min(1).default(1).describe("Page number (1-based)"),
658
+ pageSize: z.number().min(1).max(100).default(25).describe("Number of results per page")
659
+ }
660
+ }, withAuth("get_batch_results", async ({ batchId, page = 1, pageSize = 25 }) => {
661
+ try {
662
+ if (!batchId) {
663
+ return { content: [{ type: "text", text: "batchId parameter is required" }], isError: true };
664
+ }
665
+ const result = await batchScrapeTool.getBatchResults(batchId, page, pageSize);
666
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
667
+ } catch (error) {
668
+ return { content: [{ type: "text", text: `get_batch_results failed: ${error.message}` }], isError: true };
669
+ }
670
+ }));
671
+
575
672
  // Tool: scrape_with_actions
576
673
  server.registerTool("scrape_with_actions", {
577
674
  description: "Use this when you need to interact with a page before scraping — login, click buttons, fill forms, scroll, or wait for dynamic content to load. Use for SPAs, login-gated content, or multi-step flows. Screenshots from this tool are stored as crawlforge://screenshot/{actionId} resources. Example: scrape_with_actions({url: \"https://app.com/dashboard\", actions: [{type:\"click\",selector:\"#login\"},{type:\"type\",selector:\"#email\",text:\"user@a.com\"}]})",
@@ -586,8 +683,34 @@ server.registerTool("scrape_with_actions", {
586
683
  script: z.string().optional(),
587
684
  timeout: z.number().optional(),
588
685
  description: z.string().optional(),
589
- continueOnError: z.boolean().default(false),
590
- retries: z.number().min(0).max(5).default(0)
686
+ continueOnError: z.boolean().optional(),
687
+ retries: z.number().min(0).max(5).optional(),
688
+ captureAfter: z.boolean().optional().describe("Capture page content after this action"),
689
+ // wait
690
+ duration: z.number().min(0).max(30000).optional().describe("wait: milliseconds to wait"),
691
+ condition: z.enum(['visible', 'hidden', 'enabled', 'disabled', 'stable']).optional().describe("wait: condition on selector"),
692
+ // click
693
+ button: z.enum(['left', 'right', 'middle']).optional().describe("click: mouse button"),
694
+ clickCount: z.number().min(1).max(3).optional().describe("click: number of clicks"),
695
+ delay: z.number().min(0).max(1000).optional().describe("click/type: delay in ms"),
696
+ force: z.boolean().optional().describe("click: bypass actionability checks"),
697
+ position: z.object({ x: z.number(), y: z.number() }).optional().describe("click: relative position"),
698
+ // type
699
+ clear: z.boolean().optional().describe("type: clear field before typing"),
700
+ // press
701
+ modifiers: z.array(z.enum(['Alt', 'Control', 'Meta', 'Shift'])).optional().describe("press: modifier keys"),
702
+ // scroll
703
+ direction: z.enum(['up', 'down', 'left', 'right']).optional().describe("scroll: direction"),
704
+ distance: z.number().min(0).optional().describe("scroll: pixels to scroll"),
705
+ smooth: z.boolean().optional().describe("scroll: smooth scrolling"),
706
+ toElement: z.string().optional().describe("scroll: selector to scroll to"),
707
+ // screenshot
708
+ fullPage: z.boolean().optional().describe("screenshot: capture full page"),
709
+ quality: z.number().min(0).max(100).optional().describe("screenshot: jpeg quality"),
710
+ format: z.enum(['png', 'jpeg']).optional().describe("screenshot: image format"),
711
+ // executeJavaScript
712
+ args: z.array(z.any()).optional().describe("executeJavaScript: arguments passed to the script"),
713
+ returnResult: z.boolean().optional().describe("executeJavaScript: return the script result")
591
714
  })).min(1).max(20).describe("Browser actions to perform before scraping"),
592
715
  formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']).describe("Output formats for scraped content"),
593
716
  captureIntermediateStates: z.boolean().default(false).describe("Capture page state after each action"),
@@ -684,6 +807,53 @@ server.registerTool("deep_research", {
684
807
  }
685
808
  }));
686
809
 
810
+ // Tool: scrape (D4 D1 — unified multi-format single-fetch)
811
+ server.registerTool("scrape", {
812
+ description: "Use this when you need multiple content formats from a single URL in one call — e.g. markdown + links + metadata together. One fetch, no N-request fan-out. Formats: \"markdown\", \"html\", \"rawHtml\", \"text\", \"links\", \"metadata\", or {type:\"json\",schema,prompt} for LLM-structured extraction. onlyMainContent:true (default) strips boilerplate via Readability. Partial success: per-format warnings never fail the whole call. Example: scrape({url:\"https://example.com\", formats:[\"markdown\",\"links\",\"metadata\"]})",
813
+ annotations: { title: "Scrape (Multi-Format)", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
814
+ inputSchema: {
815
+ url: z.string().url().describe("The URL to scrape"),
816
+ formats: z.array(z.union([
817
+ z.enum(["markdown", "html", "rawHtml", "text", "links", "metadata", "screenshot"]),
818
+ z.object({
819
+ type: z.literal("json"),
820
+ schema: z.record(z.any()).optional().describe("JSON schema for extraction"),
821
+ prompt: z.string().optional().describe("Extraction instruction for the LLM")
822
+ })
823
+ ])).min(1).optional().default(["markdown"]).describe("Formats to return (default: [\"markdown\"])"),
824
+ onlyMainContent: z.boolean().optional().default(true).describe("Strip boilerplate via Readability (default: true)"),
825
+ timeoutMs: z.number().min(1000).max(60000).optional().default(15000).describe("Fetch timeout in ms")
826
+ }
827
+ }, withAuth("scrape", async (params) => {
828
+ try {
829
+ const result = await unifiedScrapeTool.execute(params);
830
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
831
+ } catch (error) {
832
+ return { content: [{ type: "text", text: `Scrape failed: ${error.message}` }], isError: true };
833
+ }
834
+ }));
835
+
836
+ // Tool: agent (D4 D2 — autonomous NL prompt → search/navigate/extract)
837
+ server.registerTool("agent", {
838
+ description: "Use this when you need an autonomous agent to research, navigate, and synthesise an answer from the web — no URLs required. The agent plans search queries, fetches and filters relevant pages, and returns a prose or structured answer. model:\"pro\" uses deep multi-source research. Hard limits: maxSteps≤10, maxUrls≤20, 120s wall-clock. Confirms before pro runs. Degraded-but-useful output if no LLM keys/Ollama. Example: agent({prompt:\"What are the top 5 MCP servers in 2025?\", maxUrls:10})",
839
+ annotations: { title: "Agent (Autonomous)", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
840
+ inputSchema: {
841
+ prompt: z.string().min(1).max(2000).describe("Natural-language task or question"),
842
+ urls: z.array(z.string().url()).max(20).optional().describe("Optional seed URLs to include (max 20)"),
843
+ schema: z.record(z.any()).optional().describe("Optional JSON schema for structured output"),
844
+ model: z.enum(["default", "pro"]).optional().default("default").describe("\"default\" = SamplingClient loop (no keys needed); \"pro\" = full ResearchOrchestrator"),
845
+ maxSteps: z.number().min(1).max(10).optional().default(5).describe("Max fetch iterations (hard cap: 10)"),
846
+ maxUrls: z.number().min(1).max(20).optional().default(10).describe("Max URLs to fetch (hard cap: 20)")
847
+ }
848
+ }, withAuth("agent", async (params) => {
849
+ try {
850
+ const result = await agentTool.execute(params);
851
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
852
+ } catch (error) {
853
+ return { content: [{ type: "text", text: `Agent failed: ${error.message}` }], isError: true };
854
+ }
855
+ }));
856
+
687
857
  // Tool: track_changes
688
858
  server.registerTool("track_changes", {
689
859
  description: "Use this when you need to monitor a URL for content changes over time — e.g. competitor pricing, regulation updates, product availability. Start with operation:\"create_baseline\", then periodically use operation:\"compare\" to diff. Supports webhooks and scheduled monitoring. Example: track_changes({url: \"https://example.com/pricing\", operation: \"create_baseline\"})",
@@ -1012,8 +1182,9 @@ server.registerTool("localization", {
1012
1182
  };
1013
1183
  break;
1014
1184
  case 'handle_geo_blocking':
1015
- if (!params.url || !params.response) throw new Error('url and response are required for handle_geo_blocking operation');
1016
- result = await localizationManager.handleGeoBlocking(params.url, params.response);
1185
+ case 'detect_geo_blocking':
1186
+ if (!params.url || !params.response) throw new Error('url and response are required for detect_geo_blocking operation');
1187
+ result = await localizationManager.detectGeoBlocking(params.url, params.response);
1017
1188
  break;
1018
1189
  case 'auto_detect':
1019
1190
  if (!params.content || !params.url) throw new Error('content and url are required for auto_detect operation');
@@ -1103,12 +1274,13 @@ async function runServer() {
1103
1274
  "fetch_url", "extract_text", "extract_links", "extract_metadata", "scrape_structured",
1104
1275
  "search_web", "crawl_deep", "map_site",
1105
1276
  "extract_content", "process_document", "summarize_content", "analyze_content",
1106
- "batch_scrape", "scrape_with_actions",
1277
+ "batch_scrape", "get_batch_results", "scrape_with_actions",
1107
1278
  "deep_research", "track_changes", "generate_llms_txt",
1108
1279
  "stealth_mode", "localization", "extract_structured", "extract_with_llm",
1109
- "scrape_template" // D3.3
1280
+ "list_ollama_models", "scrape_template", // D3.3
1281
+ "scrape", "agent" // D4
1110
1282
  ];
1111
- console.error(`Tools available (23): ${allTools.join(", ")}`);
1283
+ console.error(`Tools available (26): ${allTools.join(", ")}`);
1112
1284
 
1113
1285
  // Start memory monitoring in development
1114
1286
  if (config.server.nodeEnv === "development") {
@@ -1134,7 +1306,8 @@ async function gracefulShutdown(signal) {
1134
1306
  const toolsToCleanup = [
1135
1307
  batchScrapeTool, scrapeWithActionsTool, deepResearchTool,
1136
1308
  trackChangesTool, generateLLMsTxtTool, stealthBrowserManager,
1137
- localizationManager, extractStructuredTool
1309
+ localizationManager, extractStructuredTool,
1310
+ agentTool // D4 D2: may hold ResearchOrchestrator
1138
1311
  ].filter(tool => tool && (typeof tool.destroy === 'function' || typeof tool.cleanup === 'function'));
1139
1312
 
1140
1313
  console.error(`Cleaning up ${toolsToCleanup.length} tools...`);
@@ -0,0 +1,107 @@
1
+ /**
2
+ * init command — one-shot setup: API key check + skill install + MCP stanza merge.
3
+ */
4
+ import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
5
+ import { join } from 'node:path';
6
+ import { install } from '../../skills/installer.js';
7
+
8
+ const HOME = process.env.HOME || process.env.USERPROFILE || '';
9
+
10
/**
 * Read the API key persisted in `<HOME>/.crawlforge/config.json`.
 *
 * Only a non-blank string is accepted: the caller slices the key and embeds it
 * in client configs, so any other JSON value (number, object, whitespace-only
 * string) is reported as "no stored key" instead of being passed along.
 *
 * @returns {string|undefined} The stored key, or `undefined` when the file is
 *   missing, unreadable, not valid JSON, or holds no usable `apiKey`.
 */
function loadStoredApiKey() {
  try {
    const raw = readFileSync(join(HOME, '.crawlforge', 'config.json'), 'utf8');
    // Destructuring a non-object root (e.g. `null`) throws and is handled below.
    const { apiKey } = JSON.parse(raw);
    if (typeof apiKey !== 'string' || apiKey.trim() === '') {
      return undefined;
    }
    return apiKey;
  } catch {
    // Best effort: a missing or corrupt config simply means "no stored key".
    return undefined;
  }
}
18
+
19
/**
 * Build the MCP server entry that launches CrawlForge through `npx`.
 *
 * @param {string} [apiKey] - When truthy, exposed to the server process as the
 *   `CRAWLFORGE_API_KEY` environment variable.
 * @returns {{command: string, args: string[], env?: {CRAWLFORGE_API_KEY: string}}}
 *   The entry to store under a client's `mcpServers` map.
 */
function mcpStanza(apiKey) {
  const launch = {
    command: 'npx',
    args: ['-y', 'crawlforge@latest', 'mcp'],
  };
  // Only emit an `env` key when there is a key to pass on.
  return apiKey
    ? { ...launch, env: { CRAWLFORGE_API_KEY: apiKey } }
    : launch;
}
24
+
25
/**
 * Merge the CrawlForge MCP server entry into a client's JSON config file.
 *
 * Reads `configPath` when it exists, sets `mcpServers.crawlforge` to the entry
 * built by `mcpStanza(apiKey)`, and writes the result back as 2-space-indented
 * JSON followed by a newline. Every other key already present is preserved.
 * When the file does not exist, its parent directory is created first.
 *
 * @param {string} configPath - Path of the client config file to create or update.
 * @param {string} [apiKey] - API key forwarded to `mcpStanza`.
 * @returns {string} The `configPath` that was written.
 * @throws {Error} If the existing file is not valid JSON, is not a JSON object,
 *   or has a non-object `mcpServers` (the file is left untouched in all three
 *   cases), or if a filesystem operation fails.
 */
function mergeClientConfig(configPath, apiKey) {
  const isPlainObject = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  let existing = {};
  if (existsSync(configPath)) {
    const raw = readFileSync(configPath, 'utf8');
    // An empty file carries no settings to lose; treat it like a fresh config.
    if (raw.trim() !== '') {
      try {
        existing = JSON.parse(raw);
      } catch (err) {
        // Never overwrite a config we could not read: the user's other
        // settings would be silently destroyed.
        throw new Error(
          'existing file is not valid JSON, left untouched (' + err.message + ')',
          { cause: err },
        );
      }
      if (!isPlainObject(existing)) {
        throw new Error('existing file is not a JSON object, left untouched');
      }
    }
  } else {
    // `join(path, '..')` normalises to the parent directory with the platform's
    // own separator, so this also works for Windows paths using backslashes.
    mkdirSync(join(configPath, '..'), { recursive: true });
  }

  const servers = existing.mcpServers || {};
  if (!isPlainObject(servers)) {
    throw new Error('"mcpServers" in existing file is not an object, left untouched');
  }
  servers.crawlforge = mcpStanza(apiKey);
  existing.mcpServers = servers;

  writeFileSync(configPath, JSON.stringify(existing, null, 2) + '\n', 'utf8');
  return configPath;
}
38
+
39
/**
 * List the MCP config files to update for a client selection.
 *
 * @param {string} [client] - `claude-code`, `claude-desktop` or `cursor`.
 *   A falsy value selects every client; any other value selects none.
 * @returns {Array<{label: string, path: string}>} Matching targets, always in
 *   the order Claude Code, Claude Desktop, Cursor.
 */
function resolveClientPaths(client) {
  const isSelected = (name) => !client || client === name;
  const resolved = [];

  if (isSelected('claude-code')) {
    resolved.push({ label: 'Claude Code', path: join(HOME, '.claude.json') });
  }

  if (isSelected('claude-desktop')) {
    // Claude Desktop keeps its config in a platform-specific location.
    let desktopPath;
    switch (process.platform) {
      case 'darwin':
        desktopPath = join(HOME, 'Library', 'Application Support', 'Claude', 'claude_desktop_config.json');
        break;
      case 'win32':
        desktopPath = join(
          process.env.APPDATA || join(HOME, 'AppData', 'Roaming'),
          'Claude',
          'claude_desktop_config.json',
        );
        break;
      default:
        desktopPath = join(HOME, '.config', 'Claude', 'claude_desktop_config.json');
    }
    resolved.push({ label: 'Claude Desktop', path: desktopPath });
  }

  if (isSelected('cursor')) {
    resolved.push({ label: 'Cursor', path: join(HOME, '.cursor', 'mcp.json') });
  }

  return resolved;
}
57
+
58
/**
 * Register the `init` sub-command on the CLI program.
 *
 * When run, the command:
 *   1. looks up an API key (stored config first, then `CRAWLFORGE_API_KEY`) and
 *      exits with code 1 if none is found;
 *   2. resolves which client configs to update and exits with code 1 when the
 *      selection matches no known client, before anything is written;
 *   3. installs skills (failure is reported as a warning, not fatal);
 *   4. merges the MCP server entry into each target config (per-target
 *      failures are reported as warnings);
 *   5. exits with code 0.
 *
 * All messages go to stderr. The `--yes` flag is accepted but not consulted by
 * this handler, which never prompts.
 *
 * @param {object} program - CLI program object exposing chainable
 *   `command`/`description`/`option`/`action` methods.
 * @returns {void}
 */
export function register(program) {
  program
    .command('init')
    .description('Set up CrawlForge: verify API key, install skills, and register the MCP server with your AI clients')
    .option('--all', 'Install skills to all targets and register all detected client configs')
    .option('--client <name>', 'Target client to register: claude-code, claude-desktop, or cursor')
    .option('--yes', 'Non-interactive — assume yes to all prompts')
    .action(async (opts) => {
      const out = (msg) => process.stderr.write(msg + '\n');

      // 1. API key check
      const apiKey = loadStoredApiKey() || process.env.CRAWLFORGE_API_KEY;
      if (!apiKey) {
        out('No CrawlForge API key found.');
        out('Run: npx crawlforge-setup');
        out('Then re-run: crawlforge init');
        process.exit(1);
      }
      out('API key: found (' + apiKey.slice(0, 8) + '...)');

      // 2. Resolve targets up front so a mistyped --client fails fast, before
      //    any files are written, instead of "succeeding" with nothing registered.
      const clientFilter = opts.client || (opts.all ? undefined : 'claude-code');
      const targets = resolveClientPaths(clientFilter);
      if (targets.length === 0) {
        out('Unknown client: ' + clientFilter);
        out('Expected one of: claude-code, claude-desktop, cursor');
        process.exit(1);
      }

      // 3. Install skills
      const skillTarget = opts.all ? 'all' : 'claude-code';
      try {
        const results = await install({ target: skillTarget, force: false, cwd: process.cwd() });
        if (results.installed.length > 0) {
          out('Skills installed: ' + results.installed.length + ' file(s)');
        } else {
          out('Skills: already up to date (use crawlforge install-skills --force to overwrite)');
        }
      } catch (err) {
        out('Warning: skill install failed — ' + err.message);
      }

      // 4. MCP stanza merge — one failing client must not block the others.
      for (const { label, path: cfgPath } of targets) {
        try {
          mergeClientConfig(cfgPath, apiKey);
          out('MCP registered: ' + label + ' (' + cfgPath + ')');
        } catch (err) {
          out('Warning: could not update ' + label + ' config — ' + err.message);
        }
      }

      out('Done. Restart your AI client to pick up the crawlforge MCP server.');
      process.exit(0);
    });
}
package/src/cli/index.js CHANGED
@@ -58,6 +58,7 @@ import { register as registerTemplate } from './commands/template.js';
58
58
  import { register as registerMonitor } from './commands/monitor.js';
59
59
  import { register as registerInstallSkills } from './commands/install-skills.js';
60
60
  import { register as registerUninstallSkills } from './commands/uninstall-skills.js';
61
+ import { register as registerInit } from './commands/init.js';
61
62
 
62
63
  // ─── MCP stdio server mode (backward compatibility) ──────────────────────────
63
64
  // Before v4.1.0 the `crawlforge` bin WAS the MCP server. v4.1.0 turned it into
@@ -136,6 +137,7 @@ registerTemplate(program);
136
137
  registerMonitor(program);
137
138
  registerInstallSkills(program);
138
139
  registerUninstallSkills(program);
140
+ registerInit(program);
139
141
 
140
142
  // `crawlforge mcp` / `crawlforge serve` — explicitly start the MCP server over
141
143
  // stdio. Extra args (e.g. --http) are read directly by server.js from argv.
@@ -15,6 +15,11 @@ export const config = {
15
15
  apiBaseUrl: resolveApiEndpoint(process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev')
16
16
  },
17
17
 
18
+ // Fetch body-size cap
19
+ fetch: {
20
+ maxBodySize: parseInt(process.env.MAX_FETCH_BODY_SIZE || String(25 * 1024 * 1024)) // 25 MB
21
+ },
22
+
18
23
  // Performance
19
24
  performance: {
20
25
  maxWorkers: parseInt(process.env.MAX_WORKERS || '10'),