npm - crawlforge-mcp-server - Versions diffs - 4.2.12 → 4.6.0 - Mend

crawlforge-mcp-server 4.2.12 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)

package/CLAUDE.md +19 -7
package/README.md +11 -3
package/package.json +3 -2
package/server.js +195 -22
package/src/cli/commands/init.js +107 -0
package/src/cli/index.js +2 -0
package/src/constants/config.js +5 -0
package/src/core/ActionExecutor.js +13 -1
package/src/core/AgentOrchestrator.js +300 -0
package/src/core/AuthManager.js +21 -1
package/src/core/ChangeTracker.js +8 -5
package/src/core/LLMsTxtAnalyzer.js +71 -47
package/src/core/LocalizationManager.js +7 -4
package/src/core/ResearchOrchestrator.js +10 -6
package/src/core/StealthBrowserManager.js +52 -13
package/src/core/analysis/ContentAnalyzer.js +2 -2
package/src/core/crawlers/BFSCrawler.js +23 -12
package/src/core/processing/ContentProcessor.js +19 -3
package/src/core/processing/PDFProcessor.js +72 -23
package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
package/src/tools/advanced/batchScrape/index.js +3 -1
package/src/tools/advanced/batchScrape/reporter.js +5 -1
package/src/tools/advanced/batchScrape/worker.js +6 -1
package/src/tools/agent/agent.js +71 -0
package/src/tools/basic/_fetch.js +78 -5
package/src/tools/basic/extractLinks.js +1 -1
package/src/tools/basic/extractMetadata.js +65 -1
package/src/tools/basic/extractText.js +73 -5
package/src/tools/basic/scrapeStructured.js +48 -10
package/src/tools/crawl/crawlDeep.js +13 -5
package/src/tools/crawl/mapSite.js +53 -52
package/src/tools/extract/analyzeContent.js +11 -6
package/src/tools/extract/extractContent.js +23 -5
package/src/tools/extract/extractStructured.js +65 -16
package/src/tools/extract/extractWithLlm.js +192 -11
package/src/tools/extract/listOllamaModels.js +19 -8
package/src/tools/extract/processDocument.js +10 -4
package/src/tools/extract/summarizeContent.js +58 -1
package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
package/src/tools/research/deepResearch.js +43 -4
package/src/tools/scrape/unifiedScrape.js +314 -0
package/src/tools/search/providers/searxng.js +2 -2
package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
package/src/tools/search/ranking/ResultRanker.js +13 -4
package/src/tools/search/searchWeb.js +5 -5
package/src/tools/templates/TemplateRegistry.js +3 -2
package/src/tools/tracking/trackChanges/differ.js +33 -1
package/src/utils/htmlToMarkdown.js +5 -1

package/CLAUDE.md CHANGED Viewed

@@ -60,9 +60,9 @@ These guidelines are working if: fewer unnecessary changes in diffs, fewer rewri
 ## Project Overview
-CrawlForge MCP Server - A professional MCP (Model Context Protocol) server providing 23 web scraping, crawling, and content processing tools (5 inline + 18 advanced).
+CrawlForge MCP Server - A professional MCP (Model Context Protocol) server providing 26 web scraping, crawling, and content processing tools (5 inline + 21 advanced).
-**Current Version:** 4.2.4
+**Current Version:** 4.6.0
 ## Development Commands
@@ -92,8 +92,10 @@ npm run dev
 # Test MCP protocol compliance
 npm test
-# Unit tests (262 tests, no live network)
+# Unit tests (400+ tests across tests/unit/, no live network)
 npm run test:unit
+# Phase D regressions live in tests/unit/phaseD-regressions.test.js (agent hard stops, unified scrape, map_site ranking)
+# Run a single test file:  node --test tests/unit/phaseD-regressions.test.js
 # Note: add --test-force-exit if the run appears to hang at the end — importing
 # StealthBrowserManager (d2-reliability.test.js) leaves a Playwright handle that
 # otherwise delays process exit ~100s. Tests themselves pass either way.
@@ -109,7 +111,9 @@ node test-real-world.js        # Test real-world usage scenarios
 node tests/integration/mcp-protocol-compliance.test.js
 # CLI (v4.1.0+, requires global install or npx)
-crawlforge --help              # Show all 15 subcommands
+crawlforge --help              # Show all subcommands
+crawlforge init                # API-key detection + skill install + idempotent MCP-stanza merge (v4.6.0)
+crawlforge init --all --yes    # Merge MCP config into Claude Code / Desktop / Cursor non-interactively
 crawlforge scrape https://example.com
 crawlforge batch --urls urls.txt --format markdown
 crawlforge install-skills --target claude-code
@@ -140,6 +144,7 @@ npm run docker:prod         # Run production container
 - **WebhookDispatcher**: Event notification system for job completion callbacks
 - **ActionExecutor**: Browser automation engine (Playwright-based)
 - **ResearchOrchestrator**: Multi-stage research with query expansion and synthesis
+- **AgentOrchestrator**: Powers the `agent` tool — NL prompt → autonomous PLAN→GATHER→ACT→DECIDE→SHAPE loop with three orchestrator-enforced hard stops (maxSteps≤10, maxUrls≤20, wall-clock) never delegated to the LLM; degraded no-LLM-key path (D2, v4.6.0)
 - **StealthBrowserManager**: Stealth mode scraping with anti-detection; Camoufox (Firefox) engine added in v4.0.0
 - **LocalizationManager**: Multi-language content and localization
 - **ChangeTracker**: Content change tracking over time
@@ -155,7 +160,9 @@ npm run docker:prod         # Run production container
 Tools are organized in subdirectories by category:
 - `advanced/` - BatchScrapeTool, ScrapeWithActionsTool
+- `agent/` - agent (AgentOrchestrator-driven autonomous tool, v4.6.0)
 - `basic/` - fetchUrl, extractText, extractLinks, extractMetadata, scrapeStructured
+- `scrape/` - unifiedScrape (single-fetch multi-format `scrape` tool, v4.6.0)
 - `crawl/` - crawlDeep, mapSite
 - `extract/` - analyzeContent, extractContent, extractStructured, extractWithLlm, listOllamaModels, processDocument, summarizeContent
 - `research/` - deepResearch
@@ -164,13 +171,18 @@ Tools are organized in subdirectories by category:
 - `tracking/` - trackChanges
 - `llmstxt/` - generateLLMsTxt
-### Available MCP Tools (23 total)
+### Available MCP Tools (26 total)
 **Basic Tools (server.js inline, 5):**
 fetch_url, extract_text, extract_links, extract_metadata, scrape_structured
-**Advanced Tools (18):**
-search_web, crawl_deep, map_site, extract_content, process_document, summarize_content, analyze_content, extract_structured, extract_with_llm, list_ollama_models, batch_scrape, scrape_with_actions, deep_research, track_changes, generate_llms_txt, stealth_mode, localization, scrape_template
+**Advanced Tools (21):**
+search_web, crawl_deep, map_site, extract_content, process_document, summarize_content, analyze_content, extract_structured, extract_with_llm, list_ollama_models, batch_scrape, scrape_with_actions, deep_research, track_changes, generate_llms_txt, stealth_mode, localization, scrape_template, scrape, agent
+**v4.6.0 additions (Phase D):**
+- `scrape` — single fetch + one cheerio load dispatching a `formats` array (markdown/html/rawHtml/text/links/metadata/screenshot/json-schema) + `onlyMainContent`; partial-success via per-format `warnings[]`. Cost: 2.
+- `agent` — NL prompt → autonomous research/extract, no URLs required (see AgentOrchestrator above). Cost: 8.
+- `map_site` gained an optional `search=` param that ranks discovered URLs (`ranked_urls:[{url,score}]`); default output unchanged.
 ### MCP Server Entry Point

package/README.md CHANGED Viewed

@@ -9,7 +9,7 @@ Professional web scraping and content extraction server implementing the Model C
 ## 🎯 Features
-- **23 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis, local-LLM extraction (Ollama)
+- **26 Professional Tools**: Web scraping, deep research, an autonomous `agent`, a unified multi-format `scrape`, stealth browsing, content analysis, local-LLM extraction (Ollama)
 - **Free Tier**: 1,000 credits to get started instantly
 - **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
 - **Enterprise Ready**: Scale up with paid plans for production use
@@ -37,6 +37,8 @@ This will:
 **Don't have an API key?** Get one free at [https://www.crawlforge.dev/signup](https://www.crawlforge.dev/signup)
+> **One-step setup (v4.6.0+):** `crawlforge init` detects your API key, installs the agent skill, and idempotently merges the MCP config stanza into Claude Code, Claude Desktop, and Cursor. Use `crawlforge init --all --yes` to configure every detected client non-interactively.
 ### 3. Configure Your IDE (if not auto-configured)
 <details>
@@ -107,8 +109,10 @@ Restart Cursor to activate.
 - `extract_text` - Extract clean text from web pages
 - `extract_links` - Get all links from a page
 - `extract_metadata` - Extract page metadata
+- `scrape_template` - Structured data from well-known sites (Amazon, GitHub, LinkedIn, YouTube, Reddit, Hacker News, npm, and more) without writing selectors
 ### Advanced Tools (2-3 credits)
+- `scrape` - **Unified single-fetch, multi-format extraction.** Pass a `formats` array (markdown/html/rawHtml/text/links/metadata/screenshot/json-schema) plus `onlyMainContent`; one fetch serves every requested format with per-format partial-success warnings
 - `scrape_structured` - Extract structured data with CSS selectors
 - `search_web` - Search the web using Google Search API
 - `summarize_content` - Generate intelligent summaries
@@ -117,10 +121,12 @@ Restart Cursor to activate.
 - `extract_with_llm` - Natural-language extraction. **Defaults to a local Ollama model — no API key, no API costs.** Pass `provider: "openai" | "anthropic"` with the matching key for cloud models.
 - `list_ollama_models` - List the Ollama models installed locally (free; helps you pick a `model` for `extract_with_llm`)
 - `track_changes` - Monitor content changes over time
+- `get_batch_results` - Retrieve paginated results for a `batch_scrape` job by `batchId`
 ### Premium Tools (5-10 credits)
+- `agent` - **Autonomous research/extraction from a natural-language prompt — no URLs required.** Plans, gathers, and shapes an answer under hard safety stops (max steps/URLs/wall-clock enforced by the orchestrator, never the LLM)
 - `crawl_deep` - Deep crawl entire websites
-- `map_site` - Discover and map website structure
+- `map_site` - Discover and map website structure (optional `search=` ranks the discovered URLs)
 - `batch_scrape` - Process multiple URLs simultaneously
 - `deep_research` - Multi-stage research with source verification
 - `stealth_mode` - Anti-detection browser management
@@ -132,6 +138,8 @@ Restart Cursor to activate.
 - `generate_llms_txt` - Generate AI interaction guidelines
 - `localization` - Multi-language and geo-location management
+For the full canonical capabilities reference (all tools, CLI commands, stealth engines, research workflow), see [SKILL.md](SKILL.md).
 ## 💳 Pricing
 | Plan | Credits/Month | Best For |
@@ -142,7 +150,7 @@ Restart Cursor to activate.
 | **Enterprise** | 250,000 | Large scale operations |
 **All plans include:**
-- Access to all 23 tools
+- Access to all 26 tools
 - Credits never expire and roll over month-to-month
 - API access and webhook notifications

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "crawlforge-mcp-server",
-  "version": "4.2.12",
+  "version": "4.6.0",
   "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
   "main": "server.js",
   "bin": {
@@ -21,7 +21,7 @@
     "test:tools": "node test-tools.js",
     "test:real-world": "node test-real-world.js",
     "test:all": "bash run-all-tests.sh",
-    "postinstall": "echo '\n🎉 CrawlForge MCP Server installed!\n\nRun \"npx crawlforge-setup\" to configure your API key and get started.\n'",
+    "postinstall": "echo '\nCrawlForge MCP Server installed!\n\nQuick start: run \"npx crawlforge init\" to configure your API key, install skills, and register the MCP server with your AI clients.\nOr run \"npx crawlforge-setup\" to configure your API key only.\n'",
     "docker:build": "docker build -t crawlforge .",
     "docker:dev": "docker-compose up crawlforge-dev",
     "docker:prod": "docker-compose up crawlforge-prod"
@@ -113,6 +113,7 @@
     "playwright": "^1.54.2",
     "robots-parser": "^3.0.1",
     "turndown": "^7.2.4",
+    "turndown-plugin-gfm": "^1.0.2",
     "undici": "^7.24.0",
     "winston": "^3.11.0",
     "zod": "^3.23.8"

package/server.js CHANGED Viewed

@@ -24,6 +24,8 @@ import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
 import { TrackChangesTool } from "./src/tools/tracking/trackChanges/index.js";
 import { GenerateLLMsTxtTool } from "./src/tools/llmstxt/generateLLMsTxt.js";
 import { ScrapeTemplateTool } from "./src/tools/templates/ScrapeTemplateTool.js"; // D3.3
+import { UnifiedScrapeTool } from "./src/tools/scrape/unifiedScrape.js"; // D4 D1
+import { AgentTool } from "./src/tools/agent/agent.js"; // D4 D2
 import { StealthBrowserManager } from "./src/core/StealthBrowserManager.js";
 import { LocalizationManager } from "./src/core/LocalizationManager.js";
 import { memoryMonitor } from "./src/utils/MemoryMonitor.js";
@@ -96,8 +98,8 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
 // Create the server
 const server = new McpServer({
   name: "crawlforge",
-  version: "4.2.6",
-  description: "Production-ready MCP server with 23 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
+  version: "4.5.0",
+  description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
   homepage: "https://www.crawlforge.dev",
   icon: "https://www.crawlforge.dev/icon.png"
 });
@@ -111,7 +113,7 @@ server.prompt("getting-started", {
       role: "user",
       content: {
         type: "text",
-        text: "You have access to CrawlForge MCP with 23 web scraping tools. Key tools:\n\n" +
+        text: "You have access to CrawlForge MCP with 26 web scraping tools. Key tools:\n\n" +
           "- fetch_url: Fetch raw HTML/content from any URL\n" +
           "- extract_text: Extract clean text from a webpage\n" +
           "- extract_content: Smart content extraction with readability\n" +
@@ -161,6 +163,8 @@ const deepResearchTool = new DeepResearchTool();
 const trackChangesTool = new TrackChangesTool();
 const generateLLMsTxtTool = new GenerateLLMsTxtTool();
 const scrapeTemplateTool = new ScrapeTemplateTool(); // D3.3
+const unifiedScrapeTool = new UnifiedScrapeTool(); // D4 D1
+const agentTool = new AgentTool(); // D4 D2
 const stealthBrowserManager = new StealthBrowserManager();
 const localizationManager = new LocalizationManager();
@@ -181,6 +185,7 @@ deepResearchTool.setMcpServer(server);
 batchScrapeTool.setMcpServer(server);
 crawlDeepTool.setMcpServer(server);
 extractStructuredTool.setMcpServer(server);
+agentTool.setMcpServer(server); // D4 D2: SamplingClient + Elicitation
 AuthManager.setElicitation(elicitation);
 // ─── D1.1 Resource Templates (MCP Resources) ─────────────────────────────────
@@ -299,7 +304,8 @@ server.registerTool("scrape_structured", {
   annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The URL to scrape"),
-    selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors")
+    selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors. Append @attr to extract an attribute instead of text (e.g. \"a.link@href\", \"img@src\")"),
+    max_results: z.number().int().min(1).optional().describe("Maximum number of matches to return per field when a selector matches multiple elements")
   }
 }, withAuth("scrape_structured", scrapeStructuredHandler));
@@ -315,14 +321,50 @@ server.registerTool("search_web", {
     safe_search: z.boolean().optional().describe("Enable safe search filtering"),
     time_range: z.enum(["day", "week", "month", "year", "all"]).optional().describe("Filter results by time range"),
     site: z.string().optional().describe("Limit results to a specific domain"),
-    file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')")
+    file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')"),
+    provider: z.enum(["crawlforge", "searxng"]).optional().describe("Search backend to use"),
+    expand_query: z.boolean().optional().describe("Expand the query with synonyms/stemming/etc."),
+    expansion_options: z.object({
+      enableSynonyms: z.boolean().optional(),
+      enableSpellCheck: z.boolean().optional(),
+      enableStemming: z.boolean().optional(),
+      enablePhraseDetection: z.boolean().optional(),
+      enableBooleanOperators: z.boolean().optional(),
+      maxExpansions: z.number().min(1).max(10).optional()
+    }).optional().describe("Query-expansion tuning"),
+    enable_ranking: z.boolean().optional().describe("Re-rank results (BM25 + signals)"),
+    ranking_weights: z.object({
+      bm25: z.number().min(0).max(1).optional(),
+      semantic: z.number().min(0).max(1).optional(),
+      authority: z.number().min(0).max(1).optional(),
+      freshness: z.number().min(0).max(1).optional()
+    }).optional().describe("Relative weights for ranking signals"),
+    enable_deduplication: z.boolean().optional().describe("Remove near-duplicate results"),
+    deduplication_thresholds: z.object({
+      url: z.number().min(0).max(1).optional(),
+      title: z.number().min(0).max(1).optional(),
+      content: z.number().min(0).max(1).optional(),
+      combined: z.number().min(0).max(1).optional()
+    }).optional().describe("Similarity thresholds for dedup"),
+    include_ranking_details: z.boolean().optional().describe("Include per-result ranking breakdown"),
+    include_deduplication_details: z.boolean().optional().describe("Include dedup decision details"),
+    localization: z.object({
+      countryCode: z.string().length(2).optional(),
+      language: z.string().optional(),
+      timezone: z.string().optional(),
+      enableGeoTargeting: z.boolean().optional(),
+      customLocation: z.object({
+        latitude: z.number().min(-90).max(90),
+        longitude: z.number().min(-180).max(180)
+      }).optional()
+    }).optional().describe("Geo/locale targeting for results")
   }
-}, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
+}, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type, provider, expand_query, expansion_options, enable_ranking, ranking_weights, enable_deduplication, deduplication_thresholds, include_ranking_details, include_deduplication_details, localization }) => {
   try {
     if (!query) {
       return { content: [{ type: "text", text: "Query parameter is required" }], isError: true };
     }
-    const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type });
+    const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type, provider, expand_query, expansion_options, enable_ranking, ranking_weights, enable_deduplication, deduplication_thresholds, include_ranking_details, include_deduplication_details, localization });
     return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
   } catch (error) {
     return { content: [{ type: "text", text: `Search failed: ${error.message}` }], isError: true };
@@ -342,14 +384,38 @@ server.registerTool("crawl_deep", {
     follow_external: z.boolean().optional().describe("Follow links to external domains"),
     respect_robots: z.boolean().optional().describe("Respect robots.txt directives"),
     extract_content: z.boolean().optional().describe("Extract page content during crawl"),
-    concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests")
+    content_max_length: z.number().min(1).max(100000).optional().describe("Maximum characters of page content to include per page (default 500); sets a truncated flag when trimmed"),
+    concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests"),
+    enable_link_analysis: z.boolean().optional().describe("Compute PageRank/link-graph analysis over crawled pages"),
+    link_analysis_options: z.object({
+      dampingFactor: z.number().min(0).max(1).optional(),
+      maxIterations: z.number().min(1).max(1000).optional(),
+      enableCaching: z.boolean().optional()
+    }).optional().describe("PageRank tuning options"),
+    domain_filter: z.object({
+      whitelist: z.array(z.any()).optional(),
+      blacklist: z.array(z.any()).optional(),
+      domain_rules: z.record(z.any()).optional()
+    }).optional().describe("Per-domain allow/deny lists and crawl rules"),
+    import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config"),
+    session: z.object({
+      enabled: z.boolean(),
+      persistCookies: z.boolean().optional(),
+      headers: z.record(z.string()).optional(),
+      initialRequest: z.object({
+        url: z.string().url(),
+        method: z.string().optional(),
+        headers: z.record(z.string()).optional(),
+        body: z.string().optional()
+      }).optional()
+    }).optional().describe("Shared cookie-jar/session for login-then-crawl workflows")
   }
-}, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
+}, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, content_max_length, concurrency, enable_link_analysis, link_analysis_options, domain_filter, import_filter_config, session }) => {
   try {
     if (!url) {
       return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
     }
-    const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency });
+    const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, content_max_length, concurrency, enable_link_analysis, link_analysis_options, domain_filter, import_filter_config, session });
     return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
   } catch (error) {
     return { content: [{ type: "text", text: `Crawl failed: ${error.message}` }], isError: true };
@@ -365,14 +431,22 @@ server.registerTool("map_site", {
     include_sitemap: z.boolean().optional().describe("Include sitemap.xml data in results"),
     max_urls: z.number().min(1).max(10000).optional().describe("Maximum number of URLs to discover"),
     group_by_path: z.boolean().optional().describe("Group URLs by path segments"),
-    include_metadata: z.boolean().optional().describe("Include page metadata for each URL")
+    include_metadata: z.boolean().optional().describe("Include page metadata for each URL"),
+    domain_filter: z.object({
+      whitelist: z.array(z.string()).optional(),
+      blacklist: z.array(z.string()).optional(),
+      include_patterns: z.array(z.string()).optional(),
+      exclude_patterns: z.array(z.string()).optional()
+    }).optional().describe("Per-domain allow/deny lists and URL include/exclude patterns"),
+    import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config"),
+    search: z.string().optional().describe("When set, rank discovered URLs by relevance to this string and emit ranked_urls:[{url,score}]")
   }
-}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
+}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config, search }) => {
   try {
     if (!url) {
       return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
     }
-    const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata });
+    const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config, search });
     return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
   } catch (error) {
     return { content: [{ type: "text", text: `Site mapping failed: ${error.message}` }], isError: true };
@@ -406,7 +480,9 @@ server.registerTool("process_document", {
   inputSchema: {
     source: z.string().describe("Document source - URL or file path"),
     sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional().describe("Type of document source"),
-    options: z.object({}).optional().describe("Additional processing options")
+    // C3: passthrough so granular options (maxPages, pageRange:{start,end},
+    // extractText, outputFormat, etc.) reach the tool instead of being stripped.
+    options: z.object({}).passthrough().optional().describe("Additional processing options (maxPages, pageRange:{start,end}, extractText, extractMetadata, password, outputFormat, ...)")
   }
 }, withAuth("process_document", async ({ source, sourceType, options }) => {
   try {
@@ -572,6 +648,27 @@ server.registerTool("batch_scrape", {
   }
 }));
+// Tool: get_batch_results — C3: retrieve paginated results for a completed batch
+server.registerTool("get_batch_results", {
+  description: "Retrieve paginated results for a completed or in-progress batch_scrape job. Use the batchId returned by batch_scrape. Example: get_batch_results({batchId: \"batch_1234567890_abc\", page: 2, pageSize: 25})",
+  annotations: { title: "Get Batch Results", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
+  inputSchema: {
+    batchId: z.string().describe("The batch ID returned by batch_scrape"),
+    page: z.number().min(1).default(1).describe("Page number (1-based)"),
+    pageSize: z.number().min(1).max(100).default(25).describe("Number of results per page")
+  }
+}, withAuth("get_batch_results", async ({ batchId, page = 1, pageSize = 25 }) => {
+  try {
+    if (!batchId) {
+      return { content: [{ type: "text", text: "batchId parameter is required" }], isError: true };
+    }
+    const result = await batchScrapeTool.getBatchResults(batchId, page, pageSize);
+    return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
+  } catch (error) {
+    return { content: [{ type: "text", text: `get_batch_results failed: ${error.message}` }], isError: true };
+  }
+}));
 // Tool: scrape_with_actions
 server.registerTool("scrape_with_actions", {
   description: "Use this when you need to interact with a page before scraping — login, click buttons, fill forms, scroll, or wait for dynamic content to load. Use for SPAs, login-gated content, or multi-step flows. Screenshots from this tool are stored as crawlforge://screenshot/{actionId} resources. Example: scrape_with_actions({url: \"https://app.com/dashboard\", actions: [{type:\"click\",selector:\"#login\"},{type:\"type\",selector:\"#email\",text:\"user@a.com\"}]})",
@@ -586,8 +683,34 @@ server.registerTool("scrape_with_actions", {
       script: z.string().optional(),
       timeout: z.number().optional(),
       description: z.string().optional(),
-      continueOnError: z.boolean().default(false),
-      retries: z.number().min(0).max(5).default(0)
+      continueOnError: z.boolean().optional(),
+      retries: z.number().min(0).max(5).optional(),
+      captureAfter: z.boolean().optional().describe("Capture page content after this action"),
+      // wait
+      duration: z.number().min(0).max(30000).optional().describe("wait: milliseconds to wait"),
+      condition: z.enum(['visible', 'hidden', 'enabled', 'disabled', 'stable']).optional().describe("wait: condition on selector"),
+      // click
+      button: z.enum(['left', 'right', 'middle']).optional().describe("click: mouse button"),
+      clickCount: z.number().min(1).max(3).optional().describe("click: number of clicks"),
+      delay: z.number().min(0).max(1000).optional().describe("click/type: delay in ms"),
+      force: z.boolean().optional().describe("click: bypass actionability checks"),
+      position: z.object({ x: z.number(), y: z.number() }).optional().describe("click: relative position"),
+      // type
+      clear: z.boolean().optional().describe("type: clear field before typing"),
+      // press
+      modifiers: z.array(z.enum(['Alt', 'Control', 'Meta', 'Shift'])).optional().describe("press: modifier keys"),
+      // scroll
+      direction: z.enum(['up', 'down', 'left', 'right']).optional().describe("scroll: direction"),
+      distance: z.number().min(0).optional().describe("scroll: pixels to scroll"),
+      smooth: z.boolean().optional().describe("scroll: smooth scrolling"),
+      toElement: z.string().optional().describe("scroll: selector to scroll to"),
+      // screenshot
+      fullPage: z.boolean().optional().describe("screenshot: capture full page"),
+      quality: z.number().min(0).max(100).optional().describe("screenshot: jpeg quality"),
+      format: z.enum(['png', 'jpeg']).optional().describe("screenshot: image format"),
+      // executeJavaScript
+      args: z.array(z.any()).optional().describe("executeJavaScript: arguments passed to the script"),
+      returnResult: z.boolean().optional().describe("executeJavaScript: return the script result")
     })).min(1).max(20).describe("Browser actions to perform before scraping"),
     formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']).describe("Output formats for scraped content"),
     captureIntermediateStates: z.boolean().default(false).describe("Capture page state after each action"),
@@ -684,6 +807,53 @@ server.registerTool("deep_research", {
   }
 }));
+// Tool: scrape (D4 D1 — unified multi-format single-fetch)
+// Registers the "scrape" tool. The handler hands the validated params to
+// unifiedScrapeTool.execute() and returns the result as pretty-printed JSON text;
+// any thrown error is reported as an isError text response instead of propagating.
+// NOTE(review): the formats enum accepts "screenshot" but the description does not
+// list it — confirm which of the two is intended.
+server.registerTool("scrape", {
+  description: "Use this when you need multiple content formats from a single URL in one call — e.g. markdown + links + metadata together. One fetch, no N-request fan-out. Formats: \"markdown\", \"html\", \"rawHtml\", \"text\", \"links\", \"metadata\", or {type:\"json\",schema,prompt} for LLM-structured extraction. onlyMainContent:true (default) strips boilerplate via Readability. Partial success: per-format warnings never fail the whole call. Example: scrape({url:\"https://example.com\", formats:[\"markdown\",\"links\",\"metadata\"]})",
+  annotations: {
+    title: "Scrape (Multi-Format)",
+    readOnlyHint: true,
+    destructiveHint: false,
+    idempotentHint: true,
+    openWorldHint: true
+  },
+  inputSchema: {
+    url: z.string().url().describe("The URL to scrape"),
+    formats: z
+      .array(
+        z.union([
+          // Plain string formats.
+          z.enum(["markdown", "html", "rawHtml", "text", "links", "metadata", "screenshot"]),
+          // Object form for LLM-structured extraction.
+          z.object({
+            type: z.literal("json"),
+            schema: z.record(z.any()).optional().describe("JSON schema for extraction"),
+            prompt: z.string().optional().describe("Extraction instruction for the LLM")
+          })
+        ])
+      )
+      .min(1)
+      .optional()
+      .default(["markdown"])
+      .describe("Formats to return (default: [\"markdown\"])"),
+    onlyMainContent: z
+      .boolean()
+      .optional()
+      .default(true)
+      .describe("Strip boilerplate via Readability (default: true)"),
+    timeoutMs: z
+      .number()
+      .min(1000)
+      .max(60000)
+      .optional()
+      .default(15000)
+      .describe("Fetch timeout in ms")
+  }
+}, withAuth("scrape", async (args) => {
+  try {
+    const payload = await unifiedScrapeTool.execute(args);
+    // Serialisation stays inside the try so a non-serialisable result is
+    // reported through the same error path.
+    const text = JSON.stringify(payload, null, 2);
+    return { content: [{ type: "text", text }] };
+  } catch (err) {
+    const text = `Scrape failed: ${err.message}`;
+    return { content: [{ type: "text", text }], isError: true };
+  }
+}));
+// Tool: agent (D4 D2 — autonomous NL prompt → search/navigate/extract)
+// Registers the "agent" tool. The handler forwards the validated params to
+// agentTool.execute() and returns the result as pretty-printed JSON text; any
+// thrown error is reported as an isError text response instead of propagating.
+server.registerTool("agent", {
+  description: "Use this when you need an autonomous agent to research, navigate, and synthesise an answer from the web — no URLs required. The agent plans search queries, fetches and filters relevant pages, and returns a prose or structured answer. model:\"pro\" uses deep multi-source research. Hard limits: maxSteps≤10, maxUrls≤20, 120s wall-clock. Confirms before pro runs. Degraded-but-useful output if no LLM keys/Ollama. Example: agent({prompt:\"What are the top 5 MCP servers in 2025?\", maxUrls:10})",
+  annotations: {
+    title: "Agent (Autonomous)",
+    readOnlyHint: true,
+    destructiveHint: false,
+    idempotentHint: false,
+    openWorldHint: true
+  },
+  inputSchema: {
+    prompt: z
+      .string()
+      .min(1)
+      .max(2000)
+      .describe("Natural-language task or question"),
+    urls: z
+      .array(z.string().url())
+      .max(20)
+      .optional()
+      .describe("Optional seed URLs to include (max 20)"),
+    schema: z
+      .record(z.any())
+      .optional()
+      .describe("Optional JSON schema for structured output"),
+    model: z
+      .enum(["default", "pro"])
+      .optional()
+      .default("default")
+      .describe("\"default\" = SamplingClient loop (no keys needed); \"pro\" = full ResearchOrchestrator"),
+    maxSteps: z
+      .number()
+      .min(1)
+      .max(10)
+      .optional()
+      .default(5)
+      .describe("Max fetch iterations (hard cap: 10)"),
+    maxUrls: z
+      .number()
+      .min(1)
+      .max(20)
+      .optional()
+      .default(10)
+      .describe("Max URLs to fetch (hard cap: 20)")
+  }
+}, withAuth("agent", async (args) => {
+  try {
+    const outcome = await agentTool.execute(args);
+    // Serialisation stays inside the try so a non-serialisable result is
+    // reported through the same error path.
+    const text = JSON.stringify(outcome, null, 2);
+    return { content: [{ type: "text", text }] };
+  } catch (err) {
+    const text = `Agent failed: ${err.message}`;
+    return { content: [{ type: "text", text }], isError: true };
+  }
+}));
 // Tool: track_changes
 server.registerTool("track_changes", {
   description: "Use this when you need to monitor a URL for content changes over time — e.g. competitor pricing, regulation updates, product availability. Start with operation:\"create_baseline\", then periodically use operation:\"compare\" to diff. Supports webhooks and scheduled monitoring. Example: track_changes({url: \"https://example.com/pricing\", operation: \"create_baseline\"})",
@@ -1012,8 +1182,9 @@ server.registerTool("localization", {
         };
         break;
       case 'handle_geo_blocking':
-        if (!params.url || !params.response) throw new Error('url and response are required for handle_geo_blocking operation');
-        result = await localizationManager.handleGeoBlocking(params.url, params.response);
+      case 'detect_geo_blocking':
+        if (!params.url || !params.response) throw new Error('url and response are required for detect_geo_blocking operation');
+        result = await localizationManager.detectGeoBlocking(params.url, params.response);
         break;
       case 'auto_detect':
         if (!params.content || !params.url) throw new Error('content and url are required for auto_detect operation');
@@ -1103,12 +1274,13 @@ async function runServer() {
     "fetch_url", "extract_text", "extract_links", "extract_metadata", "scrape_structured",
     "search_web", "crawl_deep", "map_site",
     "extract_content", "process_document", "summarize_content", "analyze_content",
-    "batch_scrape", "scrape_with_actions",
+    "batch_scrape", "get_batch_results", "scrape_with_actions",
     "deep_research", "track_changes", "generate_llms_txt",
     "stealth_mode", "localization", "extract_structured", "extract_with_llm",
-    "scrape_template"  // D3.3
+    "list_ollama_models", "scrape_template", // D3.3
+    "scrape", "agent"  // D4
   ];
-  console.error(`Tools available (23): ${allTools.join(", ")}`);
+  console.error(`Tools available (26): ${allTools.join(", ")}`);
   // Start memory monitoring in development
   if (config.server.nodeEnv === "development") {
@@ -1134,7 +1306,8 @@ async function gracefulShutdown(signal) {
     const toolsToCleanup = [
       batchScrapeTool, scrapeWithActionsTool, deepResearchTool,
       trackChangesTool, generateLLMsTxtTool, stealthBrowserManager,
-      localizationManager, extractStructuredTool
+      localizationManager, extractStructuredTool,
+      agentTool // D4 D2: may hold ResearchOrchestrator
     ].filter(tool => tool && (typeof tool.destroy === 'function' || typeof tool.cleanup === 'function'));
     console.error(`Cleaning up ${toolsToCleanup.length} tools...`);

package/src/cli/commands/init.js ADDED Viewed

@@ -0,0 +1,107 @@
+/**
+ * init command — one-shot setup: API key check + skill install + MCP stanza merge.
+ */
+import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'node:fs';
+import { homedir } from 'node:os';
+import { dirname, join } from 'node:path';
+import { install } from '../../skills/installer.js';
+// Base directory for every per-user config path built in this file. The final
+// fallback is os.homedir() rather than '' so that, when neither environment
+// variable is set, join(HOME, ...) never resolves relative to the current
+// working directory.
+const HOME = process.env.HOME || process.env.USERPROFILE || homedir();
+/**
+ * Read the API key stored in `<HOME>/.crawlforge/config.json`.
+ *
+ * Best-effort: a missing, unreadable or unparseable file is treated as
+ * "no key stored".
+ *
+ * @returns {string|undefined} The stored `apiKey` value, or undefined when the
+ *   file cannot be read/parsed or the value is absent or falsy.
+ */
+function loadStoredApiKey() {
+  try {
+    const configFile = join(HOME, '.crawlforge', 'config.json');
+    const { apiKey } = JSON.parse(readFileSync(configFile, 'utf8'));
+    return apiKey ? apiKey : undefined;
+  } catch {
+    return undefined;
+  }
+}
+/**
+ * Build the MCP server entry written under `mcpServers.crawlforge`.
+ *
+ * @param {string} [apiKey] - When truthy, it is added as
+ *   `env.CRAWLFORGE_API_KEY`; otherwise no `env` key is emitted.
+ * @returns {{command: string, args: string[], env?: {CRAWLFORGE_API_KEY: string}}}
+ */
+function mcpStanza(apiKey) {
+  const base = { command: 'npx', args: ['-y', 'crawlforge@latest', 'mcp'] };
+  return apiKey ? { ...base, env: { CRAWLFORGE_API_KEY: apiKey } } : base;
+}
+/**
+ * Merge the crawlforge MCP server entry into a client's JSON config file.
+ *
+ * Every other key of an existing config is preserved. A missing or blank file
+ * starts from an empty config, and missing parent directories are created.
+ *
+ * @param {string} configPath - Path of the client config file to update.
+ * @param {string} [apiKey] - API key passed through to mcpStanza().
+ * @returns {string} The configPath that was written.
+ * @throws {Error} If the existing file is not valid JSON or does not hold a
+ *   JSON object. The file is left untouched in that case instead of being
+ *   overwritten, so the user's other settings are never silently discarded.
+ */
+function mergeClientConfig(configPath, apiKey) {
+  let existing = {};
+  if (existsSync(configPath)) {
+    const raw = readFileSync(configPath, 'utf8');
+    // A blank file is treated like a missing one; anything else must parse.
+    if (raw.trim() !== '') {
+      try {
+        existing = JSON.parse(raw);
+      } catch (err) {
+        throw new Error(
+          `existing config is not valid JSON; left unchanged (${err.message})`,
+          { cause: err }
+        );
+      }
+    }
+    if (existing === null || typeof existing !== 'object' || Array.isArray(existing)) {
+      throw new Error('existing config is not a JSON object; left unchanged');
+    }
+  } else {
+    // dirname() understands the platform separator, so the parent directory is
+    // also found for backslash-separated Windows paths.
+    mkdirSync(dirname(configPath), { recursive: true });
+  }
+  existing.mcpServers = existing.mcpServers || {};
+  existing.mcpServers.crawlforge = mcpStanza(apiKey);
+  writeFileSync(configPath, JSON.stringify(existing, null, 2) + '\n', 'utf8');
+  return configPath;
+}
+/**
+ * List the client config files to update.
+ *
+ * @param {string} [client] - 'claude-code', 'claude-desktop' or 'cursor'.
+ *   A falsy value selects all three; any other value selects none.
+ * @returns {{label: string, path: string}[]} Matching targets, in the order
+ *   Claude Code, Claude Desktop, Cursor.
+ */
+function resolveClientPaths(client) {
+  const selected = (name) => !client || client === name;
+  const found = [];
+
+  if (selected('claude-code')) {
+    found.push({ label: 'Claude Code', path: join(HOME, '.claude.json') });
+  }
+
+  if (selected('claude-desktop')) {
+    const fileName = 'claude_desktop_config.json';
+    let desktopPath;
+    switch (process.platform) {
+      case 'darwin':
+        desktopPath = join(HOME, 'Library', 'Application Support', 'Claude', fileName);
+        break;
+      case 'win32':
+        // Prefer %APPDATA%; otherwise fall back to the Roaming folder under HOME.
+        desktopPath = join(process.env.APPDATA || join(HOME, 'AppData', 'Roaming'), 'Claude', fileName);
+        break;
+      default:
+        desktopPath = join(HOME, '.config', 'Claude', fileName);
+    }
+    found.push({ label: 'Claude Desktop', path: desktopPath });
+  }
+
+  if (selected('cursor')) {
+    found.push({ label: 'Cursor', path: join(HOME, '.cursor', 'mcp.json') });
+  }
+
+  return found;
+}
+/**
+ * Register the `init` command on the CLI program.
+ *
+ * The command action runs these steps, writing all progress to stderr:
+ *   1. Look up the API key (stored config first, then CRAWLFORGE_API_KEY);
+ *      exit with code 1 if none is found.
+ *   2. Reject an unrecognised --client value with exit code 1.
+ *   3. Install skills (a failure is reported as a warning only).
+ *   4. Merge the crawlforge MCP entry into each targeted client config
+ *      (a per-client failure is reported as a warning only).
+ * It then exits with code 0.
+ *
+ * @param {object} program - CLI program object exposing
+ *   command()/description()/option()/action() (presumably a commander
+ *   Command — confirm against the caller).
+ */
+export function register(program) {
+  program
+    .command('init')
+    .description('Set up CrawlForge: verify API key, install skills, and register the MCP server with your AI clients')
+    .option('--all', 'Install skills to all targets and register all detected client configs')
+    .option('--client <name>', 'Target client to register: claude-code, claude-desktop, or cursor')
+    .option('--yes', 'Non-interactive — assume yes to all prompts')
+    .action(async (opts) => {
+      const out = (msg) => process.stderr.write(msg + '\n');
+      // 1. API key check
+      const apiKey = loadStoredApiKey() || process.env.CRAWLFORGE_API_KEY;
+      if (!apiKey) {
+        out('No CrawlForge API key found.');
+        out('Run: npx crawlforge-setup');
+        out('Then re-run: crawlforge init');
+        process.exit(1);
+      }
+      out('API key: found (' + apiKey.slice(0, 8) + '...)');
+      // 2. Validate --client before doing any work: an unknown name resolves to
+      //    no targets, which would otherwise register nothing yet report success.
+      if (opts.client && resolveClientPaths(opts.client).length === 0) {
+        out('Unknown client: ' + opts.client);
+        out('Expected one of: claude-code, claude-desktop, cursor');
+        process.exit(1);
+      }
+      // 3. Install skills
+      const skillTarget = opts.all ? 'all' : 'claude-code';
+      try {
+        const results = await install({ target: skillTarget, force: false, cwd: process.cwd() });
+        if (results.installed.length > 0) {
+          out('Skills installed: ' + results.installed.length + ' file(s)');
+        } else {
+          out('Skills: already up to date (use crawlforge install-skills --force to overwrite)');
+        }
+      } catch (err) {
+        out('Warning: skill install failed — ' + err.message);
+      }
+      // 4. MCP stanza merge. An explicit --client wins; otherwise --all targets
+      //    every client and the default is Claude Code only.
+      const clientFilter = opts.client || (opts.all ? undefined : 'claude-code');
+      const targets = resolveClientPaths(clientFilter);
+      for (const { label, path: cfgPath } of targets) {
+        try {
+          mergeClientConfig(cfgPath, apiKey);
+          out('MCP registered: ' + label + ' (' + cfgPath + ')');
+        } catch (err) {
+          out('Warning: could not update ' + label + ' config — ' + err.message);
+        }
+      }
+      out('Done. Restart your AI client to pick up the crawlforge MCP server.');
+      process.exit(0);
+    });
+}

package/src/cli/index.js CHANGED Viewed

@@ -58,6 +58,7 @@ import { register as registerTemplate } from './commands/template.js';
 import { register as registerMonitor } from './commands/monitor.js';
 import { register as registerInstallSkills } from './commands/install-skills.js';
 import { register as registerUninstallSkills } from './commands/uninstall-skills.js';
+import { register as registerInit } from './commands/init.js';
 // ─── MCP stdio server mode (backward compatibility) ──────────────────────────
 // Before v4.1.0 the `crawlforge` bin WAS the MCP server. v4.1.0 turned it into
@@ -136,6 +137,7 @@ registerTemplate(program);
 registerMonitor(program);
 registerInstallSkills(program);
 registerUninstallSkills(program);
+registerInit(program);
 // `crawlforge mcp` / `crawlforge serve` — explicitly start the MCP server over
 // stdio. Extra args (e.g. --http) are read directly by server.js from argv.

package/src/constants/config.js CHANGED Viewed

@@ -15,6 +15,11 @@ export const config = {
     apiBaseUrl: resolveApiEndpoint(process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev')
   },
+  // Fetch body-size cap
+  fetch: {
+    maxBodySize: parseInt(process.env.MAX_FETCH_BODY_SIZE || String(25 * 1024 * 1024)) // 25 MB
+  },
   // Performance
   performance: {
     maxWorkers: parseInt(process.env.MAX_WORKERS || '10'),