npm - crawlforge-mcp-server - Versions diffs - 3.4.0 → 4.2.1 - Mend

crawlforge-mcp-server 3.4.0 → 4.2.1

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (59)

package/README.md +28 -2
package/package.json +6 -4
package/server.js +166 -32
package/src/cli/commands/actions.js +36 -0
package/src/cli/commands/analyze.js +19 -0
package/src/cli/commands/batch.js +45 -0
package/src/cli/commands/crawl.js +30 -0
package/src/cli/commands/extract.js +45 -0
package/src/cli/commands/install-skills.js +46 -0
package/src/cli/commands/llmstxt.js +24 -0
package/src/cli/commands/localize.js +29 -0
package/src/cli/commands/map.js +26 -0
package/src/cli/commands/monitor.js +29 -0
package/src/cli/commands/research.js +26 -0
package/src/cli/commands/scrape.js +37 -0
package/src/cli/commands/search.js +28 -0
package/src/cli/commands/stealth.js +29 -0
package/src/cli/commands/template.js +26 -0
package/src/cli/commands/track.js +24 -0
package/src/cli/commands/uninstall-skills.js +35 -0
package/src/cli/formatter.js +57 -0
package/src/cli/index.js +94 -0
package/src/cli/lib/runTool.js +40 -0
package/src/core/ActionExecutor.js +8 -6
package/src/core/AuthManager.js +103 -3
package/src/core/ChangeTracker.js +34 -0
package/src/core/ElicitationHelper.js +112 -0
package/src/core/JobManager.js +36 -2
package/src/core/LocalizationManager.js +19 -5
package/src/core/PerformanceManager.js +53 -17
package/src/core/ResearchOrchestrator.js +40 -5
package/src/core/SamplingClient.js +191 -0
package/src/core/StealthBrowserManager.js +248 -2
package/src/core/WebhookDispatcher.js +18 -10
package/src/prompts/PromptRegistry.js +199 -0
package/src/resources/ResourceRegistry.js +273 -0
package/src/server/transports/streamableHttp.js +6 -6
package/src/server/withAuth.js +25 -0
package/src/skills/crawlforge-cli.md +157 -0
package/src/skills/crawlforge-mcp.md +80 -0
package/src/skills/crawlforge-research.md +104 -0
package/src/skills/crawlforge-stealth.md +98 -0
package/src/skills/installer.js +141 -0
package/src/tools/advanced/batchScrape/index.js +30 -0
package/src/tools/advanced/batchScrape/schema.js +1 -1
package/src/tools/basic/extractText.js +19 -8
package/src/tools/crawl/crawlDeep.js +27 -0
package/src/tools/extract/extractContent.js +5 -17
package/src/tools/extract/extractStructured.js +8 -0
package/src/tools/extract/extractWithLlm.js +35 -25
package/src/tools/extract/listOllamaModels.js +66 -0
package/src/tools/extract/processDocument.js +7 -1
package/src/tools/extract/summarizeContent.js +17 -0
package/src/tools/research/deepResearch.js +34 -0
package/src/tools/templates/ScrapeTemplateTool.js +68 -0
package/src/tools/templates/TemplateRegistry.js +311 -0
package/src/utils/Logger.js +15 -0
package/src/utils/htmlToMarkdown.js +54 -0
package/src/utils/secretMask.js +86 -0

package/README.md CHANGED Viewed

@@ -9,7 +9,7 @@ Professional web scraping and content extraction server implementing the Model C
 ## 🎯 Features
-- **20 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis
+- **22 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis, local-LLM extraction (Ollama)
 - **Free Tier**: 1,000 credits to get started instantly
 - **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
 - **Enterprise Ready**: Scale up with paid plans for production use
@@ -112,6 +112,8 @@ Restart Cursor to activate.
 - `summarize_content` - Generate intelligent summaries
 - `analyze_content` - Comprehensive content analysis
 - `extract_structured` - LLM-powered schema-driven extraction
+- `extract_with_llm` - Natural-language extraction. **Defaults to a local Ollama model — no API key, no API costs.** Pass `provider: "openai" | "anthropic"` with the matching key for cloud models.
+- `list_ollama_models` - List the Ollama models installed locally (free; helps you pick a `model` for `extract_with_llm`)
 - `track_changes` - Monitor content changes over time
 ### Premium Tools (5-10 credits)
@@ -138,7 +140,7 @@ Restart Cursor to activate.
 | **Enterprise** | 250,000 | Large scale operations |
 **All plans include:**
-- Access to all 20 tools
+- Access to all 22 tools
 - Credits never expire and roll over month-to-month
 - API access and webhook notifications
@@ -155,6 +157,30 @@ export CRAWLFORGE_API_KEY="cf_live_your_api_key_here"
 # Optional: Custom API endpoint (for enterprise)
 export CRAWLFORGE_API_URL="https://api.crawlforge.dev"
 # As of v3.0.18, this variable is validated against an allow-list of CrawlForge backend hosts.
+# Optional: Local LLM (Ollama) overrides — extract_with_llm defaults to Ollama
+export OLLAMA_BASE_URL="http://localhost:11434"   # default
+export OLLAMA_DEFAULT_MODEL="llama3.2"             # default; any locally-pulled model name works
+# Optional: Cloud LLM keys — only needed when you pass provider: "openai" or "anthropic"
+export OPENAI_API_KEY="sk-..."
+export ANTHROPIC_API_KEY="sk-ant-..."
+```
+### Local-LLM quickstart (`extract_with_llm` with Ollama)
+`extract_with_llm` defaults to a local Ollama model — no API key, no API costs, no data leaving your machine.
+```bash
+# 1. Install Ollama:  https://ollama.com
+# 2. Pull any model from https://ollama.com/library
+ollama pull llama3.2
+# 3. Discover what's installed (from your MCP client)
+#    list_ollama_models()
+# 4. Extract — defaults to Ollama with the model from step 2
+#    extract_with_llm({ url: "https://example.com", prompt: "…", model: "llama3.2" })
 ```
 ### Manual Configuration

package/package.json CHANGED Viewed

@@ -1,10 +1,10 @@
 {
   "name": "crawlforge-mcp-server",
-  "version": "3.4.0",
-  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 21 comprehensive web scraping, crawling, and content processing tools.",
+  "version": "4.2.1",
+  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
   "main": "server.js",
   "bin": {
-    "crawlforge": "server.js",
+    "crawlforge": "src/cli/index.js",
     "crawlforge-setup": "setup.js"
   },
   "scripts": {
@@ -19,7 +19,7 @@
     "test:tools": "node test-tools.js",
     "test:real-world": "node test-real-world.js",
     "test:all": "bash run-all-tests.sh",
-    "postinstall": "echo '\n🎉 CrawlForge MCP Server installed!\n\nRun \"npx crawlforge-setup\" to configure your API key and get started.\n'",
+    "postinstall": "echo '\n\ud83c\udf89 CrawlForge MCP Server installed!\n\nRun \"npx crawlforge-setup\" to configure your API key and get started.\n'",
     "docker:build": "docker build -t crawlforge .",
     "docker:dev": "docker-compose up crawlforge-dev",
     "docker:prod": "docker-compose up crawlforge-prod"
@@ -96,6 +96,7 @@
     "@modelcontextprotocol/sdk": "^1.29.0",
     "@mozilla/readability": "^0.6.0",
     "cheerio": "^1.1.2",
+    "commander": "^12.1.0",
     "compromise": "^14.14.4",
     "diff": "^8.0.2",
     "dotenv": "^17.2.1",
@@ -109,6 +110,7 @@
     "pdf-parse": "^1.1.1",
     "playwright": "^1.54.2",
     "robots-parser": "^3.0.1",
+    "turndown": "^7.2.4",
     "winston": "^3.11.0",
     "zod": "^3.23.8"
   },

package/server.js CHANGED Viewed

@@ -5,7 +5,7 @@
 export { isCreatorModeVerified } from './src/core/creatorMode.js';
 // Import everything else
-import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
 import { z } from "zod";
 import { logger } from "./src/utils/Logger.js";
 import { SearchWebTool } from "./src/tools/search/searchWeb.js";
@@ -17,11 +17,13 @@ import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
 import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
 import { ExtractStructuredTool } from "./src/tools/extract/extractStructured.js";
 import { ExtractWithLlm } from "./src/tools/extract/extractWithLlm.js";
+import { ListOllamaModelsTool } from "./src/tools/extract/listOllamaModels.js";
 import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
 import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
 import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
 import { TrackChangesTool } from "./src/tools/tracking/trackChanges/index.js";
 import { GenerateLLMsTxtTool } from "./src/tools/llmstxt/generateLLMsTxt.js";
+import { ScrapeTemplateTool } from "./src/tools/templates/ScrapeTemplateTool.js"; // D3.3
 import { StealthBrowserManager } from "./src/core/StealthBrowserManager.js";
 import { LocalizationManager } from "./src/core/LocalizationManager.js";
 import { memoryMonitor } from "./src/utils/MemoryMonitor.js";
@@ -42,6 +44,10 @@ import { extractTextHandler } from "./src/tools/basic/extractText.js";
 import { extractLinksHandler } from "./src/tools/basic/extractLinks.js";
 import { extractMetadataHandler } from "./src/tools/basic/extractMetadata.js";
 import { scrapeStructuredHandler } from "./src/tools/basic/scrapeStructured.js";
+// D1.1 Resources + D1.2 Prompts + D1.4 Elicitation
+import { ResourceRegistry } from "./src/resources/ResourceRegistry.js";
+import { PROMPTS, getPromptMessages } from "./src/prompts/PromptRegistry.js";
+import { ElicitationHelper } from "./src/core/ElicitationHelper.js";
 // Initialize Authentication Manager
 await AuthManager.initialize();
@@ -89,8 +95,8 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
 // Create the server
 const server = new McpServer({
   name: "crawlforge",
-  version: "3.2.0",
-  description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
+  version: "4.2.1",
+  description: "Production-ready MCP server with 23 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
   homepage: "https://www.crawlforge.dev",
   icon: "https://www.crawlforge.dev/icon.png"
 });
@@ -104,7 +110,7 @@ server.prompt("getting-started", {
       role: "user",
       content: {
         type: "text",
-        text: "You have access to CrawlForge MCP with 21 web scraping tools. Key tools:\n\n" +
+        text: "You have access to CrawlForge MCP with 22 web scraping tools. Key tools:\n\n" +
           "- fetch_url: Fetch raw HTML/content from any URL\n" +
           "- extract_text: Extract clean text from a webpage\n" +
           "- extract_content: Smart content extraction with readability\n" +
@@ -116,7 +122,8 @@ server.prompt("getting-started", {
           "- deep_research: Multi-source research on any topic\n" +
           "- stealth_mode: Anti-detection browsing for protected sites\n" +
           "- extract_structured: LLM-powered structured data extraction\n" +
-          "- extract_with_llm: Natural-language extraction via OpenAI/Anthropic\n" +
+          "- extract_with_llm: Natural-language extraction — defaults to local Ollama (no API key); openai/anthropic available with key\n" +
+          "- list_ollama_models: List installed Ollama models so you can pick one for extract_with_llm\n" +
           "- track_changes: Monitor website changes over time\n" +
           "- generate_llms_txt: Generate llms.txt for any website\n\n" +
           "Workflow: search_web -> fetch_url -> extract_content -> analyze_content\n\n" +
@@ -146,19 +153,105 @@ const summarizeContentTool = new SummarizeContentTool();
 const analyzeContentTool = new AnalyzeContentTool();
 const extractStructuredTool = new ExtractStructuredTool();
 const extractWithLlmTool = new ExtractWithLlm();
+const listOllamaModelsTool = new ListOllamaModelsTool();
 const batchScrapeTool = new BatchScrapeTool();
 const scrapeWithActionsTool = new ScrapeWithActionsTool();
 const deepResearchTool = new DeepResearchTool();
 const trackChangesTool = new TrackChangesTool();
 const generateLLMsTxtTool = new GenerateLLMsTxtTool();
+const scrapeTemplateTool = new ScrapeTemplateTool(); // D3.3
 const stealthBrowserManager = new StealthBrowserManager();
 const localizationManager = new LocalizationManager();
+// D1.1: Resource Registry (wired to existing singletons)
+const resourceRegistry = new ResourceRegistry({
+  researchOrchestrator: deepResearchTool, // exposes activeSessions
+  snapshotManager: null, // SnapshotManager not directly instantiated in server.js
+  jobManager: batchScrapeTool.jobManager,
+  mapSiteTool,
+  scrapeWithActionsTool,
+});
+// D1.4: Elicitation helper (client may not support — fails open)
+const elicitation = new ElicitationHelper({ mcpServer: server, logger });
+// D1.4: Wire elicitation into tools and AuthManager
+deepResearchTool.setMcpServer(server);
+batchScrapeTool.setMcpServer(server);
+crawlDeepTool.setMcpServer(server);
+extractStructuredTool.setMcpServer(server);
+AuthManager.setElicitation(elicitation);
+// ─── D1.1 Resource Templates (MCP Resources) ─────────────────────────────────
+// Resources use the MCP ResourceTemplate URI pattern for dynamic crawlforge:// URIs.
+// The registry is populated at runtime as tools produce artifacts.
+// Research sessions: crawlforge://research/{sessionId}
+server.resource(
+  "crawlforge-research",
+  new ResourceTemplate("crawlforge://research/{sessionId}", {
+    list: async () => ({
+      resources: resourceRegistry.listResources().filter(r => r.uri.startsWith("crawlforge://research/"))
+    })
+  }),
+  { description: "Completed deep_research report stored in the server session" },
+  async (uri) => resourceRegistry.readResource(uri)
+);
+// Job results: crawlforge://job/{jobId}
+server.resource(
+  "crawlforge-job",
+  new ResourceTemplate("crawlforge://job/{jobId}", {
+    list: async () => ({
+      resources: resourceRegistry.listResources().filter(r => r.uri.startsWith("crawlforge://job/"))
+    })
+  }),
+  { description: "Completed batch_scrape job result" },
+  async (uri) => resourceRegistry.readResource(uri)
+);
+// Crawl sitemaps: crawlforge://crawl/{sessionId}/sitemap
+server.resource(
+  "crawlforge-crawl-sitemap",
+  new ResourceTemplate("crawlforge://crawl/{sessionId}/sitemap", {
+    list: async () => ({
+      resources: resourceRegistry.listResources().filter(r => r.uri.startsWith("crawlforge://crawl/"))
+    })
+  }),
+  { description: "map_site output stored for a crawl session" },
+  async (uri) => resourceRegistry.readResource(uri)
+);
+// Screenshots: crawlforge://screenshot/{actionId}
+server.resource(
+  "crawlforge-screenshot",
+  new ResourceTemplate("crawlforge://screenshot/{actionId}", {
+    list: async () => ({
+      resources: resourceRegistry.listResources().filter(r => r.uri.startsWith("crawlforge://screenshot/"))
+    })
+  }),
+  { description: "Screenshot from scrape_with_actions" },
+  async (uri) => resourceRegistry.readResource(uri)
+);
+// ─── D1.2 Prompts (workflow templates) ────────────────────────────────────────
+// Register the 5 CrawlForge workflow prompts from PromptRegistry.
+for (const p of PROMPTS) {
+  const argsShape = {};
+  for (const arg of p.arguments) {
+    argsShape[arg.name] = z.string().optional().describe(arg.description);
+  }
+  server.registerPrompt(p.name, { description: p.description, argsSchema: argsShape }, async (args) => {
+    return getPromptMessages(p.name, args || {});
+  });
+}
 // ─── Tool registrations ────────────────────────────────────────────────────────
 // Tool: fetch_url
 server.registerTool("fetch_url", {
-  description: "Fetch content from a URL with optional headers and timeout",
+  description: "Use this when you need raw HTTP content from a URL — HTML, JSON, XML, or plain text. Ideal as the first step before extract_text or extract_content. Supports custom headers (e.g. auth tokens) and configurable timeout. Example: fetch_url({url: \"https://example.com\", timeout: 15000})",
   annotations: { title: "Fetch URL", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The URL to fetch content from"),
@@ -169,18 +262,19 @@ server.registerTool("fetch_url", {
 // Tool: extract_text
 server.registerTool("extract_text", {
-  description: "Extract clean text content from a webpage",
+  description: "Use this when you need a page's human-readable text or markdown stripped of HTML tags, scripts, and styles — e.g. for keyword search, summarization, RAG ingestion, or NLP. Use output_format:\"markdown\" for RAG workflows. Faster than extract_content but returns unstructured content. Example: extract_text({url: \"https://example.com/article\", output_format:\"markdown\"})",
   annotations: { title: "Extract Text", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The URL to extract text from"),
     remove_scripts: z.boolean().optional().default(true).describe("Remove script tags before extraction"),
-    remove_styles: z.boolean().optional().default(true).describe("Remove style tags before extraction")
+    remove_styles: z.boolean().optional().default(true).describe("Remove style tags before extraction"),
+    output_format: z.enum(["text", "markdown"]).optional().default("text").describe("Output format: \"text\" (default) or \"markdown\" — use markdown for RAG workflows")
   }
 }, withAuth("extract_text", extractTextHandler));
 // Tool: extract_links
 server.registerTool("extract_links", {
-  description: "Extract all links from a webpage with optional filtering",
+  description: "Use this when you need to discover all hyperlinks on a page — e.g. to build a crawl seed list, audit broken links, or find related resources. Use filter_external:true to get only outbound links. Example: extract_links({url: \"https://example.com\", filter_external: true})",
   annotations: { title: "Extract Links", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The URL to extract links from"),
@@ -191,7 +285,7 @@ server.registerTool("extract_links", {
 // Tool: extract_metadata
 server.registerTool("extract_metadata", {
-  description: "Extract metadata from a webpage (title, description, keywords, etc.)",
+  description: "Use this when you need a page's SEO metadata: title, meta description, Open Graph tags, canonical URL, schema.org data. Ideal for site audits and competitive SEO analysis. Example: extract_metadata({url: \"https://example.com\"})",
   annotations: { title: "Extract Metadata", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The URL to extract metadata from")
@@ -200,7 +294,7 @@ server.registerTool("extract_metadata", {
 // Tool: scrape_structured
 server.registerTool("scrape_structured", {
-  description: "Extract structured data from a webpage using CSS selectors",
+  description: "Use this when you know the exact CSS selectors for the data you want — e.g. scraping a pricing table or product list with consistent markup. More reliable than LLM extraction for well-structured pages. Example: scrape_structured({url: \"https://shop.com/products\", selectors: {price: \".price\", name: \".product-title\"}})",
   annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The URL to scrape"),
@@ -210,7 +304,7 @@ server.registerTool("scrape_structured", {
 // Tool: search_web
 server.registerTool("search_web", {
-  description: "Search the web using Google Search API (proxied through CrawlForge)",
+  description: "Use this when you need web search results for a query — returns titles, URLs, snippets, and optional metadata. Supports language, date range, and site filters. Start research workflows here before using fetch_url or deep_research. Example: search_web({query: \"best MCP servers 2025\", limit: 10, time_range: \"month\"})",
   annotations: { title: "Search the Web", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     query: z.string().describe("Search query string"),
@@ -236,7 +330,7 @@ server.registerTool("search_web", {
 // Tool: crawl_deep
 server.registerTool("crawl_deep", {
-  description: "Crawl websites deeply using breadth-first search",
+  description: "Use this when you need to discover and optionally extract content from many pages within a site — e.g. building a knowledge base, indexing docs, or auditing all pages. Use map_site first to estimate scope, then crawl_deep for content. Example: crawl_deep({url: \"https://docs.example.com\", max_depth: 3, max_pages: 200, extract_content: true})",
   annotations: { title: "Deep Crawl", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("Starting URL for the crawl"),
@@ -263,7 +357,7 @@ server.registerTool("crawl_deep", {
 // Tool: map_site
 server.registerTool("map_site", {
-  description: "Discover and map website structure",
+  description: "Use this when you need to know all URLs on a domain without fetching full page content — e.g. before a crawl_deep, for a site audit, or to find specific section URLs. Reads sitemap.xml when available. Example: map_site({url: \"https://example.com\", include_sitemap: true, max_urls: 500})",
   annotations: { title: "Map Website", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The website URL to map"),
@@ -286,7 +380,7 @@ server.registerTool("map_site", {
 // Tool: extract_content
 server.registerTool("extract_content", {
-  description: "Extract and analyze main content from web pages with enhanced readability detection",
+  description: "Use this when you need a clean, readable version of a web article or page — removes ads, nav, footers, and boilerplate. Ideal for RAG ingestion, summarization, or LLM context. Prefer this over extract_text for article-style pages. Example: extract_content({url: \"https://blog.example.com/post-title\"})",
   annotations: { title: "Extract Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The URL to extract content from"),
@@ -306,7 +400,7 @@ server.registerTool("extract_content", {
 // Tool: process_document
 server.registerTool("process_document", {
-  description: "Process documents from multiple sources and formats including PDFs and web pages",
+  description: "Use this when you need to extract text from a PDF URL or file — e.g. research papers, contracts, reports. Also handles HTML URLs. Returns structured sections, metadata, and word count. Example: process_document({source: \"https://example.com/report.pdf\", sourceType: \"pdf_url\"})",
   annotations: { title: "Process Document", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     source: z.string().describe("Document source - URL or file path"),
@@ -327,7 +421,7 @@ server.registerTool("process_document", {
 // Tool: summarize_content
 server.registerTool("summarize_content", {
-  description: "Generate intelligent summaries of text content with configurable options",
+  description: "Use this when you have text content (from extract_text or extract_content) and need a condensed version — e.g. for briefings, comparison tables, or LLM context reduction. Supports extractive (sentence selection) and abstractive (rewrite via Ollama/sampling) modes. Example: summarize_content({text: \"..long article..\", options: {summaryLength: \"short\", summaryType: \"abstractive\"}})",
   annotations: { title: "Summarize Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
   inputSchema: {
     text: z.string().describe("The text content to summarize"),
@@ -347,7 +441,7 @@ server.registerTool("summarize_content", {
 // Tool: analyze_content
 server.registerTool("analyze_content", {
-  description: "Perform comprehensive content analysis including language detection and topic extraction",
+  description: "Use this when you need NLP metrics for text — language detection, sentiment, topic extraction, entity recognition, readability score. Good for content auditing and classification. Example: analyze_content({text: \"..article text..\", options: {extractTopics: true, includeSentiment: true}})",
   annotations: { title: "Analyze Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
   inputSchema: {
     text: z.string().describe("The text content to analyze"),
@@ -367,7 +461,7 @@ server.registerTool("analyze_content", {
 // Tool: extract_structured
 server.registerTool("extract_structured", {
-  description: "Extract structured data from a webpage using LLM-powered analysis and a JSON Schema. Falls back to CSS selector extraction when no LLM provider is configured.",
+  description: "Use this when you need a specific data shape extracted from a page using a JSON schema — e.g. product details, job listings, event data. Uses LLM by default; falls back to CSS selectors when no LLM is configured. Example: extract_structured({url: \"https://jobs.example.com/post/123\", schema: {properties: {title: {type:\"string\"}, salary: {type:\"string\"}}, required:[\"title\"]}})",
   annotations: { title: "Extract Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The URL to extract structured data from"),
@@ -395,15 +489,15 @@ server.registerTool("extract_structured", {
 // Tool: extract_with_llm
 server.registerTool("extract_with_llm", {
-  description: "Extract structured data from a URL or text using a natural-language prompt. Supports OpenAI, Anthropic, or a local Ollama model. Cloud providers require OPENAI_API_KEY or ANTHROPIC_API_KEY; Ollama requires no key (set provider: \"ollama\" with a running `ollama serve` on http://localhost:11434).",
+  description: "Extract structured data from a URL or text using a natural-language prompt. Defaults to a local Ollama model (http://localhost:11434, no API key required) — call list_ollama_models first to see what's installed and pass the name via the `model` parameter. Pass provider: \"openai\" or \"anthropic\" with the matching API key to use a cloud model instead.",
   annotations: { title: "Extract With LLM", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
   inputSchema: {
     url: z.string().url().optional().describe("URL to fetch and extract from (one of url/content required)"),
     content: z.string().optional().describe("Pre-fetched text to extract from (one of url/content required)"),
     prompt: z.string().describe("Natural-language extraction instruction"),
     schema: z.record(z.unknown()).optional().describe("Optional JSON-schema for output shape (used as Ollama structured-outputs format when provider is 'ollama')"),
-    provider: z.enum(["openai", "anthropic", "ollama", "auto"]).optional().default("auto").describe("LLM provider. Use 'ollama' for a local model on http://localhost:11434"),
-    model: z.string().optional().describe("Override default model (e.g. 'llama3.2' for ollama)"),
+    provider: z.enum(["openai", "anthropic", "ollama", "auto"]).optional().default("auto").describe("LLM provider. Defaults to 'ollama' (local, no key, http://localhost:11434). Use 'openai' or 'anthropic' for cloud models (requires the matching API key)."),
+    model: z.string().optional().describe("Override the model. For ollama, pass a name returned by list_ollama_models (e.g. 'llama3.2', 'qwen2.5:7b'). Defaults: openai='gpt-4o-mini', anthropic='claude-haiku-4-5-20251001', ollama='llama3.2' or $OLLAMA_DEFAULT_MODEL."),
     maxTokens: z.number().optional().default(4096).describe("Maximum output tokens")
   }
 }, withAuth("extract_with_llm", async (params) => {
@@ -415,9 +509,26 @@ server.registerTool("extract_with_llm", {
   }
 }));
+// Tool: list_ollama_models
+server.registerTool("list_ollama_models", {
+  description: "List the Ollama models installed locally on this machine. Use this to discover which `model` values you can pass to extract_with_llm. Requires Ollama running on http://localhost:11434 (or $OLLAMA_BASE_URL).",
+  annotations: { title: "List Ollama Models", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
+  inputSchema: {}
+}, withAuth("list_ollama_models", async () => {
+  try {
+    const result = await listOllamaModelsTool.execute();
+    return {
+      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
+      isError: !result.success
+    };
+  } catch (error) {
+    return { content: [{ type: "text", text: `Listing Ollama models failed: ${error.message}` }], isError: true };
+  }
+}));
 // Tool: batch_scrape
 server.registerTool("batch_scrape", {
-  description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
+  description: "Use this when you need to scrape 2–50 URLs in parallel — e.g. batch-collecting product pages, news articles, or competitor pages. Use mode:\"async\" with a webhook for large batches; mode:\"sync\" for up to ~25 URLs when you need results immediately. Example: batch_scrape({urls: [\"https://a.com\",\"https://b.com\"], formats: [\"json\"], maxConcurrency: 5})",
   annotations: { title: "Batch Scrape", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
   inputSchema: {
     urls: z.array(z.union([
@@ -462,7 +573,7 @@ server.registerTool("batch_scrape", {
 // Tool: scrape_with_actions
 server.registerTool("scrape_with_actions", {
-  description: "Execute browser action chains before scraping content, with form auto-fill and intermediate state capture",
+  description: "Use this when you need to interact with a page before scraping — login, click buttons, fill forms, scroll, or wait for dynamic content to load. Use for SPAs, login-gated content, or multi-step flows. Screenshots from this tool are stored as crawlforge://screenshot/{actionId} resources. Example: scrape_with_actions({url: \"https://app.com/dashboard\", actions: [{type:\"click\",selector:\"#login\"},{type:\"type\",selector:\"#email\",text:\"user@a.com\"}]})",
   annotations: { title: "Scrape with Browser Actions", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The URL to scrape"),
@@ -518,7 +629,7 @@ server.registerTool("scrape_with_actions", {
 // Tool: deep_research
 server.registerTool("deep_research", {
-  description: "Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and conflict detection",
+  description: "Use this when you need exhaustive multi-source research on a topic — it searches the web, fetches and analyses sources, detects conflicts, and (when LLM keys or Ollama are configured) synthesizes a report. Best for complex questions needing 10+ sources. Will request confirmation (elicitation) if maxUrls > 50. Results are stored as crawlforge://research/{sessionId} resources. Example: deep_research({topic: \"quantum computing NISQ devices 2025\", maxUrls: 30, researchApproach: \"academic\"})",
   annotations: { title: "Deep Research", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
   inputSchema: {
     topic: z.string().min(3).max(500).describe("Research topic or question"),
@@ -574,7 +685,7 @@ server.registerTool("deep_research", {
 // Tool: track_changes
 server.registerTool("track_changes", {
-  description: "Enhanced content change tracking with baseline capture, comparison, scheduled monitoring, advanced comparison engine, alert system, and historical analysis",
+  description: "Use this when you need to monitor a URL for content changes over time — e.g. competitor pricing, regulation updates, product availability. Start with operation:\"create_baseline\", then periodically use operation:\"compare\" to diff. Supports webhooks and scheduled monitoring. Example: track_changes({url: \"https://example.com/pricing\", operation: \"create_baseline\"})",
   annotations: { title: "Track Changes", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The URL to track changes for"),
@@ -679,7 +790,7 @@ server.registerTool("track_changes", {
 // Tool: generate_llms_txt
 server.registerTool("generate_llms_txt", {
-  description: "Analyze websites and generate standard-compliant LLMs.txt and LLMs-full.txt files defining AI model interaction guidelines",
+  description: "Use this when you need to generate an llms.txt file for a website — the standard that tells AI models how to interact with a site's content. Useful for site owners preparing for AI discoverability, or for understanding a site's AI access policy. Example: generate_llms_txt({url: \"https://example.com\"})",
   annotations: { title: "Generate llms.txt", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The website URL to generate llms.txt for"),
@@ -713,7 +824,7 @@ server.registerTool("generate_llms_txt", {
 // Tool: stealth_mode
 server.registerTool("stealth_mode", {
-  description: "Advanced anti-detection browser management with stealth features, fingerprint randomization, and human behavior simulation",
+  description: "Use this when a site blocks normal scraping — Cloudflare, Datadome, or other bot-detection systems. Manages a Playwright browser with randomized fingerprints, human behavior simulation, WebRTC/canvas spoofing. Start with operation:\"create_context\" then use the contextId. Example: stealth_mode({operation:\"create_context\", stealthConfig:{level:\"advanced\", simulateHumanBehavior:true}})",
   annotations: { title: "Stealth Mode", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
   inputSchema: {
     operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure').describe("Stealth operation to perform"),
@@ -755,6 +866,7 @@ server.registerTool("stealth_mode", {
         hardwareSpoofing: z.boolean().default(true)
       }).optional()
     }).optional().describe("Stealth browser configuration with anti-detection settings"),
+    engine: z.enum(["playwright", "camoufox"]).optional().default("playwright").describe("Browser engine: \"playwright\" (Chromium, default) or \"camoufox\" (Firefox-based, higher anti-detect score — install with npm install camoufox)"),
     contextId: z.string().optional().describe("Browser context ID for page operations"),
     urlToTest: z.string().url().optional().describe("URL to navigate to when creating a page")
   }
@@ -807,7 +919,7 @@ server.registerTool("stealth_mode", {
 // Tool: localization
 server.registerTool("localization", {
-  description: "Multi-language and geo-location management with country-specific settings, browser locale emulation, timezone spoofing, and geo-blocked content handling",
+  description: "Use this when you need to scrape geo-restricted content or emulate a specific locale/timezone — e.g. seeing region-specific pricing, bypassing geo-blocks, or searching in another language. Use operation:\"configure_country\" to set country context. Example: localization({operation:\"configure_country\", countryCode:\"DE\", language:\"de\"})",
   annotations: { title: "Localization", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
   inputSchema: {
     operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country').describe("Localization operation to perform"),
@@ -911,6 +1023,25 @@ server.registerTool("localization", {
   }
 }));
+// Tool: scrape_template (D3.3 — pre-built site templates). Registers the tool's schema plus a handler that delegates to scrapeTemplateTool.execute.
+server.registerTool("scrape_template", {
+  description: "Use this when you want structured data from a well-known site without writing custom selectors. Pass template:\"list\" to see all available templates. Supports: amazon-product, linkedin-profile, github-repo, youtube-video, tweet, reddit-thread, hacker-news-front-page, producthunt-launch, stackoverflow-question, npm-package. Example: scrape_template({template:\"github-repo\", url:\"https://github.com/user/repo\"})",
+  annotations: { title: "Scrape Template", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
+  inputSchema: {
+    template: z.string().describe("Template ID (e.g. github-repo) or list to enumerate available templates"),
+    url: z.string().url().optional().describe("URL to scrape — required unless template is list"), // optional in the schema; the "required unless list" rule is only stated in the description text here
+    timeout: z.number().min(5000).max(60000).optional().default(15000).describe("Request timeout in milliseconds")
+  }
+}, withAuth("scrape_template", async (params) => { // handler is wrapped by withAuth (defined elsewhere — presumably an auth gate; confirm)
+  try {
+    const result = await scrapeTemplateTool.execute(params);
+    return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; // result is returned as pretty-printed JSON text
+  } catch (error) {
+    return { content: [{ type: "text", text: `Template scrape failed: ${error.message}` }], isError: true }; // failures become an isError result instead of being rethrown
+  }
+}));
 // ─── Transport + startup ───────────────────────────────────────────────────────
 const useHttp = process.argv.includes('--http') || process.env.MCP_HTTP === 'true';
@@ -918,7 +1049,9 @@ const useLegacyHttp = process.argv.includes('--legacy-http') || process.env.CRAW
 async function runServer() {
   if (useHttp) {
-    const port = parseInt(process.env.PORT || '3000', 10);
+    // Default to 10000 to match Render's default port-scan target and the
+    // Dockerfile `EXPOSE 10000`. Most PaaS providers inject $PORT — we honor it.
+    const port = parseInt(process.env.PORT || '10000', 10);
     if (useLegacyHttp) {
       // One-release deprecation window for stateless legacy transport.
@@ -958,9 +1091,10 @@ async function runServer() {
     "extract_content", "process_document", "summarize_content", "analyze_content",
     "batch_scrape", "scrape_with_actions",
     "deep_research", "track_changes", "generate_llms_txt",
-    "stealth_mode", "localization", "extract_structured", "extract_with_llm"
+    "stealth_mode", "localization", "extract_structured", "extract_with_llm",
+    "scrape_template"  // D3.3
   ];
-  console.error(`Tools available: ${allTools.join(', ')}`);
+  console.error(`Tools available (23): ${allTools.join(", ")}`);
   // Start memory monitoring in development
   if (config.server.nodeEnv === "development") {

package/src/cli/commands/actions.js ADDED Viewed

@@ -0,0 +1,36 @@
+/**
+ * actions command — run browser automation actions from a script file.
+ *
+ * Exports `register(program)`, which adds an `actions <url>` subcommand to the
+ * given program object. The subcommand reads the file named by the required
+ * `--script` option, parses it as JSON, and passes the parsed value to a
+ * ScrapeWithActionsTool through `runTool`, together with the URL, the
+ * `--screenshot` flag and the `--wait` delay.
+ *
+ * If the script file cannot be read or parsed, the error message is written
+ * to stderr and the process exits with status 1.
+ *
+ * NOTE(review): `program` looks like a commander.js Command — confirm.
+ * NOTE(review): the parsed script is forwarded as-is (no shape check), and a
+ * non-numeric `--wait` value becomes NaN via parseInt; presumably the tool
+ * validates both — verify.
+ */
+import { ScrapeWithActionsTool } from '../../tools/advanced/ScrapeWithActionsTool.js';
+import { getToolConfig } from '../../constants/config.js';
+import { runTool } from '../lib/runTool.js';
+import { readFileSync } from 'node:fs';
+export function register(program) {
+  program
+    .command('actions <url>')
+    .description('Run browser automation actions against a URL')
+    .requiredOption('--script <file>', 'JSON file containing action script')
+    .option('--screenshot', 'Capture screenshot after actions')
+    .option('--wait <ms>', 'Wait time between actions in milliseconds', '500')
+    .action(async (url, opts, cmd) => {
+      const globals = cmd.parent.opts(); // options set on the parent command; only json / pretty / quiet are used
+      const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
+      let actions; // assigned inside the try so a read/parse failure can be reported separately
+      try {
+        actions = JSON.parse(readFileSync(opts.script, 'utf8'));
+      } catch (e) {
+        process.stderr.write(`Error reading script file: ${e.message}\n`);
+        process.exit(1); // covers both file-read and JSON-parse errors
+      }
+      const tool = new ScrapeWithActionsTool(getToolConfig('scrape_with_actions'));
+      await runTool(tool, {
+        url,
+        actions,
+        screenshot: !!opts.screenshot, // coerce to a strict boolean
+        wait_between_actions: parseInt(opts.wait, 10) // option value is a string (default '500'); parsed base-10
+      }, cliFlags);
+    });
+}

package/src/cli/commands/analyze.js ADDED Viewed

@@ -0,0 +1,19 @@
+/**
+ * analyze command — analyze content of a URL.
+ *
+ * Exports `register(program)`, which adds an `analyze <url>` subcommand to the
+ * given program object. The subcommand builds an AnalyzeContentTool from the
+ * 'analyze_content' tool config and runs it through `runTool` with the URL
+ * and the `--depth` option value (default 'basic') as `analysis_depth`.
+ *
+ * NOTE(review): `program` looks like a commander.js Command — confirm.
+ * NOTE(review): the `--depth` value is forwarded without checking that it is
+ * 'basic' or 'full'; presumably the tool validates it — verify.
+ */
+import { AnalyzeContentTool } from '../../tools/extract/analyzeContent.js';
+import { getToolConfig } from '../../constants/config.js';
+import { runTool } from '../lib/runTool.js';
+export function register(program) {
+  program
+    .command('analyze <url>')
+    .description('Analyze content of a URL (sentiment, entities, readability)')
+    .option('--depth <level>', 'Analysis depth: basic or full', 'basic')
+    .action(async (url, opts, cmd) => {
+      const globals = cmd.parent.opts(); // options set on the parent command; only json / pretty / quiet are used
+      const cliFlags = { json: globals.json, pretty: globals.pretty, quiet: globals.quiet };
+      const tool = new AnalyzeContentTool(getToolConfig('analyze_content'));
+      await runTool(tool, { url, analysis_depth: opts.depth }, cliFlags); // CLI --depth maps to the tool's analysis_depth parameter
+    });
+}
+}