npm - crawlforge-mcp-server - Versions diffs - 3.3.1 → 3.5.1 - Mend

crawlforge-mcp-server 3.3.1 → 3.5.1

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (6)

package/README.md +28 -2
package/package.json +2 -2
package/server.js +31 -9
package/src/server/transports/streamableHttp.js +6 -6
package/src/tools/extract/extractWithLlm.js +79 -19
package/src/tools/extract/listOllamaModels.js +66 -0

package/README.md CHANGED Viewed

@@ -9,7 +9,7 @@ Professional web scraping and content extraction server implementing the Model C
 ## 🎯 Features
-- **20 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis
+- **22 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis, local-LLM extraction (Ollama)
 - **Free Tier**: 1,000 credits to get started instantly
 - **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
 - **Enterprise Ready**: Scale up with paid plans for production use
@@ -112,6 +112,8 @@ Restart Cursor to activate.
 - `summarize_content` - Generate intelligent summaries
 - `analyze_content` - Comprehensive content analysis
 - `extract_structured` - LLM-powered schema-driven extraction
+- `extract_with_llm` - Natural-language extraction. **Defaults to a local Ollama model — no API key, no API costs.** Pass `provider: "openai" | "anthropic"` with the matching key for cloud models.
+- `list_ollama_models` - List the Ollama models installed locally (free; helps you pick a `model` for `extract_with_llm`)
 - `track_changes` - Monitor content changes over time
 ### Premium Tools (5-10 credits)
@@ -138,7 +140,7 @@ Restart Cursor to activate.
 | **Enterprise** | 250,000 | Large scale operations |
 **All plans include:**
-- Access to all 20 tools
+- Access to all 22 tools
 - Credits never expire and roll over month-to-month
 - API access and webhook notifications
@@ -155,6 +157,30 @@ export CRAWLFORGE_API_KEY="cf_live_your_api_key_here"
 # Optional: Custom API endpoint (for enterprise)
 export CRAWLFORGE_API_URL="https://api.crawlforge.dev"
 # As of v3.0.18, this variable is validated against an allow-list of CrawlForge backend hosts.
+# Optional: Local LLM (Ollama) overrides — extract_with_llm defaults to Ollama
+export OLLAMA_BASE_URL="http://localhost:11434"   # default
+export OLLAMA_DEFAULT_MODEL="llama3.2"             # default; any locally-pulled model name works
+# Optional: Cloud LLM keys — only needed when you pass provider: "openai" or "anthropic"
+export OPENAI_API_KEY="sk-..."
+export ANTHROPIC_API_KEY="sk-ant-..."
+```
+### Local-LLM quickstart (`extract_with_llm` with Ollama)
+`extract_with_llm` defaults to a local Ollama model — no API key, no API costs, no data leaving your machine.
+```bash
+# 1. Install Ollama:  https://ollama.com
+# 2. Pull any model from https://ollama.com/library
+ollama pull llama3.2
+# 3. Discover what's installed (from your MCP client)
+#    list_ollama_models()
+# 4. Extract — defaults to Ollama with the model from step 2
+#    extract_with_llm({ url: "https://example.com", prompt: "…", model: "llama3.2" })
 ```
 ### Manual Configuration

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "crawlforge-mcp-server",
-  "version": "3.3.1",
-  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 21 comprehensive web scraping, crawling, and content processing tools.",
+  "version": "3.5.1",
+  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 22 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in.",
   "main": "server.js",
   "bin": {
     "crawlforge": "server.js",

package/server.js CHANGED Viewed

@@ -17,6 +17,7 @@ import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
 import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
 import { ExtractStructuredTool } from "./src/tools/extract/extractStructured.js";
 import { ExtractWithLlm } from "./src/tools/extract/extractWithLlm.js";
+import { ListOllamaModelsTool } from "./src/tools/extract/listOllamaModels.js";
 import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
 import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
 import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
@@ -89,8 +90,8 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
 // Create the server
 const server = new McpServer({
   name: "crawlforge",
-  version: "3.2.0",
-  description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
+  version: "3.5.1",
+  description: "Production-ready MCP server with 21 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
   homepage: "https://www.crawlforge.dev",
   icon: "https://www.crawlforge.dev/icon.png"
 });
@@ -104,7 +105,7 @@ server.prompt("getting-started", {
       role: "user",
       content: {
         type: "text",
-        text: "You have access to CrawlForge MCP with 21 web scraping tools. Key tools:\n\n" +
+        text: "You have access to CrawlForge MCP with 22 web scraping tools. Key tools:\n\n" +
           "- fetch_url: Fetch raw HTML/content from any URL\n" +
           "- extract_text: Extract clean text from a webpage\n" +
           "- extract_content: Smart content extraction with readability\n" +
@@ -116,7 +117,8 @@ server.prompt("getting-started", {
           "- deep_research: Multi-source research on any topic\n" +
           "- stealth_mode: Anti-detection browsing for protected sites\n" +
           "- extract_structured: LLM-powered structured data extraction\n" +
-          "- extract_with_llm: Natural-language extraction via OpenAI/Anthropic\n" +
+          "- extract_with_llm: Natural-language extraction — defaults to local Ollama (no API key); openai/anthropic available with key\n" +
+          "- list_ollama_models: List installed Ollama models so you can pick one for extract_with_llm\n" +
           "- track_changes: Monitor website changes over time\n" +
           "- generate_llms_txt: Generate llms.txt for any website\n\n" +
           "Workflow: search_web -> fetch_url -> extract_content -> analyze_content\n\n" +
@@ -146,6 +148,7 @@ const summarizeContentTool = new SummarizeContentTool();
 const analyzeContentTool = new AnalyzeContentTool();
 const extractStructuredTool = new ExtractStructuredTool();
 const extractWithLlmTool = new ExtractWithLlm();
+const listOllamaModelsTool = new ListOllamaModelsTool();
 const batchScrapeTool = new BatchScrapeTool();
 const scrapeWithActionsTool = new ScrapeWithActionsTool();
 const deepResearchTool = new DeepResearchTool();
@@ -395,15 +398,15 @@ server.registerTool("extract_structured", {
 // Tool: extract_with_llm
 server.registerTool("extract_with_llm", {
-  description: "Extract structured data from a URL or text using a natural-language prompt, powered by OpenAI or Anthropic. Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in the environment.",
+  description: "Extract structured data from a URL or text using a natural-language prompt. Defaults to a local Ollama model (http://localhost:11434, no API key required) — call list_ollama_models first to see what's installed and pass the name via the `model` parameter. Pass provider: \"openai\" or \"anthropic\" with the matching API key to use a cloud model instead.",
   annotations: { title: "Extract With LLM", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
   inputSchema: {
     url: z.string().url().optional().describe("URL to fetch and extract from (one of url/content required)"),
     content: z.string().optional().describe("Pre-fetched text to extract from (one of url/content required)"),
     prompt: z.string().describe("Natural-language extraction instruction"),
-    schema: z.record(z.unknown()).optional().describe("Optional JSON-schema-like hint for output shape"),
-    provider: z.enum(["openai", "anthropic", "auto"]).optional().default("auto").describe("LLM provider"),
-    model: z.string().optional().describe("Override default model"),
+    schema: z.record(z.unknown()).optional().describe("Optional JSON-schema for output shape (used as Ollama structured-outputs format when provider is 'ollama')"),
+    provider: z.enum(["openai", "anthropic", "ollama", "auto"]).optional().default("auto").describe("LLM provider. Defaults to 'ollama' (local, no key, http://localhost:11434). Use 'openai' or 'anthropic' for cloud models (requires the matching API key)."),
+    model: z.string().optional().describe("Override the model. For ollama, pass a name returned by list_ollama_models (e.g. 'llama3.2', 'qwen2.5:7b'). Defaults: openai='gpt-4o-mini', anthropic='claude-haiku-4-5-20251001', ollama='llama3.2' or $OLLAMA_DEFAULT_MODEL."),
     maxTokens: z.number().optional().default(4096).describe("Maximum output tokens")
   }
 }, withAuth("extract_with_llm", async (params) => {
@@ -415,6 +418,23 @@ server.registerTool("extract_with_llm", {
   }
 }));
+// Tool: list_ollama_models
+server.registerTool("list_ollama_models", {
+  description: "List the Ollama models installed locally on this machine. Use this to discover which `model` values you can pass to extract_with_llm. Requires Ollama running on http://localhost:11434 (or $OLLAMA_BASE_URL).",
+  annotations: { title: "List Ollama Models", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
+  inputSchema: {}
+}, withAuth("list_ollama_models", async () => {
+  try {
+    const result = await listOllamaModelsTool.execute();
+    return {
+      content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
+      isError: !result.success
+    };
+  } catch (error) {
+    return { content: [{ type: "text", text: `Listing Ollama models failed: ${error.message}` }], isError: true };
+  }
+}));
 // Tool: batch_scrape
 server.registerTool("batch_scrape", {
   description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
@@ -918,7 +938,9 @@ const useLegacyHttp = process.argv.includes('--legacy-http') || process.env.CRAW
 async function runServer() {
   if (useHttp) {
-    const port = parseInt(process.env.PORT || '3000', 10);
+    // Default to 10000 to match Render's default port-scan target and the
+    // Dockerfile `EXPOSE 10000`. Most PaaS providers inject $PORT — we honor it.
+    const port = parseInt(process.env.PORT || '10000', 10);
     if (useLegacyHttp) {
       // One-release deprecation window for stateless legacy transport.

package/src/server/transports/streamableHttp.js CHANGED Viewed

@@ -28,7 +28,7 @@ import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/
 import { createServer } from 'node:http';
 import { randomUUID } from 'node:crypto';
-const SERVER_VERSION = '3.2.0';
+const SERVER_VERSION = '3.5.1';
 /**
  * Stateful, session-aware Streamable HTTP transport.
@@ -160,11 +160,11 @@ export async function connectStreamableHttp(server, authManager, logger, options
   await new Promise((resolve) => {
     httpServer.listen(port, host, () => {
       const actual = httpServer.address()?.port ?? port;
-      console.error(`CrawlForge MCP Server v${SERVER_VERSION} running on Streamable HTTP (${mode}) port ${actual}`);
-      console.error(`MCP endpoint: http://localhost:${actual}/mcp`);
-      console.error(`Health check: http://localhost:${actual}/health`);
-      if (metrics) console.error(`Metrics:      http://localhost:${actual}/metrics`);
-      if (oauthProvider) console.error(`OAuth:        http://localhost:${actual}/.well-known/oauth-authorization-server`);
+      console.error(`CrawlForge MCP Server v${SERVER_VERSION} listening on ${host}:${actual} (Streamable HTTP, ${mode})`);
+      console.error(`MCP endpoint:   http://${host}:${actual}/mcp`);
+      console.error(`Health check:   http://${host}:${actual}/health`);
+      if (metrics) console.error(`Metrics:        http://${host}:${actual}/metrics`);
+      if (oauthProvider) console.error(`OAuth discovery: http://${host}:${actual}/.well-known/oauth-authorization-server`);
       resolve();
     });
   });

package/src/tools/extract/extractWithLlm.js CHANGED Viewed

@@ -1,10 +1,10 @@
 /**
  * Extract With LLM MCP Tool
- * Natural-language extraction powered by OpenAI or Anthropic.
- * Mirrors ScrapeGraphAI positioning: describe what you want, get structured JSON back.
+ * Natural-language extraction powered by a local Ollama model (default) or
+ * a cloud provider (OpenAI / Anthropic, explicit opt-in).
  *
- * Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment.
- * Gate: tool throws a clear error when neither key is present.
+ * Default: provider 'auto' → Ollama at http://localhost:11434, no API key required.
+ * Pass provider: "openai" | "anthropic" with the matching API key to use a cloud model.
  */
 import { fetchAndParse } from './_fetchAndParse.js';
@@ -15,6 +15,7 @@ const MAX_INPUT_CHARS = 50_000;
 const OPENAI_DEFAULT_MODEL = 'gpt-4o-mini';
 const ANTHROPIC_DEFAULT_MODEL = 'claude-haiku-4-5-20251001';
+const OLLAMA_DEFAULT_MODEL = 'llama3.2';
 // Support test-time overrides so the test suite can stub endpoints.
 function openaiBaseUrl() {
@@ -23,32 +24,32 @@ function openaiBaseUrl() {
 function anthropicBaseUrl() {
   return (process.env.ANTHROPIC_BASE_URL || 'https://api.anthropic.com').replace(/\/$/, '');
 }
+function ollamaBaseUrl() {
+  return (process.env.OLLAMA_BASE_URL || 'http://localhost:11434').replace(/\/$/, '');
+}
 // ── Helpers ───────────────────────────────────────────────────────────────────
 /**
  * Resolve which provider to use.
- * @param {'openai'|'anthropic'|'auto'} provider
- * @returns {{ provider: 'openai'|'anthropic', apiKey: string }}
+ * @param {'openai'|'anthropic'|'ollama'|'auto'} provider
+ * @returns {{ provider: 'openai'|'anthropic'|'ollama', apiKey: string|null }}
  */
 function resolveProvider(provider) {
-  const anthropicKey = process.env.ANTHROPIC_API_KEY;
-  const openaiKey = process.env.OPENAI_API_KEY;
-  if (provider === 'auto') {
-    if (anthropicKey) return { provider: 'anthropic', apiKey: anthropicKey };
-    if (openaiKey) return { provider: 'openai', apiKey: openaiKey };
-    throw new Error(
-      'extract_with_llm requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment'
-    );
+  if (provider === 'auto' || provider === 'ollama') {
+    // Local Ollama is the default. No API key required; OLLAMA_BASE_URL is
+    // an optional override (defaults to http://localhost:11434).
+    return { provider: 'ollama', apiKey: null };
   }
   if (provider === 'anthropic') {
+    const anthropicKey = process.env.ANTHROPIC_API_KEY;
     if (!anthropicKey) throw new Error('extract_with_llm: ANTHROPIC_API_KEY is not set');
     return { provider: 'anthropic', apiKey: anthropicKey };
   }
   if (provider === 'openai') {
+    const openaiKey = process.env.OPENAI_API_KEY;
     if (!openaiKey) throw new Error('extract_with_llm: OPENAI_API_KEY is not set');
     return { provider: 'openai', apiKey: openaiKey };
   }
@@ -157,12 +158,68 @@ async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTok
   return { rawText: content, usage, model: json.model || model };
 }
+// ── Ollama call ───────────────────────────────────────────────────────────────
+async function callOllama({ model, systemMessage, userMessage, maxTokens, schema }) {
+  const url = `${ollamaBaseUrl()}/api/chat`;
+  const body = {
+    model,
+    messages: [
+      { role: 'system', content: systemMessage },
+      { role: 'user', content: userMessage }
+    ],
+    stream: false,
+    options: { num_predict: maxTokens, temperature: 0 },
+    format: (schema && Object.keys(schema).length > 0) ? schema : 'json'
+  };
+  let response;
+  try {
+    response = await fetch(url, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(body),
+      signal: AbortSignal.timeout(120_000)
+    });
+  } catch (err) {
+    const code = err?.cause?.code;
+    if (code === 'ECONNREFUSED' || code === 'ENOTFOUND' || /ECONNREFUSED|ENOTFOUND|fetch failed/i.test(err.message || '')) {
+      throw new Error(
+        `Ollama is not running at ${ollamaBaseUrl()}. ` +
+        `Start it with "ollama serve" and pull a model: "ollama pull ${model}".`
+      );
+    }
+    throw err;
+  }
+  if (!response.ok) {
+    const errText = await response.text().catch(() => '');
+    if (response.status === 404 && /model.*not found|pull/i.test(errText)) {
+      throw new Error(
+        `Ollama model "${model}" is not pulled. Run: "ollama pull ${model}"`
+      );
+    }
+    throw new Error(`Ollama API error ${response.status}: ${errText.slice(0, 200)}`);
+  }
+  const json = await response.json();
+  const content = json.message?.content ?? '';
+  const usage = {
+    input_tokens: json.prompt_eval_count ?? 0,
+    output_tokens: json.eval_count ?? 0
+  };
+  return { rawText: content, usage, model: json.model || model };
+}
 // ── LLM dispatch ─────────────────────────────────────────────────────────────
-async function callLLM({ provider, apiKey, model, systemMessage, userMessage, maxTokens }) {
+async function callLLM({ provider, apiKey, model, systemMessage, userMessage, maxTokens, schema }) {
   if (provider === 'openai') {
     return callOpenAI({ apiKey, model, systemMessage, userMessage, maxTokens });
   }
+  if (provider === 'ollama') {
+    return callOllama({ model, systemMessage, userMessage, maxTokens, schema });
+  }
   return callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens });
 }
@@ -216,7 +273,10 @@ export class ExtractWithLlm {
     }
     const { provider, apiKey } = resolved;
-    const defaultModel = provider === 'openai' ? OPENAI_DEFAULT_MODEL : ANTHROPIC_DEFAULT_MODEL;
+    const defaultModel =
+      provider === 'openai' ? OPENAI_DEFAULT_MODEL :
+      provider === 'ollama' ? (process.env.OLLAMA_DEFAULT_MODEL || OLLAMA_DEFAULT_MODEL) :
+      ANTHROPIC_DEFAULT_MODEL;
     const model = modelParam || defaultModel;
     // Step 1: Get text to extract from
@@ -241,7 +301,7 @@ export class ExtractWithLlm {
     let rawText, usage;
     try {
       ({ rawText, usage } = await callLLM({
-        provider, apiKey, model, systemMessage, userMessage, maxTokens
+        provider, apiKey, model, systemMessage, userMessage, maxTokens, schema
       }));
     } catch (llmErr) {
       return { success: false, error: `LLM call failed: ${llmErr.message}` };
@@ -260,7 +320,7 @@ export class ExtractWithLlm {
       try {
         ({ rawText: retryRaw, usage: retryUsage } = await callLLM({
           provider, apiKey, model, systemMessage,
-          userMessage: retryUserMessage, maxTokens
+          userMessage: retryUserMessage, maxTokens, schema
         }));
         // Merge usage
         usage = {

package/src/tools/extract/listOllamaModels.js ADDED Viewed

@@ -0,0 +1,66 @@
+/**
+ * List Ollama Models MCP Tool
+ * Returns the models installed on the local Ollama server (GET /api/tags).
+ * Used to discover names that can be passed as the `model` parameter to extract_with_llm.
+ */
+function ollamaBaseUrl() {
+  return (process.env.OLLAMA_BASE_URL || 'http://localhost:11434').replace(/\/$/, '');
+}
+export class ListOllamaModelsTool {
+  async execute() {
+    const baseUrl = ollamaBaseUrl();
+    const url = `${baseUrl}/api/tags`;
+    let response;
+    try {
+      response = await fetch(url, { signal: AbortSignal.timeout(10_000) });
+    } catch (err) {
+      return {
+        success: false,
+        baseUrl,
+        error:
+          `Could not reach Ollama at ${url}: ${err.message}. ` +
+          `Install from https://ollama.com and run "ollama serve".`
+      };
+    }
+    if (!response.ok) {
+      return {
+        success: false,
+        baseUrl,
+        error: `Ollama responded ${response.status} at ${url}. Is "ollama serve" running?`
+      };
+    }
+    let data;
+    try {
+      data = await response.json();
+    } catch (err) {
+      return { success: false, baseUrl, error: `Invalid JSON from Ollama: ${err.message}` };
+    }
+    const models = (data.models || []).map((m) => ({
+      name: m.name,
+      size_bytes: m.size,
+      modified_at: m.modified_at,
+      family: m.details?.family,
+      parameter_size: m.details?.parameter_size,
+      quantization: m.details?.quantization_level
+    }));
+    return {
+      success: true,
+      baseUrl,
+      count: models.length,
+      models,
+      hint:
+        models.length === 0
+          ? 'No models installed. Run "ollama pull llama3.2" (or any model from https://ollama.com/library) in your terminal.'
+          : 'Pass any of these names as the `model` parameter to extract_with_llm.'
+    };
+  }
+}
+export default ListOllamaModelsTool;