crawlforge-mcp-server 3.4.0 → 3.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,7 +9,7 @@ Professional web scraping and content extraction server implementing the Model C
9
9
 
10
10
  ## 🎯 Features
11
11
 
12
- - **20 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis
12
+ - **22 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis, local-LLM extraction (Ollama)
13
13
  - **Free Tier**: 1,000 credits to get started instantly
14
14
  - **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
15
15
  - **Enterprise Ready**: Scale up with paid plans for production use
@@ -112,6 +112,8 @@ Restart Cursor to activate.
112
112
  - `summarize_content` - Generate intelligent summaries
113
113
  - `analyze_content` - Comprehensive content analysis
114
114
  - `extract_structured` - LLM-powered schema-driven extraction
115
+ - `extract_with_llm` - Natural-language extraction. **Defaults to a local Ollama model — no API key, no API costs.** Pass `provider: "openai" | "anthropic"` with the matching key for cloud models.
116
+ - `list_ollama_models` - List the Ollama models installed locally (free; helps you pick a `model` for `extract_with_llm`)
115
117
  - `track_changes` - Monitor content changes over time
116
118
 
117
119
  ### Premium Tools (5-10 credits)
@@ -138,7 +140,7 @@ Restart Cursor to activate.
138
140
  | **Enterprise** | 250,000 | Large scale operations |
139
141
 
140
142
  **All plans include:**
141
- - Access to all 20 tools
143
+ - Access to all 22 tools
142
144
  - Credits never expire and roll over month-to-month
143
145
  - API access and webhook notifications
144
146
 
@@ -155,6 +157,30 @@ export CRAWLFORGE_API_KEY="cf_live_your_api_key_here"
155
157
  # Optional: Custom API endpoint (for enterprise)
156
158
  export CRAWLFORGE_API_URL="https://api.crawlforge.dev"
157
159
  # As of v3.0.18, this variable is validated against an allow-list of CrawlForge backend hosts.
160
+
161
+ # Optional: Local LLM (Ollama) overrides — extract_with_llm defaults to Ollama
162
+ export OLLAMA_BASE_URL="http://localhost:11434" # default
163
+ export OLLAMA_DEFAULT_MODEL="llama3.2" # default; any locally-pulled model name works
164
+
165
+ # Optional: Cloud LLM keys — only needed when you pass provider: "openai" or "anthropic"
166
+ export OPENAI_API_KEY="sk-..."
167
+ export ANTHROPIC_API_KEY="sk-ant-..."
168
+ ```
169
+
170
+ ### Local-LLM quickstart (`extract_with_llm` with Ollama)
171
+
172
+ `extract_with_llm` defaults to a local Ollama model — no API key, no API costs, no data leaving your machine.
173
+
174
+ ```bash
175
+ # 1. Install Ollama: https://ollama.com
176
+ # 2. Pull any model from https://ollama.com/library
177
+ ollama pull llama3.2
178
+
179
+ # 3. Discover what's installed (from your MCP client)
180
+ # list_ollama_models()
181
+
182
+ # 4. Extract — defaults to Ollama with the model from step 2
183
+ # extract_with_llm({ url: "https://example.com", prompt: "…", model: "llama3.2" })
158
184
  ```
159
185
 
160
186
  ### Manual Configuration
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "crawlforge-mcp-server",
3
- "version": "3.4.0",
4
- "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 21 comprehensive web scraping, crawling, and content processing tools.",
3
+ "version": "3.5.1",
4
+ "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 22 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in.",
5
5
  "main": "server.js",
6
6
  "bin": {
7
7
  "crawlforge": "server.js",
package/server.js CHANGED
@@ -17,6 +17,7 @@ import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
17
17
  import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
18
18
  import { ExtractStructuredTool } from "./src/tools/extract/extractStructured.js";
19
19
  import { ExtractWithLlm } from "./src/tools/extract/extractWithLlm.js";
20
+ import { ListOllamaModelsTool } from "./src/tools/extract/listOllamaModels.js";
20
21
  import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
21
22
  import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
22
23
  import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
@@ -89,8 +90,8 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
89
90
  // Create the server
90
91
  const server = new McpServer({
91
92
  name: "crawlforge",
92
- version: "3.2.0",
93
- description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
93
+ version: "3.5.1",
94
+ description: "Production-ready MCP server with 21 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
94
95
  homepage: "https://www.crawlforge.dev",
95
96
  icon: "https://www.crawlforge.dev/icon.png"
96
97
  });
@@ -104,7 +105,7 @@ server.prompt("getting-started", {
104
105
  role: "user",
105
106
  content: {
106
107
  type: "text",
107
- text: "You have access to CrawlForge MCP with 21 web scraping tools. Key tools:\n\n" +
108
+ text: "You have access to CrawlForge MCP with 22 web scraping tools. Key tools:\n\n" +
108
109
  "- fetch_url: Fetch raw HTML/content from any URL\n" +
109
110
  "- extract_text: Extract clean text from a webpage\n" +
110
111
  "- extract_content: Smart content extraction with readability\n" +
@@ -116,7 +117,8 @@ server.prompt("getting-started", {
116
117
  "- deep_research: Multi-source research on any topic\n" +
117
118
  "- stealth_mode: Anti-detection browsing for protected sites\n" +
118
119
  "- extract_structured: LLM-powered structured data extraction\n" +
119
- "- extract_with_llm: Natural-language extraction via OpenAI/Anthropic\n" +
120
+ "- extract_with_llm: Natural-language extraction defaults to local Ollama (no API key); openai/anthropic available with key\n" +
121
+ "- list_ollama_models: List installed Ollama models so you can pick one for extract_with_llm\n" +
120
122
  "- track_changes: Monitor website changes over time\n" +
121
123
  "- generate_llms_txt: Generate llms.txt for any website\n\n" +
122
124
  "Workflow: search_web -> fetch_url -> extract_content -> analyze_content\n\n" +
@@ -146,6 +148,7 @@ const summarizeContentTool = new SummarizeContentTool();
146
148
  const analyzeContentTool = new AnalyzeContentTool();
147
149
  const extractStructuredTool = new ExtractStructuredTool();
148
150
  const extractWithLlmTool = new ExtractWithLlm();
151
+ const listOllamaModelsTool = new ListOllamaModelsTool();
149
152
  const batchScrapeTool = new BatchScrapeTool();
150
153
  const scrapeWithActionsTool = new ScrapeWithActionsTool();
151
154
  const deepResearchTool = new DeepResearchTool();
@@ -395,15 +398,15 @@ server.registerTool("extract_structured", {
395
398
 
396
399
  // Tool: extract_with_llm
397
400
  server.registerTool("extract_with_llm", {
398
- description: "Extract structured data from a URL or text using a natural-language prompt. Supports OpenAI, Anthropic, or a local Ollama model. Cloud providers require OPENAI_API_KEY or ANTHROPIC_API_KEY; Ollama requires no key (set provider: \"ollama\" with a running `ollama serve` on http://localhost:11434).",
401
+ description: "Extract structured data from a URL or text using a natural-language prompt. Defaults to a local Ollama model (http://localhost:11434, no API key required) — call list_ollama_models first to see what's installed and pass the name via the `model` parameter. Pass provider: \"openai\" or \"anthropic\" with the matching API key to use a cloud model instead.",
399
402
  annotations: { title: "Extract With LLM", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
400
403
  inputSchema: {
401
404
  url: z.string().url().optional().describe("URL to fetch and extract from (one of url/content required)"),
402
405
  content: z.string().optional().describe("Pre-fetched text to extract from (one of url/content required)"),
403
406
  prompt: z.string().describe("Natural-language extraction instruction"),
404
407
  schema: z.record(z.unknown()).optional().describe("Optional JSON-schema for output shape (used as Ollama structured-outputs format when provider is 'ollama')"),
405
- provider: z.enum(["openai", "anthropic", "ollama", "auto"]).optional().default("auto").describe("LLM provider. Use 'ollama' for a local model on http://localhost:11434"),
406
- model: z.string().optional().describe("Override default model (e.g. 'llama3.2' for ollama)"),
408
+ provider: z.enum(["openai", "anthropic", "ollama", "auto"]).optional().default("auto").describe("LLM provider. Defaults to 'ollama' (local, no key, http://localhost:11434). Use 'openai' or 'anthropic' for cloud models (requires the matching API key)."),
409
+ model: z.string().optional().describe("Override the model. For ollama, pass a name returned by list_ollama_models (e.g. 'llama3.2', 'qwen2.5:7b'). Defaults: openai='gpt-4o-mini', anthropic='claude-haiku-4-5-20251001', ollama='llama3.2' or $OLLAMA_DEFAULT_MODEL."),
407
410
  maxTokens: z.number().optional().default(4096).describe("Maximum output tokens")
408
411
  }
409
412
  }, withAuth("extract_with_llm", async (params) => {
@@ -415,6 +418,23 @@ server.registerTool("extract_with_llm", {
415
418
  }
416
419
  }));
417
420
 
421
+ // Tool: list_ollama_models
422
+ server.registerTool("list_ollama_models", {
423
+ description: "List the Ollama models installed locally on this machine. Use this to discover which `model` values you can pass to extract_with_llm. Requires Ollama running on http://localhost:11434 (or $OLLAMA_BASE_URL).",
424
+ annotations: { title: "List Ollama Models", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
425
+ inputSchema: {}
426
+ }, withAuth("list_ollama_models", async () => {
427
+ try {
428
+ const result = await listOllamaModelsTool.execute();
429
+ return {
430
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
431
+ isError: !result.success
432
+ };
433
+ } catch (error) {
434
+ return { content: [{ type: "text", text: `Listing Ollama models failed: ${error.message}` }], isError: true };
435
+ }
436
+ }));
437
+
418
438
  // Tool: batch_scrape
419
439
  server.registerTool("batch_scrape", {
420
440
  description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
@@ -918,7 +938,9 @@ const useLegacyHttp = process.argv.includes('--legacy-http') || process.env.CRAW
918
938
 
919
939
  async function runServer() {
920
940
  if (useHttp) {
921
- const port = parseInt(process.env.PORT || '3000', 10);
941
+ // Default to 10000 to match Render's default port-scan target and the
942
+ // Dockerfile `EXPOSE 10000`. Most PaaS providers inject $PORT — we honor it.
943
+ const port = parseInt(process.env.PORT || '10000', 10);
922
944
 
923
945
  if (useLegacyHttp) {
924
946
  // One-release deprecation window for stateless legacy transport.
@@ -28,7 +28,7 @@ import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/
28
28
  import { createServer } from 'node:http';
29
29
  import { randomUUID } from 'node:crypto';
30
30
 
31
- const SERVER_VERSION = '3.2.0';
31
+ const SERVER_VERSION = '3.5.1';
32
32
 
33
33
  /**
34
34
  * Stateful, session-aware Streamable HTTP transport.
@@ -160,11 +160,11 @@ export async function connectStreamableHttp(server, authManager, logger, options
160
160
  await new Promise((resolve) => {
161
161
  httpServer.listen(port, host, () => {
162
162
  const actual = httpServer.address()?.port ?? port;
163
- console.error(`CrawlForge MCP Server v${SERVER_VERSION} running on Streamable HTTP (${mode}) port ${actual}`);
164
- console.error(`MCP endpoint: http://localhost:${actual}/mcp`);
165
- console.error(`Health check: http://localhost:${actual}/health`);
166
- if (metrics) console.error(`Metrics: http://localhost:${actual}/metrics`);
167
- if (oauthProvider) console.error(`OAuth: http://localhost:${actual}/.well-known/oauth-authorization-server`);
163
+ console.error(`CrawlForge MCP Server v${SERVER_VERSION} listening on ${host}:${actual} (Streamable HTTP, ${mode})`);
164
+ console.error(`MCP endpoint: http://${host}:${actual}/mcp`);
165
+ console.error(`Health check: http://${host}:${actual}/health`);
166
+ if (metrics) console.error(`Metrics: http://${host}:${actual}/metrics`);
167
+ if (oauthProvider) console.error(`OAuth discovery: http://${host}:${actual}/.well-known/oauth-authorization-server`);
168
168
  resolve();
169
169
  });
170
170
  });
@@ -1,10 +1,10 @@
1
1
  /**
2
2
  * Extract With LLM MCP Tool
3
- * Natural-language extraction powered by OpenAI, Anthropic, or a local Ollama model.
4
- * Mirrors ScrapeGraphAI positioning: describe what you want, get structured JSON back.
3
+ * Natural-language extraction powered by a local Ollama model (default) or
4
+ * a cloud provider (OpenAI / Anthropic, explicit opt-in).
5
5
  *
6
- * Cloud providers require OPENAI_API_KEY or ANTHROPIC_API_KEY in environment.
7
- * Ollama requires no API key — just a running `ollama serve` on http://localhost:11434.
6
+ * Default: provider 'auto' resolves to Ollama at http://localhost:11434, no API key required.
7
+ * Pass provider: "openai" | "anthropic" with the matching API key to use a cloud model.
8
8
  */
9
9
 
10
10
  import { fetchAndParse } from './_fetchAndParse.js';
@@ -36,34 +36,24 @@ function ollamaBaseUrl() {
36
36
  * @returns {{ provider: 'openai'|'anthropic'|'ollama', apiKey: string|null }}
37
37
  */
38
38
  function resolveProvider(provider) {
39
- const anthropicKey = process.env.ANTHROPIC_API_KEY;
40
- const openaiKey = process.env.OPENAI_API_KEY;
41
- const ollamaOptIn = !!process.env.OLLAMA_BASE_URL;
42
-
43
- if (provider === 'auto') {
44
- if (anthropicKey) return { provider: 'anthropic', apiKey: anthropicKey };
45
- if (openaiKey) return { provider: 'openai', apiKey: openaiKey };
46
- if (ollamaOptIn) return { provider: 'ollama', apiKey: null };
47
- throw new Error(
48
- 'extract_with_llm requires OPENAI_API_KEY, ANTHROPIC_API_KEY, or OLLAMA_BASE_URL in environment ' +
49
- '(or pass provider: "ollama" explicitly to use a local Ollama server)'
50
- );
39
+ if (provider === 'auto' || provider === 'ollama') {
40
+ // Local Ollama is the default. No API key required; OLLAMA_BASE_URL is
41
+ // an optional override (defaults to http://localhost:11434).
42
+ return { provider: 'ollama', apiKey: null };
51
43
  }
52
44
 
53
45
  if (provider === 'anthropic') {
46
+ const anthropicKey = process.env.ANTHROPIC_API_KEY;
54
47
  if (!anthropicKey) throw new Error('extract_with_llm: ANTHROPIC_API_KEY is not set');
55
48
  return { provider: 'anthropic', apiKey: anthropicKey };
56
49
  }
57
50
 
58
51
  if (provider === 'openai') {
52
+ const openaiKey = process.env.OPENAI_API_KEY;
59
53
  if (!openaiKey) throw new Error('extract_with_llm: OPENAI_API_KEY is not set');
60
54
  return { provider: 'openai', apiKey: openaiKey };
61
55
  }
62
56
 
63
- if (provider === 'ollama') {
64
- return { provider: 'ollama', apiKey: null };
65
- }
66
-
67
57
  throw new Error(`extract_with_llm: unknown provider "${provider}"`);
68
58
  }
69
59
 
@@ -0,0 +1,66 @@
1
+ /**
2
+ * List Ollama Models MCP Tool
3
+ * Returns the models installed on the local Ollama server (GET /api/tags).
4
+ * Used to discover names that can be passed as the `model` parameter to extract_with_llm.
5
+ */
6
+
7
+ function ollamaBaseUrl() {
8
+ return (process.env.OLLAMA_BASE_URL || 'http://localhost:11434').replace(/\/$/, '');
9
+ }
10
+
11
+ export class ListOllamaModelsTool {
12
+ async execute() {
13
+ const baseUrl = ollamaBaseUrl();
14
+ const url = `${baseUrl}/api/tags`;
15
+
16
+ let response;
17
+ try {
18
+ response = await fetch(url, { signal: AbortSignal.timeout(10_000) });
19
+ } catch (err) {
20
+ return {
21
+ success: false,
22
+ baseUrl,
23
+ error:
24
+ `Could not reach Ollama at ${url}: ${err.message}. ` +
25
+ `Install from https://ollama.com and run "ollama serve".`
26
+ };
27
+ }
28
+
29
+ if (!response.ok) {
30
+ return {
31
+ success: false,
32
+ baseUrl,
33
+ error: `Ollama responded ${response.status} at ${url}. Is "ollama serve" running?`
34
+ };
35
+ }
36
+
37
+ let data;
38
+ try {
39
+ data = await response.json();
40
+ } catch (err) {
41
+ return { success: false, baseUrl, error: `Invalid JSON from Ollama: ${err.message}` };
42
+ }
43
+
44
+ const models = (data.models || []).map((m) => ({
45
+ name: m.name,
46
+ size_bytes: m.size,
47
+ modified_at: m.modified_at,
48
+ family: m.details?.family,
49
+ parameter_size: m.details?.parameter_size,
50
+ quantization: m.details?.quantization_level
51
+ }));
52
+
53
+ return {
54
+ success: true,
55
+ baseUrl,
56
+ count: models.length,
57
+ models,
58
+ hint:
59
+ models.length === 0
60
+ ? 'No models installed. Run "ollama pull llama3.2" (or any model from https://ollama.com/library) in your terminal.'
61
+ : 'Pass any of these names as the `model` parameter to extract_with_llm.'
62
+ };
63
+ }
64
+ }
65
+
66
+ export default ListOllamaModelsTool;