crawlforge-mcp-server 3.3.1 → 3.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,7 +9,7 @@ Professional web scraping and content extraction server implementing the Model C
9
9
 
10
10
  ## 🎯 Features
11
11
 
12
- - **20 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis
12
+ - **22 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis, local-LLM extraction (Ollama)
13
13
  - **Free Tier**: 1,000 credits to get started instantly
14
14
  - **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
15
15
  - **Enterprise Ready**: Scale up with paid plans for production use
@@ -112,6 +112,8 @@ Restart Cursor to activate.
112
112
  - `summarize_content` - Generate intelligent summaries
113
113
  - `analyze_content` - Comprehensive content analysis
114
114
  - `extract_structured` - LLM-powered schema-driven extraction
115
+ - `extract_with_llm` - Natural-language extraction. **Defaults to a local Ollama model — no API key, no API costs.** Pass `provider: "openai" | "anthropic"` with the matching key for cloud models.
116
+ - `list_ollama_models` - List the Ollama models installed locally (free; helps you pick a `model` for `extract_with_llm`)
115
117
  - `track_changes` - Monitor content changes over time
116
118
 
117
119
  ### Premium Tools (5-10 credits)
@@ -138,7 +140,7 @@ Restart Cursor to activate.
138
140
  | **Enterprise** | 250,000 | Large scale operations |
139
141
 
140
142
  **All plans include:**
141
- - Access to all 20 tools
143
+ - Access to all 22 tools
142
144
  - Credits never expire and roll over month-to-month
143
145
  - API access and webhook notifications
144
146
 
@@ -155,6 +157,30 @@ export CRAWLFORGE_API_KEY="cf_live_your_api_key_here"
155
157
  # Optional: Custom API endpoint (for enterprise)
156
158
  export CRAWLFORGE_API_URL="https://api.crawlforge.dev"
157
159
  # As of v3.0.18, this variable is validated against an allow-list of CrawlForge backend hosts.
160
+
161
+ # Optional: Local LLM (Ollama) overrides — extract_with_llm defaults to Ollama
162
+ export OLLAMA_BASE_URL="http://localhost:11434" # default
163
+ export OLLAMA_DEFAULT_MODEL="llama3.2" # default; any locally-pulled model name works
164
+
165
+ # Optional: Cloud LLM keys — only needed when you pass provider: "openai" or "anthropic"
166
+ export OPENAI_API_KEY="sk-..."
167
+ export ANTHROPIC_API_KEY="sk-ant-..."
168
+ ```
169
+
170
+ ### Local-LLM quickstart (`extract_with_llm` with Ollama)
171
+
172
+ `extract_with_llm` defaults to a local Ollama model — no API key, no API costs, no data leaving your machine.
173
+
174
+ ```bash
175
+ # 1. Install Ollama: https://ollama.com
176
+ # 2. Pull any model from https://ollama.com/library
177
+ ollama pull llama3.2
178
+
179
+ # 3. Discover what's installed (from your MCP client)
180
+ # list_ollama_models()
181
+
182
+ # 4. Extract — defaults to Ollama with the model from step 2
183
+ # extract_with_llm({ url: "https://example.com", prompt: "…", model: "llama3.2" })
158
184
  ```
159
185
 
160
186
  ### Manual Configuration
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "crawlforge-mcp-server",
3
- "version": "3.3.1",
4
- "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 21 comprehensive web scraping, crawling, and content processing tools.",
3
+ "version": "3.5.1",
4
+ "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 22 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in.",
5
5
  "main": "server.js",
6
6
  "bin": {
7
7
  "crawlforge": "server.js",
package/server.js CHANGED
@@ -17,6 +17,7 @@ import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
17
17
  import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
18
18
  import { ExtractStructuredTool } from "./src/tools/extract/extractStructured.js";
19
19
  import { ExtractWithLlm } from "./src/tools/extract/extractWithLlm.js";
20
+ import { ListOllamaModelsTool } from "./src/tools/extract/listOllamaModels.js";
20
21
  import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
21
22
  import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
22
23
  import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
@@ -89,8 +90,8 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
89
90
  // Create the server
90
91
  const server = new McpServer({
91
92
  name: "crawlforge",
92
- version: "3.2.0",
93
- description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
93
+ version: "3.5.1",
94
+ description: "Production-ready MCP server with 21 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
94
95
  homepage: "https://www.crawlforge.dev",
95
96
  icon: "https://www.crawlforge.dev/icon.png"
96
97
  });
@@ -104,7 +105,7 @@ server.prompt("getting-started", {
104
105
  role: "user",
105
106
  content: {
106
107
  type: "text",
107
- text: "You have access to CrawlForge MCP with 21 web scraping tools. Key tools:\n\n" +
108
+ text: "You have access to CrawlForge MCP with 22 web scraping tools. Key tools:\n\n" +
108
109
  "- fetch_url: Fetch raw HTML/content from any URL\n" +
109
110
  "- extract_text: Extract clean text from a webpage\n" +
110
111
  "- extract_content: Smart content extraction with readability\n" +
@@ -116,7 +117,8 @@ server.prompt("getting-started", {
116
117
  "- deep_research: Multi-source research on any topic\n" +
117
118
  "- stealth_mode: Anti-detection browsing for protected sites\n" +
118
119
  "- extract_structured: LLM-powered structured data extraction\n" +
119
- "- extract_with_llm: Natural-language extraction via OpenAI/Anthropic\n" +
120
+ "- extract_with_llm: Natural-language extraction defaults to local Ollama (no API key); openai/anthropic available with key\n" +
121
+ "- list_ollama_models: List installed Ollama models so you can pick one for extract_with_llm\n" +
120
122
  "- track_changes: Monitor website changes over time\n" +
121
123
  "- generate_llms_txt: Generate llms.txt for any website\n\n" +
122
124
  "Workflow: search_web -> fetch_url -> extract_content -> analyze_content\n\n" +
@@ -146,6 +148,7 @@ const summarizeContentTool = new SummarizeContentTool();
146
148
  const analyzeContentTool = new AnalyzeContentTool();
147
149
  const extractStructuredTool = new ExtractStructuredTool();
148
150
  const extractWithLlmTool = new ExtractWithLlm();
151
+ const listOllamaModelsTool = new ListOllamaModelsTool();
149
152
  const batchScrapeTool = new BatchScrapeTool();
150
153
  const scrapeWithActionsTool = new ScrapeWithActionsTool();
151
154
  const deepResearchTool = new DeepResearchTool();
@@ -395,15 +398,15 @@ server.registerTool("extract_structured", {
395
398
 
396
399
  // Tool: extract_with_llm
397
400
  server.registerTool("extract_with_llm", {
398
- description: "Extract structured data from a URL or text using a natural-language prompt, powered by OpenAI or Anthropic. Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in the environment.",
401
+ description: "Extract structured data from a URL or text using a natural-language prompt. Defaults to a local Ollama model (http://localhost:11434, no API key required) — call list_ollama_models first to see what's installed and pass the name via the `model` parameter. Pass provider: \"openai\" or \"anthropic\" with the matching API key to use a cloud model instead.",
399
402
  annotations: { title: "Extract With LLM", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
400
403
  inputSchema: {
401
404
  url: z.string().url().optional().describe("URL to fetch and extract from (one of url/content required)"),
402
405
  content: z.string().optional().describe("Pre-fetched text to extract from (one of url/content required)"),
403
406
  prompt: z.string().describe("Natural-language extraction instruction"),
404
- schema: z.record(z.unknown()).optional().describe("Optional JSON-schema-like hint for output shape"),
405
- provider: z.enum(["openai", "anthropic", "auto"]).optional().default("auto").describe("LLM provider"),
406
- model: z.string().optional().describe("Override default model"),
407
+ schema: z.record(z.unknown()).optional().describe("Optional JSON-schema for output shape (used as Ollama structured-outputs format when provider is 'ollama')"),
408
+ provider: z.enum(["openai", "anthropic", "ollama", "auto"]).optional().default("auto").describe("LLM provider. Defaults to 'ollama' (local, no key, http://localhost:11434). Use 'openai' or 'anthropic' for cloud models (requires the matching API key)."),
409
+ model: z.string().optional().describe("Override the model. For ollama, pass a name returned by list_ollama_models (e.g. 'llama3.2', 'qwen2.5:7b'). Defaults: openai='gpt-4o-mini', anthropic='claude-haiku-4-5-20251001', ollama='llama3.2' or $OLLAMA_DEFAULT_MODEL."),
407
410
  maxTokens: z.number().optional().default(4096).describe("Maximum output tokens")
408
411
  }
409
412
  }, withAuth("extract_with_llm", async (params) => {
@@ -415,6 +418,23 @@ server.registerTool("extract_with_llm", {
415
418
  }
416
419
  }));
417
420
 
421
+ // Tool: list_ollama_models
422
+ server.registerTool("list_ollama_models", {
423
+ description: "List the Ollama models installed locally on this machine. Use this to discover which `model` values you can pass to extract_with_llm. Requires Ollama running on http://localhost:11434 (or $OLLAMA_BASE_URL).",
424
+ annotations: { title: "List Ollama Models", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
425
+ inputSchema: {}
426
+ }, withAuth("list_ollama_models", async () => {
427
+ try {
428
+ const result = await listOllamaModelsTool.execute();
429
+ return {
430
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
431
+ isError: !result.success
432
+ };
433
+ } catch (error) {
434
+ return { content: [{ type: "text", text: `Listing Ollama models failed: ${error.message}` }], isError: true };
435
+ }
436
+ }));
437
+
418
438
  // Tool: batch_scrape
419
439
  server.registerTool("batch_scrape", {
420
440
  description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
@@ -918,7 +938,9 @@ const useLegacyHttp = process.argv.includes('--legacy-http') || process.env.CRAW
918
938
 
919
939
  async function runServer() {
920
940
  if (useHttp) {
921
- const port = parseInt(process.env.PORT || '3000', 10);
941
+ // Default to 10000 to match Render's default port-scan target and the
942
+ // Dockerfile `EXPOSE 10000`. Most PaaS providers inject $PORT — we honor it.
943
+ const port = parseInt(process.env.PORT || '10000', 10);
922
944
 
923
945
  if (useLegacyHttp) {
924
946
  // One-release deprecation window for stateless legacy transport.
@@ -28,7 +28,7 @@ import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/
28
28
  import { createServer } from 'node:http';
29
29
  import { randomUUID } from 'node:crypto';
30
30
 
31
- const SERVER_VERSION = '3.2.0';
31
+ const SERVER_VERSION = '3.5.1';
32
32
 
33
33
  /**
34
34
  * Stateful, session-aware Streamable HTTP transport.
@@ -160,11 +160,11 @@ export async function connectStreamableHttp(server, authManager, logger, options
160
160
  await new Promise((resolve) => {
161
161
  httpServer.listen(port, host, () => {
162
162
  const actual = httpServer.address()?.port ?? port;
163
- console.error(`CrawlForge MCP Server v${SERVER_VERSION} running on Streamable HTTP (${mode}) port ${actual}`);
164
- console.error(`MCP endpoint: http://localhost:${actual}/mcp`);
165
- console.error(`Health check: http://localhost:${actual}/health`);
166
- if (metrics) console.error(`Metrics: http://localhost:${actual}/metrics`);
167
- if (oauthProvider) console.error(`OAuth: http://localhost:${actual}/.well-known/oauth-authorization-server`);
163
+ console.error(`CrawlForge MCP Server v${SERVER_VERSION} listening on ${host}:${actual} (Streamable HTTP, ${mode})`);
164
+ console.error(`MCP endpoint: http://${host}:${actual}/mcp`);
165
+ console.error(`Health check: http://${host}:${actual}/health`);
166
+ if (metrics) console.error(`Metrics: http://${host}:${actual}/metrics`);
167
+ if (oauthProvider) console.error(`OAuth discovery: http://${host}:${actual}/.well-known/oauth-authorization-server`);
168
168
  resolve();
169
169
  });
170
170
  });
@@ -1,10 +1,10 @@
1
1
  /**
2
2
  * Extract With LLM MCP Tool
3
- * Natural-language extraction powered by OpenAI or Anthropic.
4
- * Mirrors ScrapeGraphAI positioning: describe what you want, get structured JSON back.
3
+ * Natural-language extraction powered by a local Ollama model (default) or
4
+ * a cloud provider (OpenAI / Anthropic, explicit opt-in).
5
5
  *
6
- * Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment.
7
- * Gate: tool throws a clear error when neither key is present.
6
+ * Default: provider 'auto' resolves to Ollama at http://localhost:11434; no API key required.
7
+ * Pass provider: "openai" | "anthropic" with the matching API key to use a cloud model.
8
8
  */
9
9
 
10
10
  import { fetchAndParse } from './_fetchAndParse.js';
@@ -15,6 +15,7 @@ const MAX_INPUT_CHARS = 50_000;
15
15
 
16
16
  const OPENAI_DEFAULT_MODEL = 'gpt-4o-mini';
17
17
  const ANTHROPIC_DEFAULT_MODEL = 'claude-haiku-4-5-20251001';
18
+ const OLLAMA_DEFAULT_MODEL = 'llama3.2';
18
19
 
19
20
  // Support test-time overrides so the test suite can stub endpoints.
20
21
  function openaiBaseUrl() {
@@ -23,32 +24,32 @@ function openaiBaseUrl() {
23
24
  function anthropicBaseUrl() {
24
25
  return (process.env.ANTHROPIC_BASE_URL || 'https://api.anthropic.com').replace(/\/$/, '');
25
26
  }
27
/**
 * Resolve the base URL of the Ollama server.
 *
 * Reads the OLLAMA_BASE_URL environment variable. Surrounding whitespace and
 * every trailing slash are removed so callers can append paths such as
 * `/api/chat` without producing `//`. When the variable is unset, blank, or
 * consists only of slashes, the local default is used.
 *
 * @returns {string} Base URL without a trailing slash.
 */
function ollamaBaseUrl() {
  const configured = (process.env.OLLAMA_BASE_URL ?? '').trim().replace(/\/+$/, '');
  return configured || 'http://localhost:11434';
}
26
30
 
27
31
  // ── Helpers ───────────────────────────────────────────────────────────────────
28
32
 
29
33
  /**
30
34
  * Resolve which provider to use.
31
- * @param {'openai'|'anthropic'|'auto'} provider
32
- * @returns {{ provider: 'openai'|'anthropic', apiKey: string }}
35
+ * @param {'openai'|'anthropic'|'ollama'|'auto'} provider
36
+ * @returns {{ provider: 'openai'|'anthropic'|'ollama', apiKey: string|null }}
33
37
  */
34
38
  function resolveProvider(provider) {
35
- const anthropicKey = process.env.ANTHROPIC_API_KEY;
36
- const openaiKey = process.env.OPENAI_API_KEY;
37
-
38
- if (provider === 'auto') {
39
- if (anthropicKey) return { provider: 'anthropic', apiKey: anthropicKey };
40
- if (openaiKey) return { provider: 'openai', apiKey: openaiKey };
41
- throw new Error(
42
- 'extract_with_llm requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment'
43
- );
39
+ if (provider === 'auto' || provider === 'ollama') {
40
+ // Local Ollama is the default. No API key required; OLLAMA_BASE_URL is
41
+ // an optional override (defaults to http://localhost:11434).
42
+ return { provider: 'ollama', apiKey: null };
44
43
  }
45
44
 
46
45
  if (provider === 'anthropic') {
46
+ const anthropicKey = process.env.ANTHROPIC_API_KEY;
47
47
  if (!anthropicKey) throw new Error('extract_with_llm: ANTHROPIC_API_KEY is not set');
48
48
  return { provider: 'anthropic', apiKey: anthropicKey };
49
49
  }
50
50
 
51
51
  if (provider === 'openai') {
52
+ const openaiKey = process.env.OPENAI_API_KEY;
52
53
  if (!openaiKey) throw new Error('extract_with_llm: OPENAI_API_KEY is not set');
53
54
  return { provider: 'openai', apiKey: openaiKey };
54
55
  }
@@ -157,12 +158,68 @@ async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTok
157
158
  return { rawText: content, usage, model: json.model || model };
158
159
  }
159
160
 
161
+ // ── Ollama call ───────────────────────────────────────────────────────────────
162
+
163
/**
 * Send one non-streaming chat request to the Ollama server and return its reply.
 *
 * The request goes to `<ollamaBaseUrl()>/api/chat` with temperature 0. A
 * non-empty `schema` object is forwarded as the `format` field; otherwise the
 * plain `'json'` format is requested.
 *
 * @param {object} args
 * @param {string} args.model - Ollama model name.
 * @param {string} args.systemMessage - Content of the system message.
 * @param {string} args.userMessage - Content of the user message.
 * @param {number} [args.maxTokens] - Forwarded as `options.num_predict`.
 * @param {object} [args.schema] - Optional output schema.
 * @returns {Promise<{ rawText: string, usage: { input_tokens: number, output_tokens: number }, model: string }>}
 * @throws {Error} When the server is unreachable, the request times out, the
 *   model is not pulled, the server answers with a non-2xx status, or the
 *   response body is not valid JSON. Wrapped errors keep the original as `cause`.
 */
async function callOllama({ model, systemMessage, userMessage, maxTokens, schema }) {
  const REQUEST_TIMEOUT_MS = 120_000;
  const baseUrl = ollamaBaseUrl();

  // Only a real, non-empty object is usable as a structured-output schema;
  // anything else (undefined, null, {}, a stray string) falls back to JSON mode.
  const hasSchema =
    schema !== null && typeof schema === 'object' && Object.keys(schema).length > 0;

  const body = {
    model,
    messages: [
      { role: 'system', content: systemMessage },
      { role: 'user', content: userMessage }
    ],
    stream: false,
    options: { num_predict: maxTokens, temperature: 0 },
    format: hasSchema ? schema : 'json'
  };

  let response;
  try {
    response = await fetch(`${baseUrl}/api/chat`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(body),
      signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS)
    });
  } catch (err) {
    // AbortSignal.timeout() aborts the fetch with a "TimeoutError"; report it
    // as a timeout instead of leaking the generic abort message.
    if (err?.name === 'TimeoutError') {
      throw new Error(
        `Ollama request timed out after ${REQUEST_TIMEOUT_MS / 1000}s at ${baseUrl} (model "${model}").`,
        { cause: err }
      );
    }

    const code = err?.cause?.code;
    const unreachable =
      code === 'ECONNREFUSED' ||
      code === 'ENOTFOUND' ||
      /ECONNREFUSED|ENOTFOUND|fetch failed/i.test(err?.message || '');
    if (unreachable) {
      throw new Error(
        `Ollama is not running at ${baseUrl}. ` +
        `Start it with "ollama serve" and pull a model: "ollama pull ${model}".`,
        { cause: err }
      );
    }
    throw err;
  }

  if (!response.ok) {
    // The error body is best-effort: a failed read must not mask the status.
    const errText = await response.text().catch(() => '');
    if (response.status === 404 && /model.*not found|pull/i.test(errText)) {
      throw new Error(
        `Ollama model "${model}" is not pulled. Run: "ollama pull ${model}"`
      );
    }
    throw new Error(`Ollama API error ${response.status}: ${errText.slice(0, 200)}`);
  }

  let json;
  try {
    json = await response.json();
  } catch (err) {
    throw new Error(`Ollama returned invalid JSON: ${err?.message ?? err}`, { cause: err });
  }

  return {
    rawText: json?.message?.content ?? '',
    usage: {
      input_tokens: json?.prompt_eval_count ?? 0,
      output_tokens: json?.eval_count ?? 0
    },
    model: json?.model || model
  };
}
213
+
160
214
  // ── LLM dispatch ─────────────────────────────────────────────────────────────
161
215
 
162
- async function callLLM({ provider, apiKey, model, systemMessage, userMessage, maxTokens }) {
216
+ async function callLLM({ provider, apiKey, model, systemMessage, userMessage, maxTokens, schema }) {
163
217
  if (provider === 'openai') {
164
218
  return callOpenAI({ apiKey, model, systemMessage, userMessage, maxTokens });
165
219
  }
220
+ if (provider === 'ollama') {
221
+ return callOllama({ model, systemMessage, userMessage, maxTokens, schema });
222
+ }
166
223
  return callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens });
167
224
  }
168
225
 
@@ -216,7 +273,10 @@ export class ExtractWithLlm {
216
273
  }
217
274
 
218
275
  const { provider, apiKey } = resolved;
219
- const defaultModel = provider === 'openai' ? OPENAI_DEFAULT_MODEL : ANTHROPIC_DEFAULT_MODEL;
276
+ const defaultModel =
277
+ provider === 'openai' ? OPENAI_DEFAULT_MODEL :
278
+ provider === 'ollama' ? (process.env.OLLAMA_DEFAULT_MODEL || OLLAMA_DEFAULT_MODEL) :
279
+ ANTHROPIC_DEFAULT_MODEL;
220
280
  const model = modelParam || defaultModel;
221
281
 
222
282
  // Step 1: Get text to extract from
@@ -241,7 +301,7 @@ export class ExtractWithLlm {
241
301
  let rawText, usage;
242
302
  try {
243
303
  ({ rawText, usage } = await callLLM({
244
- provider, apiKey, model, systemMessage, userMessage, maxTokens
304
+ provider, apiKey, model, systemMessage, userMessage, maxTokens, schema
245
305
  }));
246
306
  } catch (llmErr) {
247
307
  return { success: false, error: `LLM call failed: ${llmErr.message}` };
@@ -260,7 +320,7 @@ export class ExtractWithLlm {
260
320
  try {
261
321
  ({ rawText: retryRaw, usage: retryUsage } = await callLLM({
262
322
  provider, apiKey, model, systemMessage,
263
- userMessage: retryUserMessage, maxTokens
323
+ userMessage: retryUserMessage, maxTokens, schema
264
324
  }));
265
325
  // Merge usage
266
326
  usage = {
@@ -0,0 +1,66 @@
1
+ /**
2
+ * List Ollama Models MCP Tool
3
+ * Returns the models installed on the local Ollama server (GET /api/tags).
4
+ * Used to discover names that can be passed as the `model` parameter to extract_with_llm.
5
+ */
6
+
7
/**
 * Resolve the base URL of the Ollama server.
 *
 * Reads the OLLAMA_BASE_URL environment variable. Surrounding whitespace and
 * every trailing slash are removed so callers can append paths such as
 * `/api/tags` without producing `//`. When the variable is unset, blank, or
 * consists only of slashes, the local default is used.
 *
 * @returns {string} Base URL without a trailing slash.
 */
function ollamaBaseUrl() {
  const configured = (process.env.OLLAMA_BASE_URL ?? '').trim().replace(/\/+$/, '');
  return configured || 'http://localhost:11434';
}
10
+
11
/**
 * MCP tool that lists the models installed on the Ollama server.
 *
 * Failures are reported through the returned object (`success: false`) rather
 * than by throwing, so the caller can surface the message directly.
 */
export class ListOllamaModelsTool {
  /**
   * Query `GET <base>/api/tags` and summarise the installed models.
   *
   * @returns {Promise<object>} On success:
   *   `{ success: true, baseUrl, count, models, hint }`, where each model has
   *   `name`, `size_bytes`, `modified_at`, `family`, `parameter_size` and
   *   `quantization`. On failure: `{ success: false, baseUrl, error }`.
   */
  async execute() {
    const baseUrl = ollamaBaseUrl();
    const url = `${baseUrl}/api/tags`;

    let response;
    try {
      response = await fetch(url, { signal: AbortSignal.timeout(10_000) });
    } catch (err) {
      return {
        success: false,
        baseUrl,
        error:
          `Could not reach Ollama at ${url}: ${err?.message ?? err}. ` +
          `Install from https://ollama.com and run "ollama serve".`
      };
    }

    if (!response.ok) {
      return {
        success: false,
        baseUrl,
        error: `Ollama responded ${response.status} at ${url}. Is "ollama serve" running?`
      };
    }

    let data;
    try {
      data = await response.json();
    } catch (err) {
      return { success: false, baseUrl, error: `Invalid JSON from Ollama: ${err?.message ?? err}` };
    }

    // Be tolerant of unexpected payloads: a `null`/primitive body, a missing or
    // non-array `models` field, and null entries are treated as "no model"
    // instead of crashing with a TypeError.
    const rawModels = Array.isArray(data?.models) ? data.models : [];
    const models = rawModels
      .filter((entry) => entry !== null && typeof entry === 'object')
      .map((entry) => ({
        name: entry.name,
        size_bytes: entry.size,
        modified_at: entry.modified_at,
        family: entry.details?.family,
        parameter_size: entry.details?.parameter_size,
        quantization: entry.details?.quantization_level
      }));

    return {
      success: true,
      baseUrl,
      count: models.length,
      models,
      hint:
        models.length === 0
          ? 'No models installed. Run "ollama pull llama3.2" (or any model from https://ollama.com/library) in your terminal.'
          : 'Pass any of these names as the `model` parameter to extract_with_llm.'
    };
  }
}
65
+
66
+ export default ListOllamaModelsTool;