crawlforge-mcp-server 3.3.1 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/server.js +4 -4
- package/src/tools/extract/extractWithLlm.js +80 -10
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "3.3.1",
|
|
3
|
+
"version": "3.4.0",
|
|
4
4
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 21 comprehensive web scraping, crawling, and content processing tools.",
|
|
5
5
|
"main": "server.js",
|
|
6
6
|
"bin": {
|
package/server.js
CHANGED
|
@@ -395,15 +395,15 @@ server.registerTool("extract_structured", {
|
|
|
395
395
|
|
|
396
396
|
// Tool: extract_with_llm
|
|
397
397
|
server.registerTool("extract_with_llm", {
|
|
398
|
-
description: "Extract structured data from a URL or text using a natural-language prompt
|
|
398
|
+
description: "Extract structured data from a URL or text using a natural-language prompt. Supports OpenAI, Anthropic, or a local Ollama model. Cloud providers require OPENAI_API_KEY or ANTHROPIC_API_KEY; Ollama requires no key (set provider: \"ollama\" with a running `ollama serve` on http://localhost:11434).",
|
|
399
399
|
annotations: { title: "Extract With LLM", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
400
400
|
inputSchema: {
|
|
401
401
|
url: z.string().url().optional().describe("URL to fetch and extract from (one of url/content required)"),
|
|
402
402
|
content: z.string().optional().describe("Pre-fetched text to extract from (one of url/content required)"),
|
|
403
403
|
prompt: z.string().describe("Natural-language extraction instruction"),
|
|
404
|
-
schema: z.record(z.unknown()).optional().describe("Optional JSON-schema
|
|
405
|
-
provider: z.enum(["openai", "anthropic", "auto"]).optional().default("auto").describe("LLM provider"),
|
|
406
|
-
model: z.string().optional().describe("Override default model"),
|
|
404
|
+
schema: z.record(z.unknown()).optional().describe("Optional JSON-schema for output shape (used as Ollama structured-outputs format when provider is 'ollama')"),
|
|
405
|
+
provider: z.enum(["openai", "anthropic", "ollama", "auto"]).optional().default("auto").describe("LLM provider. Use 'ollama' for a local model on http://localhost:11434"),
|
|
406
|
+
model: z.string().optional().describe("Override default model (e.g. 'llama3.2' for ollama)"),
|
|
407
407
|
maxTokens: z.number().optional().default(4096).describe("Maximum output tokens")
|
|
408
408
|
}
|
|
409
409
|
}, withAuth("extract_with_llm", async (params) => {
|
package/src/tools/extract/extractWithLlm.js
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Extract With LLM MCP Tool
|
|
3
|
-
* Natural-language extraction powered by OpenAI or Anthropic.
|
|
3
|
+
* Natural-language extraction powered by OpenAI, Anthropic, or a local Ollama model.
|
|
4
4
|
* Mirrors ScrapeGraphAI positioning: describe what you want, get structured JSON back.
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
6
|
+
* Cloud providers require OPENAI_API_KEY or ANTHROPIC_API_KEY in environment.
|
|
7
|
+
* Ollama requires no API key — just a running `ollama serve` on http://localhost:11434.
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
10
|
import { fetchAndParse } from './_fetchAndParse.js';
|
|
@@ -15,6 +15,7 @@ const MAX_INPUT_CHARS = 50_000;
|
|
|
15
15
|
|
|
16
16
|
const OPENAI_DEFAULT_MODEL = 'gpt-4o-mini';
|
|
17
17
|
const ANTHROPIC_DEFAULT_MODEL = 'claude-haiku-4-5-20251001';
|
|
18
|
+
const OLLAMA_DEFAULT_MODEL = 'llama3.2';
|
|
18
19
|
|
|
19
20
|
// Support test-time overrides so the test suite can stub endpoints.
|
|
20
21
|
function openaiBaseUrl() {
|
|
@@ -23,23 +24,29 @@ function openaiBaseUrl() {
|
|
|
23
24
|
function anthropicBaseUrl() {
|
|
24
25
|
return (process.env.ANTHROPIC_BASE_URL || 'https://api.anthropic.com').replace(/\/$/, '');
|
|
25
26
|
}
|
|
27
|
+
function ollamaBaseUrl() {
|
|
28
|
+
return (process.env.OLLAMA_BASE_URL || 'http://localhost:11434').replace(/\/$/, '');
|
|
29
|
+
}
|
|
26
30
|
|
|
27
31
|
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
28
32
|
|
|
29
33
|
/**
|
|
30
34
|
* Resolve which provider to use.
|
|
31
|
-
* @param {'openai'|'anthropic'|'auto'} provider
|
|
32
|
-
* @returns {{ provider: 'openai'|'anthropic', apiKey: string }}
|
|
35
|
+
* @param {'openai'|'anthropic'|'ollama'|'auto'} provider
|
|
36
|
+
* @returns {{ provider: 'openai'|'anthropic'|'ollama', apiKey: string|null }}
|
|
33
37
|
*/
|
|
34
38
|
function resolveProvider(provider) {
|
|
35
39
|
const anthropicKey = process.env.ANTHROPIC_API_KEY;
|
|
36
40
|
const openaiKey = process.env.OPENAI_API_KEY;
|
|
41
|
+
const ollamaOptIn = !!process.env.OLLAMA_BASE_URL;
|
|
37
42
|
|
|
38
43
|
if (provider === 'auto') {
|
|
39
44
|
if (anthropicKey) return { provider: 'anthropic', apiKey: anthropicKey };
|
|
40
45
|
if (openaiKey) return { provider: 'openai', apiKey: openaiKey };
|
|
46
|
+
if (ollamaOptIn) return { provider: 'ollama', apiKey: null };
|
|
41
47
|
throw new Error(
|
|
42
|
-
'extract_with_llm requires OPENAI_API_KEY or ANTHROPIC_API_KEY in environment'
|
|
48
|
+
'extract_with_llm requires OPENAI_API_KEY, ANTHROPIC_API_KEY, or OLLAMA_BASE_URL in environment ' +
|
|
49
|
+
'(or pass provider: "ollama" explicitly to use a local Ollama server)'
|
|
43
50
|
);
|
|
44
51
|
}
|
|
45
52
|
|
|
@@ -53,6 +60,10 @@ function resolveProvider(provider) {
|
|
|
53
60
|
return { provider: 'openai', apiKey: openaiKey };
|
|
54
61
|
}
|
|
55
62
|
|
|
63
|
+
if (provider === 'ollama') {
|
|
64
|
+
return { provider: 'ollama', apiKey: null };
|
|
65
|
+
}
|
|
66
|
+
|
|
56
67
|
throw new Error(`extract_with_llm: unknown provider "${provider}"`);
|
|
57
68
|
}
|
|
58
69
|
|
|
@@ -157,12 +168,68 @@ async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTok
|
|
|
157
168
|
return { rawText: content, usage, model: json.model || model };
|
|
158
169
|
}
|
|
159
170
|
|
|
171
|
+
// ── Ollama call ───────────────────────────────────────────────────────────────
|
|
172
|
+
|
|
173
|
+
async function callOllama({ model, systemMessage, userMessage, maxTokens, schema }) {
|
|
174
|
+
const url = `${ollamaBaseUrl()}/api/chat`;
|
|
175
|
+
const body = {
|
|
176
|
+
model,
|
|
177
|
+
messages: [
|
|
178
|
+
{ role: 'system', content: systemMessage },
|
|
179
|
+
{ role: 'user', content: userMessage }
|
|
180
|
+
],
|
|
181
|
+
stream: false,
|
|
182
|
+
options: { num_predict: maxTokens, temperature: 0 },
|
|
183
|
+
format: (schema && Object.keys(schema).length > 0) ? schema : 'json'
|
|
184
|
+
};
|
|
185
|
+
|
|
186
|
+
let response;
|
|
187
|
+
try {
|
|
188
|
+
response = await fetch(url, {
|
|
189
|
+
method: 'POST',
|
|
190
|
+
headers: { 'Content-Type': 'application/json' },
|
|
191
|
+
body: JSON.stringify(body),
|
|
192
|
+
signal: AbortSignal.timeout(120_000)
|
|
193
|
+
});
|
|
194
|
+
} catch (err) {
|
|
195
|
+
const code = err?.cause?.code;
|
|
196
|
+
if (code === 'ECONNREFUSED' || code === 'ENOTFOUND' || /ECONNREFUSED|ENOTFOUND|fetch failed/i.test(err.message || '')) {
|
|
197
|
+
throw new Error(
|
|
198
|
+
`Ollama is not running at ${ollamaBaseUrl()}. ` +
|
|
199
|
+
`Start it with "ollama serve" and pull a model: "ollama pull ${model}".`
|
|
200
|
+
);
|
|
201
|
+
}
|
|
202
|
+
throw err;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
if (!response.ok) {
|
|
206
|
+
const errText = await response.text().catch(() => '');
|
|
207
|
+
if (response.status === 404 && /model.*not found|pull/i.test(errText)) {
|
|
208
|
+
throw new Error(
|
|
209
|
+
`Ollama model "${model}" is not pulled. Run: "ollama pull ${model}"`
|
|
210
|
+
);
|
|
211
|
+
}
|
|
212
|
+
throw new Error(`Ollama API error ${response.status}: ${errText.slice(0, 200)}`);
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
const json = await response.json();
|
|
216
|
+
const content = json.message?.content ?? '';
|
|
217
|
+
const usage = {
|
|
218
|
+
input_tokens: json.prompt_eval_count ?? 0,
|
|
219
|
+
output_tokens: json.eval_count ?? 0
|
|
220
|
+
};
|
|
221
|
+
return { rawText: content, usage, model: json.model || model };
|
|
222
|
+
}
|
|
223
|
+
|
|
160
224
|
// ── LLM dispatch ─────────────────────────────────────────────────────────────
|
|
161
225
|
|
|
162
|
-
async function callLLM({ provider, apiKey, model, systemMessage, userMessage, maxTokens }) {
|
|
226
|
+
async function callLLM({ provider, apiKey, model, systemMessage, userMessage, maxTokens, schema }) {
|
|
163
227
|
if (provider === 'openai') {
|
|
164
228
|
return callOpenAI({ apiKey, model, systemMessage, userMessage, maxTokens });
|
|
165
229
|
}
|
|
230
|
+
if (provider === 'ollama') {
|
|
231
|
+
return callOllama({ model, systemMessage, userMessage, maxTokens, schema });
|
|
232
|
+
}
|
|
166
233
|
return callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens });
|
|
167
234
|
}
|
|
168
235
|
|
|
@@ -216,7 +283,10 @@ export class ExtractWithLlm {
|
|
|
216
283
|
}
|
|
217
284
|
|
|
218
285
|
const { provider, apiKey } = resolved;
|
|
219
|
-
const defaultModel =
|
|
286
|
+
const defaultModel =
|
|
287
|
+
provider === 'openai' ? OPENAI_DEFAULT_MODEL :
|
|
288
|
+
provider === 'ollama' ? (process.env.OLLAMA_DEFAULT_MODEL || OLLAMA_DEFAULT_MODEL) :
|
|
289
|
+
ANTHROPIC_DEFAULT_MODEL;
|
|
220
290
|
const model = modelParam || defaultModel;
|
|
221
291
|
|
|
222
292
|
// Step 1: Get text to extract from
|
|
@@ -241,7 +311,7 @@ export class ExtractWithLlm {
|
|
|
241
311
|
let rawText, usage;
|
|
242
312
|
try {
|
|
243
313
|
({ rawText, usage } = await callLLM({
|
|
244
|
-
provider, apiKey, model, systemMessage, userMessage, maxTokens
|
|
314
|
+
provider, apiKey, model, systemMessage, userMessage, maxTokens, schema
|
|
245
315
|
}));
|
|
246
316
|
} catch (llmErr) {
|
|
247
317
|
return { success: false, error: `LLM call failed: ${llmErr.message}` };
|
|
@@ -260,7 +330,7 @@ export class ExtractWithLlm {
|
|
|
260
330
|
try {
|
|
261
331
|
({ rawText: retryRaw, usage: retryUsage } = await callLLM({
|
|
262
332
|
provider, apiKey, model, systemMessage,
|
|
263
|
-
userMessage: retryUserMessage, maxTokens
|
|
333
|
+
userMessage: retryUserMessage, maxTokens, schema
|
|
264
334
|
}));
|
|
265
335
|
// Merge usage
|
|
266
336
|
usage = {
|