crawlforge-mcp-server 4.2.12 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/server.js +138 -20
- package/src/constants/config.js +5 -0
- package/src/core/ActionExecutor.js +13 -1
- package/src/core/ChangeTracker.js +8 -5
- package/src/core/LLMsTxtAnalyzer.js +71 -47
- package/src/core/LocalizationManager.js +7 -4
- package/src/core/ResearchOrchestrator.js +10 -6
- package/src/core/StealthBrowserManager.js +52 -13
- package/src/core/analysis/ContentAnalyzer.js +2 -2
- package/src/core/crawlers/BFSCrawler.js +23 -12
- package/src/core/processing/ContentProcessor.js +19 -3
- package/src/core/processing/PDFProcessor.js +72 -23
- package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
- package/src/tools/advanced/batchScrape/index.js +3 -1
- package/src/tools/advanced/batchScrape/reporter.js +5 -1
- package/src/tools/advanced/batchScrape/worker.js +6 -1
- package/src/tools/basic/_fetch.js +78 -5
- package/src/tools/basic/extractLinks.js +1 -1
- package/src/tools/basic/extractMetadata.js +65 -1
- package/src/tools/basic/extractText.js +61 -5
- package/src/tools/basic/scrapeStructured.js +48 -10
- package/src/tools/crawl/crawlDeep.js +13 -5
- package/src/tools/crawl/mapSite.js +24 -51
- package/src/tools/extract/analyzeContent.js +11 -6
- package/src/tools/extract/extractContent.js +23 -5
- package/src/tools/extract/extractStructured.js +65 -16
- package/src/tools/extract/extractWithLlm.js +192 -11
- package/src/tools/extract/listOllamaModels.js +19 -8
- package/src/tools/extract/processDocument.js +10 -4
- package/src/tools/extract/summarizeContent.js +58 -1
- package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
- package/src/tools/research/deepResearch.js +43 -4
- package/src/tools/search/providers/searxng.js +2 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
- package/src/tools/search/ranking/ResultRanker.js +13 -4
- package/src/tools/search/searchWeb.js +5 -5
- package/src/tools/templates/TemplateRegistry.js +3 -2
- package/src/tools/tracking/trackChanges/differ.js +33 -1
- package/src/utils/htmlToMarkdown.js +5 -1
|
@@ -124,7 +124,7 @@ export class ExtractContentTool {
|
|
|
124
124
|
|
|
125
125
|
try {
|
|
126
126
|
const validated = ExtractContentSchema.parse(params);
|
|
127
|
-
const { url, options } = validated;
|
|
127
|
+
const { url, html: providedHtml, options } = validated;
|
|
128
128
|
|
|
129
129
|
const result = {
|
|
130
130
|
url,
|
|
@@ -133,10 +133,16 @@ export class ExtractContentTool {
|
|
|
133
133
|
processingTime: 0
|
|
134
134
|
};
|
|
135
135
|
|
|
136
|
-
// Step 1: Fetch content (with or without JavaScript rendering)
|
|
136
|
+
// Step 1: Fetch content (with or without JavaScript rendering).
|
|
137
|
+
// If pre-rendered HTML is supplied (e.g. post-action page from
|
|
138
|
+
// scrape_with_actions), use it directly and skip the network fetch.
|
|
137
139
|
let html, pageTitle;
|
|
140
|
+
if (providedHtml) {
|
|
141
|
+
html = providedHtml;
|
|
142
|
+
pageTitle = this.extractTitleFromHTML(html);
|
|
143
|
+
} else {
|
|
138
144
|
const shouldUseJavaScript = options.requiresJavaScript || await this.shouldUseJavaScript(url);
|
|
139
|
-
|
|
145
|
+
|
|
140
146
|
if (shouldUseJavaScript) {
|
|
141
147
|
console.error('Using browser rendering for JavaScript content...');
|
|
142
148
|
const browserResult = await this.browserProcessor.processURL({
|
|
@@ -162,7 +168,7 @@ export class ExtractContentTool {
|
|
|
162
168
|
headers: {
|
|
163
169
|
'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/3.0; Enhanced-Content-Extractor)'
|
|
164
170
|
},
|
|
165
|
-
|
|
171
|
+
signal: AbortSignal.timeout(15000)
|
|
166
172
|
});
|
|
167
173
|
|
|
168
174
|
if (!response.ok) {
|
|
@@ -172,6 +178,7 @@ export class ExtractContentTool {
|
|
|
172
178
|
html = await response.text();
|
|
173
179
|
pageTitle = this.extractTitleFromHTML(html);
|
|
174
180
|
}
|
|
181
|
+
}
|
|
175
182
|
|
|
176
183
|
result.title = pageTitle;
|
|
177
184
|
|
|
@@ -194,6 +201,9 @@ export class ExtractContentTool {
|
|
|
194
201
|
result.content = {
|
|
195
202
|
text: processingResult.readability.textContent || processingResult.readability.content,
|
|
196
203
|
};
|
|
204
|
+
result.extractionMethod = 'readability';
|
|
205
|
+
result.confidence = 0.9;
|
|
206
|
+
result.finalUrl = url;
|
|
197
207
|
|
|
198
208
|
// Convert to markdown if requested
|
|
199
209
|
if (options.outputFormat === 'markdown') {
|
|
@@ -203,6 +213,10 @@ export class ExtractContentTool {
|
|
|
203
213
|
result.content = {
|
|
204
214
|
text: processingResult.fallback_content.content
|
|
205
215
|
};
|
|
216
|
+
result.extractionMethod = 'fallback_boilerplate_removal';
|
|
217
|
+
result.fallback_reason = 'Readability did not detect an article; used boilerplate-removal fallback';
|
|
218
|
+
result.confidence = 0.5;
|
|
219
|
+
result.finalUrl = url;
|
|
206
220
|
} else {
|
|
207
221
|
// Last resort: extract text from HTML
|
|
208
222
|
result.content = {
|
|
@@ -213,6 +227,10 @@ export class ExtractContentTool {
|
|
|
213
227
|
includeImageAlt: true
|
|
214
228
|
})
|
|
215
229
|
};
|
|
230
|
+
result.extractionMethod = 'raw_body_text';
|
|
231
|
+
result.fallback_reason = 'Neither Readability nor boilerplate-removal yielded content; extracted raw body text';
|
|
232
|
+
result.confidence = 0.2;
|
|
233
|
+
result.finalUrl = url;
|
|
216
234
|
}
|
|
217
235
|
|
|
218
236
|
// Include HTML if requested
|
|
@@ -314,4 +332,4 @@ export class ExtractContentTool {
|
|
|
314
332
|
}
|
|
315
333
|
}
|
|
316
334
|
|
|
317
|
-
export default ExtractContentTool;
|
|
335
|
+
export default ExtractContentTool;
|
|
@@ -8,6 +8,11 @@ import { z } from 'zod';
|
|
|
8
8
|
import { ElicitationHelper } from '../../core/ElicitationHelper.js'; // D1.4
|
|
9
9
|
import { load } from 'cheerio';
|
|
10
10
|
import { LLMManager } from '../../core/llm/LLMManager.js';
|
|
11
|
+
import { createRequire } from 'module';
|
|
12
|
+
|
|
13
|
+
const _require = createRequire(import.meta.url);
|
|
14
|
+
const _pkg = _require('../../../package.json');
|
|
15
|
+
const CRAWLFORGE_UA = `CrawlForge/${_pkg.version} (+https://crawlforge.dev)`;
|
|
11
16
|
import { fetchAndParse } from './_fetchAndParse.js';
|
|
12
17
|
|
|
13
18
|
const ExtractStructuredSchema = z.object({
|
|
@@ -30,7 +35,7 @@ export class ExtractStructuredTool {
|
|
|
30
35
|
constructor(options = {}) {
|
|
31
36
|
this.llmManager = null;
|
|
32
37
|
this.llmConfig = options.llmConfig || {};
|
|
33
|
-
this.userAgent =
|
|
38
|
+
this.userAgent = CRAWLFORGE_UA;
|
|
34
39
|
// D1.4: Elicitation helper
|
|
35
40
|
this._elicitation = new ElicitationHelper({});
|
|
36
41
|
}
|
|
@@ -129,7 +134,8 @@ export class ExtractStructuredTool {
|
|
|
129
134
|
validation: {
|
|
130
135
|
valid: extractionResult.valid || false,
|
|
131
136
|
errors: extractionResult.validationErrors || []
|
|
132
|
-
}
|
|
137
|
+
},
|
|
138
|
+
extractionNotes: extractionResult.extractionNotes || []
|
|
133
139
|
};
|
|
134
140
|
|
|
135
141
|
} catch (error) {
|
|
@@ -156,20 +162,53 @@ export class ExtractStructuredTool {
|
|
|
156
162
|
let fieldsFound = 0;
|
|
157
163
|
|
|
158
164
|
for (const [key, fieldSchema] of Object.entries(properties)) {
|
|
165
|
+
const isArrayField = fieldSchema.type === 'array';
|
|
166
|
+
|
|
159
167
|
// Use explicit selector hint if provided
|
|
160
168
|
const selector = selectorHints[key];
|
|
161
169
|
if (selector) {
|
|
162
|
-
const
|
|
163
|
-
if (
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
170
|
+
const els = $(selector);
|
|
171
|
+
if (els.length > 0) {
|
|
172
|
+
if (isArrayField || els.length > 1) {
|
|
173
|
+
const values = els.map((_, el) => $(el).text().trim()).get().filter(Boolean);
|
|
174
|
+
if (values.length > 0) {
|
|
175
|
+
extracted[key] = values;
|
|
176
|
+
fieldsFound++;
|
|
177
|
+
continue;
|
|
178
|
+
}
|
|
179
|
+
} else {
|
|
180
|
+
const rawValue = els.first().text().trim();
|
|
181
|
+
if (rawValue) {
|
|
182
|
+
extracted[key] = this._coerceValue(rawValue, fieldSchema);
|
|
183
|
+
fieldsFound++;
|
|
184
|
+
continue;
|
|
185
|
+
}
|
|
169
186
|
}
|
|
170
187
|
}
|
|
171
188
|
}
|
|
172
189
|
|
|
190
|
+
// For array fields: detect ul/ol > li patterns before meta/common selectors
|
|
191
|
+
if (isArrayField) {
|
|
192
|
+
const listSelectors = [
|
|
193
|
+
`ul.${key} > li`, `ol.${key} > li`,
|
|
194
|
+
`#${key} > li`, `[data-${key}] > li`,
|
|
195
|
+
`ul[class*="${key}"] > li`, `ol[class*="${key}"] > li`
|
|
196
|
+
];
|
|
197
|
+
let listValues = null;
|
|
198
|
+
for (const lsel of listSelectors) {
|
|
199
|
+
const items = $(lsel);
|
|
200
|
+
if (items.length > 0) {
|
|
201
|
+
listValues = items.map((_, el) => $(el).text().trim()).get().filter(Boolean);
|
|
202
|
+
break;
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
if (listValues && listValues.length > 0) {
|
|
206
|
+
extracted[key] = listValues;
|
|
207
|
+
fieldsFound++;
|
|
208
|
+
continue;
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
|
|
173
212
|
// Try common patterns: meta tags, headings, semantic elements
|
|
174
213
|
const metaContent = $(`meta[name="${key}"], meta[property="${key}"], meta[property="og:${key}"]`).attr('content');
|
|
175
214
|
if (metaContent) {
|
|
@@ -189,11 +228,20 @@ export class ExtractStructuredTool {
|
|
|
189
228
|
for (const sel of commonSelectors) {
|
|
190
229
|
const el = $(sel);
|
|
191
230
|
if (el.length > 0) {
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
231
|
+
if (isArrayField && el.length > 1) {
|
|
232
|
+
const values = el.map((_, item) => $(item).text().trim()).get().filter(Boolean);
|
|
233
|
+
if (values.length > 0) {
|
|
234
|
+
extracted[key] = values;
|
|
235
|
+
fieldsFound++;
|
|
236
|
+
break;
|
|
237
|
+
}
|
|
238
|
+
} else {
|
|
239
|
+
const rawValue = el.first().text().trim();
|
|
240
|
+
if (rawValue) {
|
|
241
|
+
extracted[key] = this._coerceValue(rawValue, fieldSchema);
|
|
242
|
+
fieldsFound++;
|
|
243
|
+
break;
|
|
244
|
+
}
|
|
197
245
|
}
|
|
198
246
|
}
|
|
199
247
|
}
|
|
@@ -215,7 +263,8 @@ export class ExtractStructuredTool {
|
|
|
215
263
|
return {
|
|
216
264
|
data: extracted,
|
|
217
265
|
valid: errors.length === 0,
|
|
218
|
-
validationErrors: errors
|
|
266
|
+
validationErrors: errors,
|
|
267
|
+
extractionNotes: ['Used CSS selector fallback extraction']
|
|
219
268
|
};
|
|
220
269
|
}
|
|
221
270
|
|
|
@@ -254,7 +303,7 @@ export class ExtractStructuredTool {
|
|
|
254
303
|
base = result.valid ? 0.6 : 0.4;
|
|
255
304
|
}
|
|
256
305
|
|
|
257
|
-
// Penalize for validation errors
|
|
306
|
+
// Penalize only for actual validation errors (not extractionNotes)
|
|
258
307
|
const errorCount = (result.validationErrors || []).length;
|
|
259
308
|
const penalty = Math.min(0.3, errorCount * 0.1);
|
|
260
309
|
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
* Pass provider: "openai" | "anthropic" with the matching API key to use a cloud model.
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
+
import { z } from 'zod';
|
|
10
11
|
import { fetchAndParse } from './_fetchAndParse.js';
|
|
11
12
|
// D1.3: SamplingClient for MCP sampling fallback (lazy — only imported if needed)
|
|
12
13
|
let _SamplingClient = null;
|
|
@@ -68,20 +69,26 @@ function resolveProvider(provider) {
|
|
|
68
69
|
|
|
69
70
|
/**
|
|
70
71
|
* Build the user message text that goes to the LLM.
|
|
72
|
+
* C3: also returns truncation metadata so the caller can surface it.
|
|
73
|
+
* @returns {{ userMessage: string, truncated: boolean, original_length: number }}
|
|
71
74
|
*/
|
|
72
75
|
function buildUserMessage(userPrompt, text, schema) {
|
|
73
|
-
const
|
|
76
|
+
const original_length = text.length;
|
|
77
|
+
const truncated = original_length > MAX_INPUT_CHARS;
|
|
78
|
+
const body = truncated ? text.slice(0, MAX_INPUT_CHARS) + '\n[...truncated]' : text;
|
|
74
79
|
let msg = `Extraction instruction: ${userPrompt}\n\n`;
|
|
75
80
|
if (schema && Object.keys(schema).length > 0) {
|
|
76
81
|
msg += `Output schema hint:\n${JSON.stringify(schema, null, 2)}\n\n`;
|
|
77
82
|
}
|
|
78
|
-
msg += `Web page content:\n${
|
|
79
|
-
return msg;
|
|
83
|
+
msg += `Web page content:\n${body}\n\nReturn only valid JSON.`;
|
|
84
|
+
return { userMessage: msg, truncated, original_length };
|
|
80
85
|
}
|
|
81
86
|
|
|
82
87
|
/**
|
|
83
88
|
* Parse JSON from an LLM response string defensively.
|
|
84
89
|
* Strips markdown code fences if present.
|
|
90
|
+
* C3: if the stripped string is not a full JSON document, locate the first
|
|
91
|
+
* embedded JSON object or array and try to parse that substring.
|
|
85
92
|
* Returns parsed object or throws.
|
|
86
93
|
*/
|
|
87
94
|
function parseJson(raw) {
|
|
@@ -90,7 +97,137 @@ function parseJson(raw) {
|
|
|
90
97
|
.replace(/^```(?:json)?\s*/i, '')
|
|
91
98
|
.replace(/\s*```\s*$/, '')
|
|
92
99
|
.trim();
|
|
93
|
-
|
|
100
|
+
|
|
101
|
+
// Fast path: well-formed JSON
|
|
102
|
+
try {
|
|
103
|
+
return JSON.parse(stripped);
|
|
104
|
+
} catch (_) {
|
|
105
|
+
// Fall through to substring recovery
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// C3: locate the first *balanced* JSON object or array embedded in the
|
|
109
|
+
// string — tolerant of prose both before and after the JSON.
|
|
110
|
+
const balanced = extractBalancedJson(stripped);
|
|
111
|
+
if (balanced !== null) {
|
|
112
|
+
return JSON.parse(balanced);
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
// No parseable JSON was found: throw a new SyntaxError carrying a 200-character preview of the response
|
|
116
|
+
throw new SyntaxError(`No JSON found in LLM response: ${stripped.slice(0, 200)}`);
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
/**
 * Find the first JSON object or array embedded in a string.
 *
 * Every `{` or `[` is tried in order as a possible start. For each one the
 * matching close bracket is located while respecting string literals and
 * escapes, and the resulting span is returned only if `JSON.parse` accepts it.
 * This tolerates bracketed prose before the payload (e.g. `Note [draft]: {...}`)
 * and a stray unbalanced opener ahead of the real JSON.
 *
 * @param {string} str - Text that may contain JSON surrounded by prose.
 * @returns {string|null} the JSON substring, or null if none is found
 */
function extractBalancedJson(str) {
  if (typeof str !== 'string') return null;

  // Each attempt may scan to the end of the string, so cap the number of
  // openers tried to keep pathological input (thousands of stray brackets) cheap.
  const MAX_CANDIDATES = 64;
  let attempts = 0;

  for (let start = 0; start < str.length && attempts < MAX_CANDIDATES; start++) {
    const open = str[start];
    if (open !== '{' && open !== '[') continue;
    attempts++;

    const candidate = sliceBalancedSpan(str, start);
    if (candidate === null) continue;

    try {
      JSON.parse(candidate);
      return candidate;
    } catch {
      // Balanced but not valid JSON (e.g. "[note]" in prose): try the next opener.
    }
  }
  return null;
}

/**
 * Return the substring from `start` (which must index a `{` or `[`) through its
 * matching close bracket, or null if the bracket is never closed.
 * Only the bracket type found at `start` is depth-counted; brackets inside
 * string literals are ignored.
 */
function sliceBalancedSpan(str, start) {
  const open = str[start];
  const close = open === '{' ? '}' : ']';
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let i = start; i < str.length; i++) {
    const ch = str[i];

    if (inString) {
      // Escapes are only meaningful inside a string literal.
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }

    if (ch === '"') {
      inString = true;
    } else if (ch === open) {
      depth++;
    } else if (ch === close) {
      depth--;
      if (depth === 0) return str.slice(start, i + 1);
    }
  }
  return null;
}
|
|
152
|
+
|
|
153
|
+
// ── Schema handling (C3) ───────────────────────────────────────────────────────
|
|
154
|
+
|
|
155
|
+
/**
 * Normalize a caller-supplied schema hint into a top-level JSON Schema object
 * suitable for an Anthropic tool `input_schema`.
 *
 * Two input shapes are accepted:
 *  - a JSON Schema (`{ type: 'object', ... }` or anything with `properties`):
 *    returned as a copy with `type` forced to `'object'` and
 *    `additionalProperties` defaulting to `true` unless the caller set it;
 *  - a flat field→hint map (`{ name: "string", tags: "array" }`): wrapped as an
 *    object schema. Object-valued hints are used as the property schema as-is,
 *    hint strings that name a JSON Schema type become `{ type: <hint> }`, and
 *    anything else falls back to `{ type: 'string' }`.
 *
 * @param {Object|null|undefined} schema - Schema hint supplied by the caller.
 * @returns {Object} An object-typed JSON Schema.
 */
function buildInputSchema(schema) {
  if (schema && (schema.type === 'object' || schema.properties)) {
    return { additionalProperties: true, ...schema, type: 'object' };
  }

  // Hint strings that are themselves valid JSON Schema `type` values. Honouring
  // them keeps the tool constraint in step with how the same hint is validated
  // afterwards (an "array" hint must not force the model to emit a string).
  const jsonSchemaTypes = new Set([
    'string', 'number', 'integer', 'boolean', 'null', 'array', 'object'
  ]);

  const properties = {};
  for (const [key, val] of Object.entries(schema || {})) {
    if (val && typeof val === 'object') {
      properties[key] = val;
    } else if (typeof val === 'string' && jsonSchemaTypes.has(val)) {
      properties[key] = { type: val };
    } else {
      // Unrecognised hint (free text, example value, ...): keep it permissive.
      properties[key] = { type: 'string' };
    }
  }
  return { type: 'object', properties, additionalProperties: true };
}
|
|
175
|
+
|
|
176
|
+
/**
 * Build a zod validator from a JSON-Schema-like hint. Best-effort: unknown
 * shapes fall back to `z.any()` so validation never rejects on constructs the
 * converter does not understand.
 *
 * Two input shapes are handled:
 *  - a flat field→hint map (no `type`, `properties` or `items` key): each value
 *    is treated as a field hint, with string hints read as `{ type: <hint> }`;
 *    every field is optional and unknown keys pass through;
 *  - a JSON Schema node: dispatched on `schema.type`. Object nodes honour
 *    `required`, leave other properties optional, and pass unknown keys through.
 *
 * @param {*} schema - Schema hint or JSON Schema node.
 * @returns {import('zod').ZodTypeAny} A zod validator for the hint.
 */
function jsonSchemaToZod(schema) {
  if (!schema || typeof schema !== 'object') return z.any();

  // Flat hint map (no `type`/`properties`/`items`) → treat values as field hints.
  const isJsonSchema = schema.type || schema.properties || schema.items;
  if (!isJsonSchema) {
    const shape = {};
    for (const [key, val] of Object.entries(schema)) {
      const fieldSchema = typeof val === 'string' ? { type: val } : val;
      shape[key] = jsonSchemaToZod(fieldSchema).optional();
    }
    return z.object(shape).passthrough();
  }

  switch (schema.type) {
    case 'string':
      return z.string();
    case 'number':
      return z.number();
    case 'integer':
      // JSON Schema integers must be whole numbers; a bare z.number() would
      // let fractional values such as 1.5 pass validation.
      return z.number().int();
    case 'boolean':
      return z.boolean();
    case 'null':
      return z.null();
    case 'array':
      return z.array(schema.items ? jsonSchemaToZod(schema.items) : z.any());
    case 'object': {
      const shape = {};
      const required = Array.isArray(schema.required) ? schema.required : [];
      for (const [key, val] of Object.entries(schema.properties || {})) {
        const field = jsonSchemaToZod(val);
        shape[key] = required.includes(key) ? field : field.optional();
      }
      return z.object(shape).passthrough();
    }
    default:
      // Missing or unrecognised `type` (including type arrays): accept anything.
      return z.any();
  }
}
|
|
213
|
+
|
|
214
|
+
/**
|
|
215
|
+
* Validate parsed output against the schema hint.
|
|
216
|
+
* @returns {{ valid: boolean, errors: string[] }}
|
|
217
|
+
*/
|
|
218
|
+
function validateAgainstSchema(parsed, schema) {
|
|
219
|
+
try {
|
|
220
|
+
const validator = jsonSchemaToZod(schema);
|
|
221
|
+
const result = validator.safeParse(parsed);
|
|
222
|
+
if (result.success) return { valid: true, errors: [] };
|
|
223
|
+
return {
|
|
224
|
+
valid: false,
|
|
225
|
+
errors: result.error.issues.map((i) => `${i.path.join('.') || '(root)'}: ${i.message}`)
|
|
226
|
+
};
|
|
227
|
+
} catch {
|
|
228
|
+
// Converter failure should not block extraction — treat as unvalidated.
|
|
229
|
+
return { valid: true, errors: [] };
|
|
230
|
+
}
|
|
94
231
|
}
|
|
95
232
|
|
|
96
233
|
// ── OpenAI call ───────────────────────────────────────────────────────────────
|
|
@@ -133,8 +270,10 @@ async function callOpenAI({ apiKey, model, systemMessage, userMessage, maxTokens
|
|
|
133
270
|
|
|
134
271
|
// ── Anthropic call ────────────────────────────────────────────────────────────
|
|
135
272
|
|
|
136
|
-
async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens }) {
|
|
273
|
+
async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens, schema }) {
|
|
137
274
|
const url = `${anthropicBaseUrl()}/v1/messages`;
|
|
275
|
+
const useToolUse = schema && Object.keys(schema).length > 0;
|
|
276
|
+
|
|
138
277
|
const body = {
|
|
139
278
|
model,
|
|
140
279
|
system: systemMessage,
|
|
@@ -142,6 +281,18 @@ async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTok
|
|
|
142
281
|
max_tokens: maxTokens
|
|
143
282
|
};
|
|
144
283
|
|
|
284
|
+
// C3: when a schema is provided, force structured output via tool-use. The
|
|
285
|
+
// tool's input_schema constrains the model and the tool_use input block is
|
|
286
|
+
// returned as already-valid JSON (no fence-stripping/parsing guesswork).
|
|
287
|
+
if (useToolUse) {
|
|
288
|
+
body.tools = [{
|
|
289
|
+
name: 'extract_data',
|
|
290
|
+
description: 'Return the extracted data conforming to the provided schema.',
|
|
291
|
+
input_schema: buildInputSchema(schema)
|
|
292
|
+
}];
|
|
293
|
+
body.tool_choice = { type: 'tool', name: 'extract_data' };
|
|
294
|
+
}
|
|
295
|
+
|
|
145
296
|
const response = await fetch(url, {
|
|
146
297
|
method: 'POST',
|
|
147
298
|
headers: {
|
|
@@ -159,11 +310,21 @@ async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTok
|
|
|
159
310
|
}
|
|
160
311
|
|
|
161
312
|
const json = await response.json();
|
|
162
|
-
const content = json.content?.[0]?.text ?? '';
|
|
163
313
|
const usage = {
|
|
164
314
|
input_tokens: json.usage?.input_tokens ?? 0,
|
|
165
315
|
output_tokens: json.usage?.output_tokens ?? 0
|
|
166
316
|
};
|
|
317
|
+
|
|
318
|
+
if (useToolUse) {
|
|
319
|
+
// Read the structured input from the tool_use block.
|
|
320
|
+
const toolBlock = (json.content || []).find((b) => b.type === 'tool_use');
|
|
321
|
+
if (toolBlock && toolBlock.input !== undefined) {
|
|
322
|
+
return { rawText: JSON.stringify(toolBlock.input), usage, model: json.model || model };
|
|
323
|
+
}
|
|
324
|
+
// Fall through to text if the model declined to call the tool.
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
const content = (json.content || []).find((b) => b.type === 'text')?.text ?? '';
|
|
167
328
|
return { rawText: content, usage, model: json.model || model };
|
|
168
329
|
}
|
|
169
330
|
|
|
@@ -229,7 +390,7 @@ async function callLLM({ provider, apiKey, model, systemMessage, userMessage, ma
|
|
|
229
390
|
if (provider === 'ollama') {
|
|
230
391
|
return callOllama({ model, systemMessage, userMessage, maxTokens, schema });
|
|
231
392
|
}
|
|
232
|
-
return callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens });
|
|
393
|
+
return callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens, schema });
|
|
233
394
|
}
|
|
234
395
|
|
|
235
396
|
// ── Tool class ────────────────────────────────────────────────────────────────
|
|
@@ -304,7 +465,7 @@ export class ExtractWithLlm {
|
|
|
304
465
|
const systemMessage =
|
|
305
466
|
'You extract structured data from web content per the user\'s instructions. Return JSON only.';
|
|
306
467
|
|
|
307
|
-
const userMessage = buildUserMessage(prompt, text, schema);
|
|
468
|
+
const { userMessage, truncated: inputTruncated, original_length } = buildUserMessage(prompt, text, schema);
|
|
308
469
|
|
|
309
470
|
// Step 2: First LLM call — with sampling fallback for 'auto' provider
|
|
310
471
|
// Fallback chain: Ollama → API key (handled by resolveProvider) → sampling → error
|
|
@@ -314,10 +475,17 @@ export class ExtractWithLlm {
|
|
|
314
475
|
provider, apiKey, model, systemMessage, userMessage, maxTokens, schema
|
|
315
476
|
}));
|
|
316
477
|
} catch (llmErr) {
|
|
317
|
-
// D1.3: If provider is 'auto'/'ollama' and it failed, try sampling as final fallback
|
|
478
|
+
// D1.3: If provider is 'auto'/'ollama' and it failed, try MCP sampling as final fallback
|
|
318
479
|
if (providerParam === 'auto' || providerParam === 'ollama') {
|
|
319
480
|
try {
|
|
320
|
-
|
|
481
|
+
const SamplingClient = await getSamplingClient();
|
|
482
|
+
const samplingClient = new SamplingClient();
|
|
483
|
+
const { text: sampledText } = await samplingClient.complete(
|
|
484
|
+
`${systemMessage}\n\n${userMessage}`,
|
|
485
|
+
{ maxTokens }
|
|
486
|
+
);
|
|
487
|
+
rawText = sampledText;
|
|
488
|
+
usage = { input_tokens: 0, output_tokens: 0 };
|
|
321
489
|
resolvedModel = 'sampling';
|
|
322
490
|
} catch (samplingErr) {
|
|
323
491
|
return { success: false, error: `LLM call failed: ${llmErr.message}. Sampling fallback also failed: ${samplingErr.message}` };
|
|
@@ -362,13 +530,26 @@ export class ExtractWithLlm {
|
|
|
362
530
|
}
|
|
363
531
|
}
|
|
364
532
|
|
|
365
|
-
|
|
533
|
+
// C3: surface truncation metadata so callers know the input was clipped
|
|
534
|
+
const result = {
|
|
366
535
|
success: true,
|
|
367
536
|
data: parsed,
|
|
368
537
|
provider: resolvedModel === 'sampling' ? 'sampling' : provider,
|
|
369
538
|
model: resolvedModel || model,
|
|
370
539
|
usage
|
|
371
540
|
};
|
|
541
|
+
if (inputTruncated) {
|
|
542
|
+
result.truncated = true;
|
|
543
|
+
result.original_length = original_length;
|
|
544
|
+
}
|
|
545
|
+
// C3: validate output against the schema hint (zod). Non-fatal — the data
|
|
546
|
+
// is still returned; callers can inspect `valid`/`validationErrors`.
|
|
547
|
+
if (schema && Object.keys(schema).length > 0) {
|
|
548
|
+
const { valid, errors } = validateAgainstSchema(parsed, schema);
|
|
549
|
+
result.valid = valid;
|
|
550
|
+
if (!valid) result.validationErrors = errors;
|
|
551
|
+
}
|
|
552
|
+
return result;
|
|
372
553
|
}
|
|
373
554
|
}
|
|
374
555
|
|
|
@@ -41,14 +41,25 @@ export class ListOllamaModelsTool {
|
|
|
41
41
|
return { success: false, baseUrl, error: `Invalid JSON from Ollama: ${err.message}` };
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
44
|
+
// C3: harden against non-array response; normalize modified_at to ISO 8601.
|
|
45
|
+
const rawModels = Array.isArray(data.models) ? data.models :
|
|
46
|
+
Array.isArray(data) ? data : [];
|
|
47
|
+
|
|
48
|
+
const models = rawModels.map((m) => {
|
|
49
|
+
let modified_at = m.modified_at ?? null;
|
|
50
|
+
if (modified_at !== null) {
|
|
51
|
+
const d = new Date(modified_at);
|
|
52
|
+
modified_at = isNaN(d.getTime()) ? modified_at : d.toISOString();
|
|
53
|
+
}
|
|
54
|
+
return {
|
|
55
|
+
name: m.name,
|
|
56
|
+
size_bytes: m.size,
|
|
57
|
+
modified_at,
|
|
58
|
+
family: m.details?.family,
|
|
59
|
+
parameter_size: m.details?.parameter_size,
|
|
60
|
+
quantization: m.details?.quantization_level
|
|
61
|
+
};
|
|
62
|
+
});
|
|
52
63
|
|
|
53
64
|
return {
|
|
54
65
|
success: true,
|
|
@@ -19,7 +19,12 @@ const ProcessDocumentSchema = z.object({
|
|
|
19
19
|
extractMetadata: z.boolean().default(true),
|
|
20
20
|
password: z.string().optional(),
|
|
21
21
|
maxPages: z.number().min(1).max(500).default(100),
|
|
22
|
-
|
|
22
|
+
// C3: extract a specific 1-based, inclusive page range from a PDF
|
|
23
|
+
pageRange: z.object({
|
|
24
|
+
start: z.number().min(1).default(1),
|
|
25
|
+
end: z.number().min(1).optional()
|
|
26
|
+
}).optional(),
|
|
27
|
+
|
|
23
28
|
// Web content options
|
|
24
29
|
useReadability: z.boolean().default(true),
|
|
25
30
|
extractStructuredData: z.boolean().default(true),
|
|
@@ -195,7 +200,8 @@ export class ProcessDocumentTool {
|
|
|
195
200
|
extractText: options.extractText,
|
|
196
201
|
extractMetadata: options.extractMetadata,
|
|
197
202
|
password: options.password,
|
|
198
|
-
maxPages: options.maxPages
|
|
203
|
+
maxPages: options.maxPages,
|
|
204
|
+
...(options.pageRange ? { pageRange: options.pageRange } : {})
|
|
199
205
|
}
|
|
200
206
|
});
|
|
201
207
|
|
|
@@ -273,7 +279,7 @@ export class ProcessDocumentTool {
|
|
|
273
279
|
headers: {
|
|
274
280
|
'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/3.0; Document-Processor)'
|
|
275
281
|
},
|
|
276
|
-
|
|
282
|
+
signal: AbortSignal.timeout(15000)
|
|
277
283
|
});
|
|
278
284
|
|
|
279
285
|
if (!response.ok) {
|
|
@@ -506,4 +512,4 @@ export class ProcessDocumentTool {
|
|
|
506
512
|
}
|
|
507
513
|
}
|
|
508
514
|
|
|
509
|
-
export default ProcessDocumentTool;
|
|
515
|
+
export default ProcessDocumentTool;
|
|
@@ -68,6 +68,8 @@ const SummarizeContentResult = z.object({
|
|
|
68
68
|
summarizedAt: z.string(),
|
|
69
69
|
processingTime: z.number(),
|
|
70
70
|
success: z.boolean(),
|
|
71
|
+
degraded: z.boolean().optional(),
|
|
72
|
+
degradedReason: z.string().optional(),
|
|
71
73
|
error: z.string().optional()
|
|
72
74
|
});
|
|
73
75
|
|
|
@@ -131,11 +133,17 @@ export class SummarizeContentTool {
|
|
|
131
133
|
// Step 2: Set summary result
|
|
132
134
|
result.summary = analysisResult.summary;
|
|
133
135
|
|
|
134
|
-
// D1.3: If abstractive mode requested, attempt sampling-based enhancement
|
|
136
|
+
// D1.3: If abstractive mode requested, attempt sampling-based enhancement.
|
|
137
|
+
// If it can't run (no LLM/sampling available), fall back to the extractive
|
|
138
|
+
// result but flag it explicitly rather than silently masking.
|
|
135
139
|
if (options.summaryType === 'abstractive') {
|
|
136
140
|
const abstractive = await this._abstractiveSummaryViaSampling(text, analysisResult.summary, options.summaryLength);
|
|
137
141
|
if (abstractive) {
|
|
138
142
|
result.summary = abstractive;
|
|
143
|
+
} else {
|
|
144
|
+
result.summary = { ...result.summary, type: 'extractive' };
|
|
145
|
+
result.degraded = true;
|
|
146
|
+
result.degradedReason = 'Abstractive summarization unavailable (no LLM/sampling backend); returned extractive summary instead.';
|
|
139
147
|
}
|
|
140
148
|
}
|
|
141
149
|
|
|
@@ -191,6 +199,55 @@ export class SummarizeContentTool {
|
|
|
191
199
|
}
|
|
192
200
|
}
|
|
193
201
|
|
|
202
|
+
/**
|
|
203
|
+
* Generate an abstractive summary via the MCP SamplingClient fallback chain
|
|
204
|
+
* (Ollama → OpenAI → Anthropic → MCP sampling). Returns a summary object in the
|
|
205
|
+
* same shape as the extractive result, or null if no backend is available.
|
|
206
|
+
* @param {string} text - Full original text
|
|
207
|
+
* @param {Object} extractiveSummary - The extractive summary (for shape/fallback)
|
|
208
|
+
* @param {string} summaryLength - 'short' | 'medium' | 'long'
|
|
209
|
+
* @returns {Promise<Object|null>}
|
|
210
|
+
*/
|
|
211
|
+
async _abstractiveSummaryViaSampling(text, extractiveSummary, summaryLength) {
|
|
212
|
+
try {
|
|
213
|
+
const SamplingClient = await getSamplingClient();
|
|
214
|
+
const client = new SamplingClient();
|
|
215
|
+
|
|
216
|
+
const lengthGuide = {
|
|
217
|
+
short: '1-2 sentences',
|
|
218
|
+
medium: '3-5 sentences',
|
|
219
|
+
long: '6-10 sentences'
|
|
220
|
+
}[summaryLength] || '3-5 sentences';
|
|
221
|
+
|
|
222
|
+
const prompt =
|
|
223
|
+
`Write a concise, fluent abstractive summary (${lengthGuide}) of the following text. ` +
|
|
224
|
+
`Capture the main ideas in your own words. Respond with only the summary text.\n\n` +
|
|
225
|
+
`${text.slice(0, 12000)}`;
|
|
226
|
+
|
|
227
|
+
const { text: summaryText } = await client.complete(prompt, { maxTokens: 600 });
|
|
228
|
+
if (!summaryText || !summaryText.trim()) {
|
|
229
|
+
return null;
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
const cleaned = summaryText.trim();
|
|
233
|
+
const sentences = splitSentences(cleaned);
|
|
234
|
+
const compressionRatio = text.length > 0
|
|
235
|
+
? Math.round((cleaned.length / text.length) * 1000) / 1000
|
|
236
|
+
: 0;
|
|
237
|
+
|
|
238
|
+
return {
|
|
239
|
+
text: cleaned,
|
|
240
|
+
sentences,
|
|
241
|
+
type: 'abstractive',
|
|
242
|
+
length: summaryLength,
|
|
243
|
+
compressionRatio
|
|
244
|
+
};
|
|
245
|
+
} catch {
|
|
246
|
+
// No sampling/LLM backend available — caller falls back to extractive.
|
|
247
|
+
return null;
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
|
|
194
251
|
/**
|
|
195
252
|
* Extract key points from original text and summary
|
|
196
253
|
* @param {string} originalText - Original text
|