crawlforge-mcp-server 4.2.11 → 4.5.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/package.json +2 -1
  2. package/server.js +152 -21
  3. package/src/constants/config.js +5 -0
  4. package/src/core/ActionExecutor.js +13 -1
  5. package/src/core/ChangeTracker.js +8 -5
  6. package/src/core/LLMsTxtAnalyzer.js +71 -47
  7. package/src/core/LocalizationManager.js +7 -4
  8. package/src/core/ResearchOrchestrator.js +10 -6
  9. package/src/core/StealthBrowserManager.js +111 -40
  10. package/src/core/analysis/ContentAnalyzer.js +2 -2
  11. package/src/core/crawlers/BFSCrawler.js +23 -12
  12. package/src/core/processing/ContentProcessor.js +19 -3
  13. package/src/core/processing/PDFProcessor.js +72 -23
  14. package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
  15. package/src/tools/advanced/batchScrape/index.js +3 -1
  16. package/src/tools/advanced/batchScrape/reporter.js +5 -1
  17. package/src/tools/advanced/batchScrape/worker.js +6 -1
  18. package/src/tools/basic/_fetch.js +78 -5
  19. package/src/tools/basic/extractLinks.js +1 -1
  20. package/src/tools/basic/extractMetadata.js +65 -1
  21. package/src/tools/basic/extractText.js +61 -5
  22. package/src/tools/basic/scrapeStructured.js +48 -10
  23. package/src/tools/crawl/crawlDeep.js +13 -5
  24. package/src/tools/crawl/mapSite.js +24 -51
  25. package/src/tools/extract/analyzeContent.js +11 -6
  26. package/src/tools/extract/extractContent.js +23 -5
  27. package/src/tools/extract/extractStructured.js +65 -16
  28. package/src/tools/extract/extractWithLlm.js +192 -11
  29. package/src/tools/extract/listOllamaModels.js +19 -8
  30. package/src/tools/extract/processDocument.js +10 -4
  31. package/src/tools/extract/summarizeContent.js +58 -1
  32. package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
  33. package/src/tools/research/deepResearch.js +43 -4
  34. package/src/tools/search/providers/searxng.js +2 -2
  35. package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
  36. package/src/tools/search/ranking/ResultRanker.js +13 -4
  37. package/src/tools/search/searchWeb.js +5 -5
  38. package/src/tools/templates/TemplateRegistry.js +3 -2
  39. package/src/tools/tracking/trackChanges/differ.js +33 -1
  40. package/src/utils/htmlToMarkdown.js +5 -1
@@ -124,7 +124,7 @@ export class ExtractContentTool {
124
124
 
125
125
  try {
126
126
  const validated = ExtractContentSchema.parse(params);
127
- const { url, options } = validated;
127
+ const { url, html: providedHtml, options } = validated;
128
128
 
129
129
  const result = {
130
130
  url,
@@ -133,10 +133,16 @@ export class ExtractContentTool {
133
133
  processingTime: 0
134
134
  };
135
135
 
136
- // Step 1: Fetch content (with or without JavaScript rendering)
136
+ // Step 1: Fetch content (with or without JavaScript rendering).
137
+ // If pre-rendered HTML is supplied (e.g. post-action page from
138
+ // scrape_with_actions), use it directly and skip the network fetch.
137
139
  let html, pageTitle;
140
+ if (providedHtml) {
141
+ html = providedHtml;
142
+ pageTitle = this.extractTitleFromHTML(html);
143
+ } else {
138
144
  const shouldUseJavaScript = options.requiresJavaScript || await this.shouldUseJavaScript(url);
139
-
145
+
140
146
  if (shouldUseJavaScript) {
141
147
  console.error('Using browser rendering for JavaScript content...');
142
148
  const browserResult = await this.browserProcessor.processURL({
@@ -162,7 +168,7 @@ export class ExtractContentTool {
162
168
  headers: {
163
169
  'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/3.0; Enhanced-Content-Extractor)'
164
170
  },
165
- timeout: 15000
171
+ signal: AbortSignal.timeout(15000)
166
172
  });
167
173
 
168
174
  if (!response.ok) {
@@ -172,6 +178,7 @@ export class ExtractContentTool {
172
178
  html = await response.text();
173
179
  pageTitle = this.extractTitleFromHTML(html);
174
180
  }
181
+ }
175
182
 
176
183
  result.title = pageTitle;
177
184
 
@@ -194,6 +201,9 @@ export class ExtractContentTool {
194
201
  result.content = {
195
202
  text: processingResult.readability.textContent || processingResult.readability.content,
196
203
  };
204
+ result.extractionMethod = 'readability';
205
+ result.confidence = 0.9;
206
+ result.finalUrl = url;
197
207
 
198
208
  // Convert to markdown if requested
199
209
  if (options.outputFormat === 'markdown') {
@@ -203,6 +213,10 @@ export class ExtractContentTool {
203
213
  result.content = {
204
214
  text: processingResult.fallback_content.content
205
215
  };
216
+ result.extractionMethod = 'fallback_boilerplate_removal';
217
+ result.fallback_reason = 'Readability did not detect an article; used boilerplate-removal fallback';
218
+ result.confidence = 0.5;
219
+ result.finalUrl = url;
206
220
  } else {
207
221
  // Last resort: extract text from HTML
208
222
  result.content = {
@@ -213,6 +227,10 @@ export class ExtractContentTool {
213
227
  includeImageAlt: true
214
228
  })
215
229
  };
230
+ result.extractionMethod = 'raw_body_text';
231
+ result.fallback_reason = 'Neither Readability nor boilerplate-removal yielded content; extracted raw body text';
232
+ result.confidence = 0.2;
233
+ result.finalUrl = url;
216
234
  }
217
235
 
218
236
  // Include HTML if requested
@@ -314,4 +332,4 @@ export class ExtractContentTool {
314
332
  }
315
333
  }
316
334
 
317
- export default ExtractContentTool;
335
+ export default ExtractContentTool;
@@ -8,6 +8,11 @@ import { z } from 'zod';
8
8
  import { ElicitationHelper } from '../../core/ElicitationHelper.js'; // D1.4
9
9
  import { load } from 'cheerio';
10
10
  import { LLMManager } from '../../core/llm/LLMManager.js';
11
+ import { createRequire } from 'module';
12
+
13
+ const _require = createRequire(import.meta.url);
14
+ const _pkg = _require('../../../package.json');
15
+ const CRAWLFORGE_UA = `CrawlForge/${_pkg.version} (+https://crawlforge.dev)`;
11
16
  import { fetchAndParse } from './_fetchAndParse.js';
12
17
 
13
18
  const ExtractStructuredSchema = z.object({
@@ -30,7 +35,7 @@ export class ExtractStructuredTool {
30
35
  constructor(options = {}) {
31
36
  this.llmManager = null;
32
37
  this.llmConfig = options.llmConfig || {};
33
- this.userAgent = 'Mozilla/5.0 (compatible; CrawlForge-MCP/3.0; ExtractStructured)';
38
+ this.userAgent = CRAWLFORGE_UA;
34
39
  // D1.4: Elicitation helper
35
40
  this._elicitation = new ElicitationHelper({});
36
41
  }
@@ -129,7 +134,8 @@ export class ExtractStructuredTool {
129
134
  validation: {
130
135
  valid: extractionResult.valid || false,
131
136
  errors: extractionResult.validationErrors || []
132
- }
137
+ },
138
+ extractionNotes: extractionResult.extractionNotes || []
133
139
  };
134
140
 
135
141
  } catch (error) {
@@ -156,20 +162,53 @@ export class ExtractStructuredTool {
156
162
  let fieldsFound = 0;
157
163
 
158
164
  for (const [key, fieldSchema] of Object.entries(properties)) {
165
+ const isArrayField = fieldSchema.type === 'array';
166
+
159
167
  // Use explicit selector hint if provided
160
168
  const selector = selectorHints[key];
161
169
  if (selector) {
162
- const el = $(selector);
163
- if (el.length > 0) {
164
- const rawValue = el.first().text().trim();
165
- if (rawValue) {
166
- extracted[key] = this._coerceValue(rawValue, fieldSchema);
167
- fieldsFound++;
168
- continue;
170
+ const els = $(selector);
171
+ if (els.length > 0) {
172
+ if (isArrayField || els.length > 1) {
173
+ const values = els.map((_, el) => $(el).text().trim()).get().filter(Boolean);
174
+ if (values.length > 0) {
175
+ extracted[key] = values;
176
+ fieldsFound++;
177
+ continue;
178
+ }
179
+ } else {
180
+ const rawValue = els.first().text().trim();
181
+ if (rawValue) {
182
+ extracted[key] = this._coerceValue(rawValue, fieldSchema);
183
+ fieldsFound++;
184
+ continue;
185
+ }
169
186
  }
170
187
  }
171
188
  }
172
189
 
190
+ // For array fields: detect ul/ol > li patterns before meta/common selectors
191
+ if (isArrayField) {
192
+ const listSelectors = [
193
+ `ul.${key} > li`, `ol.${key} > li`,
194
+ `#${key} > li`, `[data-${key}] > li`,
195
+ `ul[class*="${key}"] > li`, `ol[class*="${key}"] > li`
196
+ ];
197
+ let listValues = null;
198
+ for (const lsel of listSelectors) {
199
+ const items = $(lsel);
200
+ if (items.length > 0) {
201
+ listValues = items.map((_, el) => $(el).text().trim()).get().filter(Boolean);
202
+ break;
203
+ }
204
+ }
205
+ if (listValues && listValues.length > 0) {
206
+ extracted[key] = listValues;
207
+ fieldsFound++;
208
+ continue;
209
+ }
210
+ }
211
+
173
212
  // Try common patterns: meta tags, headings, semantic elements
174
213
  const metaContent = $(`meta[name="${key}"], meta[property="${key}"], meta[property="og:${key}"]`).attr('content');
175
214
  if (metaContent) {
@@ -189,11 +228,20 @@ export class ExtractStructuredTool {
189
228
  for (const sel of commonSelectors) {
190
229
  const el = $(sel);
191
230
  if (el.length > 0) {
192
- const rawValue = el.first().text().trim();
193
- if (rawValue) {
194
- extracted[key] = this._coerceValue(rawValue, fieldSchema);
195
- fieldsFound++;
196
- break;
231
+ if (isArrayField && el.length > 1) {
232
+ const values = el.map((_, item) => $(item).text().trim()).get().filter(Boolean);
233
+ if (values.length > 0) {
234
+ extracted[key] = values;
235
+ fieldsFound++;
236
+ break;
237
+ }
238
+ } else {
239
+ const rawValue = el.first().text().trim();
240
+ if (rawValue) {
241
+ extracted[key] = this._coerceValue(rawValue, fieldSchema);
242
+ fieldsFound++;
243
+ break;
244
+ }
197
245
  }
198
246
  }
199
247
  }
@@ -215,7 +263,8 @@ export class ExtractStructuredTool {
215
263
  return {
216
264
  data: extracted,
217
265
  valid: errors.length === 0,
218
- validationErrors: errors.length > 0 ? errors : ['Used CSS selector fallback extraction']
266
+ validationErrors: errors,
267
+ extractionNotes: ['Used CSS selector fallback extraction']
219
268
  };
220
269
  }
221
270
 
@@ -254,7 +303,7 @@ export class ExtractStructuredTool {
254
303
  base = result.valid ? 0.6 : 0.4;
255
304
  }
256
305
 
257
- // Penalize for validation errors
306
+ // Penalize only for actual validation errors (not extractionNotes)
258
307
  const errorCount = (result.validationErrors || []).length;
259
308
  const penalty = Math.min(0.3, errorCount * 0.1);
260
309
 
@@ -7,6 +7,7 @@
7
7
  * Pass provider: "openai" | "anthropic" with the matching API key to use a cloud model.
8
8
  */
9
9
 
10
+ import { z } from 'zod';
10
11
  import { fetchAndParse } from './_fetchAndParse.js';
11
12
  // D1.3: SamplingClient for MCP sampling fallback (lazy — only imported if needed)
12
13
  let _SamplingClient = null;
@@ -68,20 +69,26 @@ function resolveProvider(provider) {
68
69
 
69
70
  /**
70
71
  * Build the user message text that goes to the LLM.
72
+ * C3: also returns truncation metadata so the caller can surface it.
73
+ * @returns {{ userMessage: string, truncated: boolean, original_length: number }}
71
74
  */
72
75
  function buildUserMessage(userPrompt, text, schema) {
73
- const truncated = text.length > MAX_INPUT_CHARS ? text.slice(0, MAX_INPUT_CHARS) + '\n[...truncated]' : text;
76
+ const original_length = text.length;
77
+ const truncated = original_length > MAX_INPUT_CHARS;
78
+ const body = truncated ? text.slice(0, MAX_INPUT_CHARS) + '\n[...truncated]' : text;
74
79
  let msg = `Extraction instruction: ${userPrompt}\n\n`;
75
80
  if (schema && Object.keys(schema).length > 0) {
76
81
  msg += `Output schema hint:\n${JSON.stringify(schema, null, 2)}\n\n`;
77
82
  }
78
- msg += `Web page content:\n${truncated}\n\nReturn only valid JSON.`;
79
- return msg;
83
+ msg += `Web page content:\n${body}\n\nReturn only valid JSON.`;
84
+ return { userMessage: msg, truncated, original_length };
80
85
  }
81
86
 
82
87
  /**
83
88
  * Parse JSON from an LLM response string defensively.
84
89
  * Strips markdown code fences if present.
90
+ * C3: if the stripped string is not a full JSON document, locate the first
91
+ * embedded JSON object or array and try to parse that substring.
85
92
  * Returns parsed object or throws.
86
93
  */
87
94
  function parseJson(raw) {
@@ -90,7 +97,137 @@ function parseJson(raw) {
90
97
  .replace(/^```(?:json)?\s*/i, '')
91
98
  .replace(/\s*```\s*$/, '')
92
99
  .trim();
93
- return JSON.parse(stripped);
100
+
101
+ // Fast path: well-formed JSON
102
+ try {
103
+ return JSON.parse(stripped);
104
+ } catch (_) {
105
+ // Fall through to substring recovery
106
+ }
107
+
108
+ // C3: locate the first *balanced* JSON object or array embedded in the
109
+ // string — tolerant of prose both before and after the JSON.
110
+ const balanced = extractBalancedJson(stripped);
111
+ if (balanced !== null) {
112
+ return JSON.parse(balanced);
113
+ }
114
+
115
+ // Re-throw the original parse error with the full content
116
+ throw new SyntaxError(`No JSON found in LLM response: ${stripped.slice(0, 200)}`);
117
+ }
118
+
119
+ /**
120
+ * Scan a string for the first balanced JSON object or array, respecting string
121
+ * literals and escapes so braces inside strings don't unbalance the scan.
122
+ * @returns {string|null} the JSON substring, or null if none is found
123
+ */
124
+ function extractBalancedJson(str) {
125
+ const objStart = str.indexOf('{');
126
+ const arrStart = str.indexOf('[');
127
+ const start = objStart === -1 ? arrStart :
128
+ arrStart === -1 ? objStart :
129
+ Math.min(objStart, arrStart);
130
+ if (start === -1) return null;
131
+
132
+ const open = str[start];
133
+ const close = open === '{' ? '}' : ']';
134
+ let depth = 0;
135
+ let inString = false;
136
+ let escaped = false;
137
+
138
+ for (let i = start; i < str.length; i++) {
139
+ const ch = str[i];
140
+ if (escaped) { escaped = false; continue; }
141
+ if (ch === '\\') { escaped = true; continue; }
142
+ if (ch === '"') { inString = !inString; continue; }
143
+ if (inString) continue;
144
+ if (ch === open) depth++;
145
+ else if (ch === close) {
146
+ depth--;
147
+ if (depth === 0) return str.slice(start, i + 1);
148
+ }
149
+ }
150
+ return null;
151
+ }
152
+
153
+ // ── Schema handling (C3) ───────────────────────────────────────────────────────
154
+
155
+ /**
156
+ * Normalize a caller-supplied schema hint into a valid top-level JSON Schema
157
+ * object suitable for Anthropic tool `input_schema`.
158
+ *
159
+ * Accepts either a full JSON Schema (`{ type, properties, ... }`) or a flat
160
+ * field→type-hint map (`{ name: "string", tags: "array" }`), which is wrapped
161
+ * as an object schema.
162
+ */
163
+ function buildInputSchema(schema) {
164
+ if (schema && (schema.type === 'object' || schema.properties)) {
165
+ return { additionalProperties: true, ...schema, type: 'object' };
166
+ }
167
+ // Flat hint map → object schema with string-typed properties for any
168
+ // non-object hint values (Anthropic requires a valid JSON Schema).
169
+ const properties = {};
170
+ for (const [key, val] of Object.entries(schema || {})) {
171
+ properties[key] = (val && typeof val === 'object') ? val : { type: 'string' };
172
+ }
173
+ return { type: 'object', properties, additionalProperties: true };
174
+ }
175
+
176
+ /**
177
+ * Build a zod validator from a JSON-Schema-like hint. Best-effort: unknown
178
+ * shapes fall back to `z.any()` so validation never rejects on constructs the
179
+ * converter does not understand.
180
+ */
181
+ function jsonSchemaToZod(schema) {
182
+ if (!schema || typeof schema !== 'object') return z.any();
183
+
184
+ // Flat hint map (no `type`/`properties`) → treat values as field hints.
185
+ const isJsonSchema = schema.type || schema.properties || schema.items;
186
+ if (!isJsonSchema) {
187
+ const shape = {};
188
+ for (const [key, val] of Object.entries(schema)) {
189
+ shape[key] = jsonSchemaToZod(typeof val === 'string' ? { type: val } : val).optional();
190
+ }
191
+ return z.object(shape).passthrough();
192
+ }
193
+
194
+ switch (schema.type) {
195
+ case 'string': return z.string();
196
+ case 'number':
197
+ case 'integer': return z.number();
198
+ case 'boolean': return z.boolean();
199
+ case 'null': return z.null();
200
+ case 'array': return z.array(schema.items ? jsonSchemaToZod(schema.items) : z.any());
201
+ case 'object': {
202
+ const shape = {};
203
+ const required = Array.isArray(schema.required) ? schema.required : [];
204
+ for (const [key, val] of Object.entries(schema.properties || {})) {
205
+ const field = jsonSchemaToZod(val);
206
+ shape[key] = required.includes(key) ? field : field.optional();
207
+ }
208
+ return z.object(shape).passthrough();
209
+ }
210
+ default: return z.any();
211
+ }
212
+ }
213
+
214
+ /**
215
+ * Validate parsed output against the schema hint.
216
+ * @returns {{ valid: boolean, errors: string[] }}
217
+ */
218
+ function validateAgainstSchema(parsed, schema) {
219
+ try {
220
+ const validator = jsonSchemaToZod(schema);
221
+ const result = validator.safeParse(parsed);
222
+ if (result.success) return { valid: true, errors: [] };
223
+ return {
224
+ valid: false,
225
+ errors: result.error.issues.map((i) => `${i.path.join('.') || '(root)'}: ${i.message}`)
226
+ };
227
+ } catch {
228
+ // Converter failure should not block extraction — treat as unvalidated.
229
+ return { valid: true, errors: [] };
230
+ }
94
231
  }
95
232
 
96
233
  // ── OpenAI call ───────────────────────────────────────────────────────────────
@@ -133,8 +270,10 @@ async function callOpenAI({ apiKey, model, systemMessage, userMessage, maxTokens
133
270
 
134
271
  // ── Anthropic call ────────────────────────────────────────────────────────────
135
272
 
136
- async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens }) {
273
+ async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens, schema }) {
137
274
  const url = `${anthropicBaseUrl()}/v1/messages`;
275
+ const useToolUse = schema && Object.keys(schema).length > 0;
276
+
138
277
  const body = {
139
278
  model,
140
279
  system: systemMessage,
@@ -142,6 +281,18 @@ async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTok
142
281
  max_tokens: maxTokens
143
282
  };
144
283
 
284
+ // C3: when a schema is provided, force structured output via tool-use. The
285
+ // tool's input_schema constrains the model and the tool_use input block is
286
+ // returned as already-valid JSON (no fence-stripping/parsing guesswork).
287
+ if (useToolUse) {
288
+ body.tools = [{
289
+ name: 'extract_data',
290
+ description: 'Return the extracted data conforming to the provided schema.',
291
+ input_schema: buildInputSchema(schema)
292
+ }];
293
+ body.tool_choice = { type: 'tool', name: 'extract_data' };
294
+ }
295
+
145
296
  const response = await fetch(url, {
146
297
  method: 'POST',
147
298
  headers: {
@@ -159,11 +310,21 @@ async function callAnthropic({ apiKey, model, systemMessage, userMessage, maxTok
159
310
  }
160
311
 
161
312
  const json = await response.json();
162
- const content = json.content?.[0]?.text ?? '';
163
313
  const usage = {
164
314
  input_tokens: json.usage?.input_tokens ?? 0,
165
315
  output_tokens: json.usage?.output_tokens ?? 0
166
316
  };
317
+
318
+ if (useToolUse) {
319
+ // Read the structured input from the tool_use block.
320
+ const toolBlock = (json.content || []).find((b) => b.type === 'tool_use');
321
+ if (toolBlock && toolBlock.input !== undefined) {
322
+ return { rawText: JSON.stringify(toolBlock.input), usage, model: json.model || model };
323
+ }
324
+ // Fall through to text if the model declined to call the tool.
325
+ }
326
+
327
+ const content = (json.content || []).find((b) => b.type === 'text')?.text ?? '';
167
328
  return { rawText: content, usage, model: json.model || model };
168
329
  }
169
330
 
@@ -229,7 +390,7 @@ async function callLLM({ provider, apiKey, model, systemMessage, userMessage, ma
229
390
  if (provider === 'ollama') {
230
391
  return callOllama({ model, systemMessage, userMessage, maxTokens, schema });
231
392
  }
232
- return callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens });
393
+ return callAnthropic({ apiKey, model, systemMessage, userMessage, maxTokens, schema });
233
394
  }
234
395
 
235
396
  // ── Tool class ────────────────────────────────────────────────────────────────
@@ -304,7 +465,7 @@ export class ExtractWithLlm {
304
465
  const systemMessage =
305
466
  'You extract structured data from web content per the user\'s instructions. Return JSON only.';
306
467
 
307
- const userMessage = buildUserMessage(prompt, text, schema);
468
+ const { userMessage, truncated: inputTruncated, original_length } = buildUserMessage(prompt, text, schema);
308
469
 
309
470
  // Step 2: First LLM call — with sampling fallback for 'auto' provider
310
471
  // Fallback chain: Ollama → API key (handled by resolveProvider) → sampling → error
@@ -314,10 +475,17 @@ export class ExtractWithLlm {
314
475
  provider, apiKey, model, systemMessage, userMessage, maxTokens, schema
315
476
  }));
316
477
  } catch (llmErr) {
317
- // D1.3: If provider is 'auto'/'ollama' and it failed, try sampling as final fallback
478
+ // D1.3: If provider is 'auto'/'ollama' and it failed, try MCP sampling as final fallback
318
479
  if (providerParam === 'auto' || providerParam === 'ollama') {
319
480
  try {
320
- ({ rawText, usage } = await callViaSampling({ systemMessage, userMessage, maxTokens }));
481
+ const SamplingClient = await getSamplingClient();
482
+ const samplingClient = new SamplingClient();
483
+ const { text: sampledText } = await samplingClient.complete(
484
+ `${systemMessage}\n\n${userMessage}`,
485
+ { maxTokens }
486
+ );
487
+ rawText = sampledText;
488
+ usage = { input_tokens: 0, output_tokens: 0 };
321
489
  resolvedModel = 'sampling';
322
490
  } catch (samplingErr) {
323
491
  return { success: false, error: `LLM call failed: ${llmErr.message}. Sampling fallback also failed: ${samplingErr.message}` };
@@ -362,13 +530,26 @@ export class ExtractWithLlm {
362
530
  }
363
531
  }
364
532
 
365
- return {
533
+ // C3: surface truncation metadata so callers know the input was clipped
534
+ const result = {
366
535
  success: true,
367
536
  data: parsed,
368
537
  provider: resolvedModel === 'sampling' ? 'sampling' : provider,
369
538
  model: resolvedModel || model,
370
539
  usage
371
540
  };
541
+ if (inputTruncated) {
542
+ result.truncated = true;
543
+ result.original_length = original_length;
544
+ }
545
+ // C3: validate output against the schema hint (zod). Non-fatal — the data
546
+ // is still returned; callers can inspect `valid`/`validationErrors`.
547
+ if (schema && Object.keys(schema).length > 0) {
548
+ const { valid, errors } = validateAgainstSchema(parsed, schema);
549
+ result.valid = valid;
550
+ if (!valid) result.validationErrors = errors;
551
+ }
552
+ return result;
372
553
  }
373
554
  }
374
555
 
@@ -41,14 +41,25 @@ export class ListOllamaModelsTool {
41
41
  return { success: false, baseUrl, error: `Invalid JSON from Ollama: ${err.message}` };
42
42
  }
43
43
 
44
- const models = (data.models || []).map((m) => ({
45
- name: m.name,
46
- size_bytes: m.size,
47
- modified_at: m.modified_at,
48
- family: m.details?.family,
49
- parameter_size: m.details?.parameter_size,
50
- quantization: m.details?.quantization_level
51
- }));
44
+ // C3: harden against non-array response; normalize modified_at to ISO 8601.
45
+ const rawModels = Array.isArray(data.models) ? data.models :
46
+ Array.isArray(data) ? data : [];
47
+
48
+ const models = rawModels.map((m) => {
49
+ let modified_at = m.modified_at ?? null;
50
+ if (modified_at !== null) {
51
+ const d = new Date(modified_at);
52
+ modified_at = isNaN(d.getTime()) ? modified_at : d.toISOString();
53
+ }
54
+ return {
55
+ name: m.name,
56
+ size_bytes: m.size,
57
+ modified_at,
58
+ family: m.details?.family,
59
+ parameter_size: m.details?.parameter_size,
60
+ quantization: m.details?.quantization_level
61
+ };
62
+ });
52
63
 
53
64
  return {
54
65
  success: true,
@@ -19,7 +19,12 @@ const ProcessDocumentSchema = z.object({
19
19
  extractMetadata: z.boolean().default(true),
20
20
  password: z.string().optional(),
21
21
  maxPages: z.number().min(1).max(500).default(100),
22
-
22
+ // C3: extract a specific 1-based, inclusive page range from a PDF
23
+ pageRange: z.object({
24
+ start: z.number().min(1).default(1),
25
+ end: z.number().min(1).optional()
26
+ }).optional(),
27
+
23
28
  // Web content options
24
29
  useReadability: z.boolean().default(true),
25
30
  extractStructuredData: z.boolean().default(true),
@@ -195,7 +200,8 @@ export class ProcessDocumentTool {
195
200
  extractText: options.extractText,
196
201
  extractMetadata: options.extractMetadata,
197
202
  password: options.password,
198
- maxPages: options.maxPages
203
+ maxPages: options.maxPages,
204
+ ...(options.pageRange ? { pageRange: options.pageRange } : {})
199
205
  }
200
206
  });
201
207
 
@@ -273,7 +279,7 @@ export class ProcessDocumentTool {
273
279
  headers: {
274
280
  'User-Agent': 'Mozilla/5.0 (compatible; MCP-WebScraper/3.0; Document-Processor)'
275
281
  },
276
- timeout: 15000
282
+ signal: AbortSignal.timeout(15000)
277
283
  });
278
284
 
279
285
  if (!response.ok) {
@@ -506,4 +512,4 @@ export class ProcessDocumentTool {
506
512
  }
507
513
  }
508
514
 
509
- export default ProcessDocumentTool;
515
+ export default ProcessDocumentTool;
@@ -68,6 +68,8 @@ const SummarizeContentResult = z.object({
68
68
  summarizedAt: z.string(),
69
69
  processingTime: z.number(),
70
70
  success: z.boolean(),
71
+ degraded: z.boolean().optional(),
72
+ degradedReason: z.string().optional(),
71
73
  error: z.string().optional()
72
74
  });
73
75
 
@@ -131,11 +133,17 @@ export class SummarizeContentTool {
131
133
  // Step 2: Set summary result
132
134
  result.summary = analysisResult.summary;
133
135
 
134
- // D1.3: If abstractive mode requested, attempt sampling-based enhancement
136
+ // D1.3: If abstractive mode requested, attempt sampling-based enhancement.
137
+ // If it can't run (no LLM/sampling available), fall back to the extractive
138
+ // result but flag it explicitly rather than silently masking.
135
139
  if (options.summaryType === 'abstractive') {
136
140
  const abstractive = await this._abstractiveSummaryViaSampling(text, analysisResult.summary, options.summaryLength);
137
141
  if (abstractive) {
138
142
  result.summary = abstractive;
143
+ } else {
144
+ result.summary = { ...result.summary, type: 'extractive' };
145
+ result.degraded = true;
146
+ result.degradedReason = 'Abstractive summarization unavailable (no LLM/sampling backend); returned extractive summary instead.';
139
147
  }
140
148
  }
141
149
 
@@ -191,6 +199,55 @@ export class SummarizeContentTool {
191
199
  }
192
200
  }
193
201
 
202
+ /**
203
+ * Generate an abstractive summary via the MCP SamplingClient fallback chain
204
+ * (Ollama → OpenAI → Anthropic → MCP sampling). Returns a summary object in the
205
+ * same shape as the extractive result, or null if no backend is available.
206
+ * @param {string} text - Full original text
207
+ * @param {Object} extractiveSummary - The extractive summary (for shape/fallback)
208
+ * @param {string} summaryLength - 'short' | 'medium' | 'long'
209
+ * @returns {Promise<Object|null>}
210
+ */
211
+ async _abstractiveSummaryViaSampling(text, extractiveSummary, summaryLength) {
212
+ try {
213
+ const SamplingClient = await getSamplingClient();
214
+ const client = new SamplingClient();
215
+
216
+ const lengthGuide = {
217
+ short: '1-2 sentences',
218
+ medium: '3-5 sentences',
219
+ long: '6-10 sentences'
220
+ }[summaryLength] || '3-5 sentences';
221
+
222
+ const prompt =
223
+ `Write a concise, fluent abstractive summary (${lengthGuide}) of the following text. ` +
224
+ `Capture the main ideas in your own words. Respond with only the summary text.\n\n` +
225
+ `${text.slice(0, 12000)}`;
226
+
227
+ const { text: summaryText } = await client.complete(prompt, { maxTokens: 600 });
228
+ if (!summaryText || !summaryText.trim()) {
229
+ return null;
230
+ }
231
+
232
+ const cleaned = summaryText.trim();
233
+ const sentences = splitSentences(cleaned);
234
+ const compressionRatio = text.length > 0
235
+ ? Math.round((cleaned.length / text.length) * 1000) / 1000
236
+ : 0;
237
+
238
+ return {
239
+ text: cleaned,
240
+ sentences,
241
+ type: 'abstractive',
242
+ length: summaryLength,
243
+ compressionRatio
244
+ };
245
+ } catch {
246
+ // No sampling/LLM backend available — caller falls back to extractive.
247
+ return null;
248
+ }
249
+ }
250
+
194
251
  /**
195
252
  * Extract key points from original text and summary
196
253
  * @param {string} originalText - Original text