crawlforge-mcp-server 4.7.1 → 4.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "crawlforge-mcp-server",
3
- "version": "4.7.1",
3
+ "version": "4.7.2",
4
4
  "mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
5
5
  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
6
6
  "main": "server.js",
package/server.js CHANGED
@@ -89,7 +89,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
89
89
  // Create the server
90
90
  const server = new McpServer({
91
91
  name: "crawlforge",
92
- version: "4.7.1",
92
+ version: "4.7.2",
93
93
  description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
94
94
  homepage: "https://www.crawlforge.dev",
95
95
  icon: "https://www.crawlforge.dev/icon.png"
@@ -736,6 +736,19 @@ server.registerTool("scrape_with_actions", {
736
736
  }, withAuth("scrape_with_actions", async (params) => {
737
737
  try {
738
738
  const result = await scrapeWithActionsTool.execute(params);
739
+
740
+ // Publish captured screenshots as crawlforge://screenshot/{actionId}
741
+ // resources (the documented contract) and annotate each with its URI.
742
+ if (Array.isArray(result.screenshots)) {
743
+ result.screenshots = result.screenshots.map((shot) => {
744
+ if (shot?.actionId && shot?.data) {
745
+ resourceRegistry.storeScreenshot(shot.actionId, shot.data);
746
+ return { ...shot, resourceUri: `crawlforge://screenshot/${shot.actionId}` };
747
+ }
748
+ return shot;
749
+ });
750
+ }
751
+
739
752
  return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
740
753
  } catch (error) {
741
754
  return { content: [{ type: "text", text: `Scrape with actions failed: ${error.message}` }], isError: true };
@@ -23,8 +23,8 @@ const WaitActionSchema = BaseActionSchema.extend({
23
23
  selector: z.string().optional(),
24
24
  condition: z.enum(['visible', 'hidden', 'enabled', 'disabled', 'stable']).optional(),
25
25
  text: z.string().optional()
26
- }).refine(data => data.duration || data.milliseconds || data.selector || data.text, {
27
- message: 'Wait action requires duration/milliseconds, selector, or text'
26
+ }).refine(data => data.duration || data.milliseconds || data.timeout || data.selector || data.text, {
27
+ message: 'Wait action requires duration/milliseconds/timeout, selector, or text'
28
28
  });
29
29
 
30
30
  const ClickActionSchema = BaseActionSchema.extend({
@@ -329,6 +329,18 @@ export class ActionExecutor extends EventEmitter {
329
329
  executionContext.results.push(actionResult);
330
330
  this.stats.totalActions++;
331
331
 
332
+ // Collect screenshots produced by successful screenshot actions so
333
+ // they surface in the tool result (not just error screenshots).
334
+ if (actionResult.success && action.type === 'screenshot' && actionResult.result?.data) {
335
+ executionContext.screenshots.push({
336
+ actionId: actionResult.id,
337
+ data: actionResult.result.data,
338
+ format: actionResult.result.format,
339
+ fullPage: actionResult.result.fullPage,
340
+ timestamp: actionResult.timestamp
341
+ });
342
+ }
343
+
332
344
  if (actionResult.success) {
333
345
  this.stats.successfulActions++;
334
346
  } else {
@@ -382,7 +394,16 @@ export class ActionExecutor extends EventEmitter {
382
394
  this.emit('actionStarted', { actionId, action, chainId: executionContext.id });
383
395
 
384
396
  let result;
385
- const timeout = action.timeout || this.defaultTimeout;
397
+ let timeout = action.timeout || this.defaultTimeout;
398
+
399
+ // A `wait` action that uses `timeout` as its pause duration (no
400
+ // duration/milliseconds/selector/text) must not also use that same value
401
+ // as its abort deadline, or the abort would race the wait. Give headroom.
402
+ if (action.type === 'wait' &&
403
+ !action.duration && !action.milliseconds && !action.selector && !action.text &&
404
+ action.timeout) {
405
+ timeout = Math.max(this.defaultTimeout, action.timeout + 5000);
406
+ }
386
407
 
387
408
  // Execute based on action type with timeout
388
409
  const executionPromise = this.executeActionByType(page, action);
@@ -467,8 +488,11 @@ export class ActionExecutor extends EventEmitter {
467
488
  * @returns {Promise<Object>} Wait result
468
489
  */
469
490
  async executeWaitAction(page, action) {
470
- // Handle both 'duration' and 'milliseconds' for backwards compatibility
471
- const waitTime = action.duration || action.milliseconds;
491
+ // Handle 'duration'/'milliseconds' (and 'timeout' as a pause duration only
492
+ // when no selector/text is given — selector/text waits use 'timeout' as
493
+ // their abort deadline instead).
494
+ const waitTime = action.duration || action.milliseconds ||
495
+ (!action.selector && !action.text ? action.timeout : undefined);
472
496
  if (waitTime) {
473
497
  await this.delay(waitTime);
474
498
  return { waited: waitTime };
@@ -492,7 +516,7 @@ export class ActionExecutor extends EventEmitter {
492
516
  return { text: action.text };
493
517
  }
494
518
 
495
- throw new Error('Wait action requires duration, selector, or text');
519
+ throw new Error('Wait action requires duration/milliseconds/timeout, selector, or text');
496
520
  }
497
521
 
498
522
  /**
@@ -167,6 +167,9 @@ export class ResourceRegistry {
167
167
  * @returns {{ contents: Array<{ uri: string, mimeType: string, text?: string, blob?: string }> }}
168
168
  */
169
169
  async readResource(uri) {
170
+ // The MCP SDK hands the read callback a URL object, not a string; coerce so
171
+ // the sub-readers and parseResourceUri (which calls String#startsWith) work.
172
+ uri = typeof uri === 'string' ? uri : (uri?.href ?? String(uri));
170
173
  const parsed = parseResourceUri(uri);
171
174
  if (!parsed) {
172
175
  throw new Error(`Unknown resource URI: ${uri}`);
@@ -619,6 +619,13 @@ export class ScrapeWithActionsTool extends EventEmitter {
619
619
  customSelectors: params.extractionOptions?.selectors
620
620
  };
621
621
 
622
+ // extractContent only emits content.markdown when explicitly asked; honor
623
+ // a requested "markdown" format so generateFormats doesn't fall back to a
624
+ // "Content not available in markdown format" placeholder.
625
+ if (params.formats?.includes('markdown')) {
626
+ options.outputFormat = 'markdown';
627
+ }
628
+
622
629
  // Prefer the post-action live page HTML captured during action execution.
623
630
  // This ensures the final content reflects clicks/typing/navigation rather
624
631
  // than re-fetching the original (pre-action) URL.
@@ -15,6 +15,22 @@ const _pkg = _require('../../../package.json');
15
15
  const CRAWLFORGE_UA = `CrawlForge/${_pkg.version} (+https://crawlforge.dev)`;
16
16
  import { fetchAndParse } from './_fetchAndParse.js';
17
17
 
18
+ // Semantic element selectors for well-known field names, tried as a last
19
+ // resort in the CSS fallback so common fields (e.g. "title") still resolve when
20
+ // no LLM provider and no selectorHints are available. Element/text selectors
21
+ // only — meta tags are already handled separately above.
22
+ const SEMANTIC_FIELD_SELECTORS = {
23
+ title: ['h1', 'title'],
24
+ name: ['h1', 'title'],
25
+ heading: ['h1', 'h2'],
26
+ headline: ['h1', 'h2'],
27
+ description: ['article p', 'main p', '.description', 'p'],
28
+ summary: ['article p', 'main p', 'p'],
29
+ author: ['[rel="author"]', '.author', '.byline'],
30
+ date: ['time', '.date'],
31
+ published: ['time', '.published', '.date']
32
+ };
33
+
18
34
  const ExtractStructuredSchema = z.object({
19
35
  url: z.string().url(),
20
36
  schema: z.object({
@@ -245,6 +261,33 @@ export class ExtractStructuredTool {
245
261
  }
246
262
  }
247
263
  }
264
+
265
+ // Last resort: semantic element selectors for well-known field names
266
+ // (e.g. title -> <h1>/<title>) so common fields resolve without hints.
267
+ if (!(key in extracted)) {
268
+ const semanticSelectors = SEMANTIC_FIELD_SELECTORS[key.toLowerCase()];
269
+ if (semanticSelectors) {
270
+ for (const sel of semanticSelectors) {
271
+ const el = $(sel);
272
+ if (el.length === 0) continue;
273
+ if (isArrayField && el.length > 1) {
274
+ const values = el.map((_, item) => $(item).text().trim()).get().filter(Boolean);
275
+ if (values.length > 0) {
276
+ extracted[key] = values;
277
+ fieldsFound++;
278
+ break;
279
+ }
280
+ } else {
281
+ const rawValue = el.first().text().trim();
282
+ if (rawValue) {
283
+ extracted[key] = this._coerceValue(rawValue, fieldSchema);
284
+ fieldsFound++;
285
+ break;
286
+ }
287
+ }
288
+ }
289
+ }
290
+ }
248
291
  }
249
292
 
250
293
  if (fieldsFound === 0) {