npm - crawlforge-mcp-server - Versions diffs - 4.7.0 → 4.7.2 - Mend

crawlforge-mcp-server 4.7.0 → 4.7.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (11) hide show

package/README.md +2 -2
package/package.json +1 -1
package/server.js +14 -1
package/src/core/ActionExecutor.js +30 -6
package/src/core/LLMsTxtAnalyzer.js +10 -1
package/src/core/ResearchOrchestrator.js +5 -3
package/src/resources/ResourceRegistry.js +3 -0
package/src/tools/advanced/ScrapeWithActionsTool.js +7 -0
package/src/tools/extract/extractStructured.js +43 -0
package/src/tools/llmstxt/generateLLMsTxt.js +3 -1
package/src/tools/research/deepResearch.js +4 -1

package/README.md CHANGED Viewed

@@ -199,7 +199,7 @@ For the full canonical capabilities reference (all tools, CLI commands, stealth
 | **Business** ($399) | 250,000 | Large scale operations |
 **All plans include:**
-- Access to all 26 tools (the 15 local tools never consume credits)
+- Access to all 26 tools
 - Credits never expire and roll over month-to-month
 - API access and webhook notifications
@@ -298,7 +298,7 @@ Once configured, use these tools in your AI assistant:
 ## 🔒 Security & Privacy
-- **Secure Authentication**: API keys required for all metered premium tools (the 15 free local tools run without one)
+- **Secure Authentication**: API keys required for all metered tools
 - **Local Storage**: API keys stored securely at `~/.crawlforge/config.json`
 - **HTTPS Only**: All connections use encrypted HTTPS
 - **No Data Retention**: We don't store scraped data, only usage logs

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "crawlforge-mcp-server",
-  "version": "4.7.0",
+  "version": "4.7.2",
   "mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
   "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
   "main": "server.js",

package/server.js CHANGED Viewed

@@ -89,7 +89,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
 // Create the server
 const server = new McpServer({
   name: "crawlforge",
-  version: "4.7.0",
+  version: "4.7.2",
   description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
   homepage: "https://www.crawlforge.dev",
   icon: "https://www.crawlforge.dev/icon.png"
@@ -736,6 +736,19 @@ server.registerTool("scrape_with_actions", {
 }, withAuth("scrape_with_actions", async (params) => {
   try {
     const result = await scrapeWithActionsTool.execute(params);
+    // Publish captured screenshots as crawlforge://screenshot/{actionId}
+    // resources (the documented contract) and annotate each with its URI.
+    if (Array.isArray(result.screenshots)) {
+      result.screenshots = result.screenshots.map((shot) => {
+        if (shot?.actionId && shot?.data) {
+          resourceRegistry.storeScreenshot(shot.actionId, shot.data);
+          return { ...shot, resourceUri: `crawlforge://screenshot/${shot.actionId}` };
+        }
+        return shot;
+      });
+    }
     return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
   } catch (error) {
     return { content: [{ type: "text", text: `Scrape with actions failed: ${error.message}` }], isError: true };

package/src/core/ActionExecutor.js CHANGED Viewed

@@ -23,8 +23,8 @@ const WaitActionSchema = BaseActionSchema.extend({
   selector: z.string().optional(),
   condition: z.enum(['visible', 'hidden', 'enabled', 'disabled', 'stable']).optional(),
   text: z.string().optional()
-}).refine(data => data.duration || data.milliseconds || data.selector || data.text, {
-  message: 'Wait action requires duration/milliseconds, selector, or text'
+}).refine(data => data.duration || data.milliseconds || data.timeout || data.selector || data.text, {
+  message: 'Wait action requires duration/milliseconds/timeout, selector, or text'
 });
 const ClickActionSchema = BaseActionSchema.extend({
@@ -329,6 +329,18 @@ export class ActionExecutor extends EventEmitter {
           executionContext.results.push(actionResult);
           this.stats.totalActions++;
+          // Collect screenshots produced by successful screenshot actions so
+          // they surface in the tool result (not just error screenshots).
+          if (actionResult.success && action.type === 'screenshot' && actionResult.result?.data) {
+            executionContext.screenshots.push({
+              actionId: actionResult.id,
+              data: actionResult.result.data,
+              format: actionResult.result.format,
+              fullPage: actionResult.result.fullPage,
+              timestamp: actionResult.timestamp
+            });
+          }
           if (actionResult.success) {
             this.stats.successfulActions++;
           } else {
@@ -382,7 +394,16 @@ export class ActionExecutor extends EventEmitter {
       this.emit('actionStarted', { actionId, action, chainId: executionContext.id });
       let result;
-      const timeout = action.timeout || this.defaultTimeout;
+      let timeout = action.timeout || this.defaultTimeout;
+      // A `wait` action that uses `timeout` as its pause duration (no
+      // duration/milliseconds/selector/text) must not also use that same value
+      // as its abort deadline, or the abort would race the wait. Give headroom.
+      if (action.type === 'wait' &&
+          !action.duration && !action.milliseconds && !action.selector && !action.text &&
+          action.timeout) {
+        timeout = Math.max(this.defaultTimeout, action.timeout + 5000);
+      }
       // Execute based on action type with timeout
       const executionPromise = this.executeActionByType(page, action);
@@ -467,8 +488,11 @@ export class ActionExecutor extends EventEmitter {
    * @returns {Promise<Object>} Wait result
    */
   async executeWaitAction(page, action) {
-    // Handle both 'duration' and 'milliseconds' for backwards compatibility
-    const waitTime = action.duration || action.milliseconds;
+    // Handle 'duration'/'milliseconds' (and 'timeout' as a pause duration only
+    // when no selector/text is given — selector/text waits use 'timeout' as
+    // their abort deadline instead).
+    const waitTime = action.duration || action.milliseconds ||
+      (!action.selector && !action.text ? action.timeout : undefined);
     if (waitTime) {
       await this.delay(waitTime);
       return { waited: waitTime };
@@ -492,7 +516,7 @@ export class ActionExecutor extends EventEmitter {
       return { text: action.text };
     }
-    throw new Error('Wait action requires duration, selector, or text');
+    throw new Error('Wait action requires duration/milliseconds/timeout, selector, or text');
   }
   /**

package/src/core/LLMsTxtAnalyzer.js CHANGED Viewed

@@ -50,7 +50,16 @@ export class LLMsTxtAnalyzer {
       apis: [],
       contentTypes: {},
       securityAreas: [],
-      rateLimit: {},
+      // Conservative defaults so output never renders `undefined` when live
+      // rate-limit probing is skipped (analyzeRateLimiting only runs with
+      // probeRateLimit:true). Overwritten with measured values when probed.
+      rateLimit: {
+        recommendedDelay: 1000,
+        maxConcurrency: 5,
+        recommendedRPM: 30,
+        reasoning: 'Conservative defaults applied; live rate-limit probing was not performed (pass probeRateLimit:true to measure actual response times).',
+        averageResponseTime: null
+      },
       guidelines: {},
       metadata: {},
       errors: []

package/src/core/ResearchOrchestrator.js CHANGED Viewed

@@ -32,6 +32,7 @@ export class ResearchOrchestrator extends EventEmitter {
       concurrency = 5,
       enableSourceVerification = true,
       enableConflictDetection = true,
+      credibilityThreshold = 0.3,
       cacheEnabled = true,
       cacheTTL = 1800000, // 30 minutes
       researchApproach = 'broad',
@@ -61,6 +62,7 @@ export class ResearchOrchestrator extends EventEmitter {
     this.concurrency = Math.min(Math.max(1, concurrency), 20);
     this.enableSourceVerification = enableSourceVerification;
     this.enableConflictDetection = enableConflictDetection;
+    this.credibilityThreshold = Math.min(Math.max(0, credibilityThreshold), 1);
     // Stealth fallback config + lazy state (browser launched only on first block)
     this.enableStealthFallback = enableStealthFallback;
@@ -859,7 +861,7 @@ export class ResearchOrchestrator extends EventEmitter {
         }
         // Only include sources that meet minimum credibility threshold
-        if (overallCredibility >= 0.3) {
+        if (overallCredibility >= this.credibilityThreshold) {
           verifiedSources.push({
             ...source,
             credibilityFactors,
@@ -1360,7 +1362,7 @@ export class ResearchOrchestrator extends EventEmitter {
   generateKeyFindings(claimGroups, sources) {
     return claimGroups
-      .filter(group => group.avgCredibility >= 0.3)
+      .filter(group => group.avgCredibility >= this.credibilityThreshold)
       .sort((a, b) => b.consensusStrength - a.consensusStrength)
       .slice(0, 10)
       .map(group => ({
@@ -1373,7 +1375,7 @@ export class ResearchOrchestrator extends EventEmitter {
   compileSupportingEvidence(sources) {
     return sources
-      .filter(source => source.overallCredibility >= 0.3)
+      .filter(source => source.overallCredibility >= this.credibilityThreshold)
       .map(source => ({
         title: source.title,
         url: source.link,

package/src/resources/ResourceRegistry.js CHANGED Viewed

@@ -167,6 +167,9 @@ export class ResourceRegistry {
    * @returns {{ contents: Array<{ uri: string, mimeType: string, text?: string, blob?: string }> }}
    */
   async readResource(uri) {
+    // The MCP SDK hands the read callback a URL object, not a string; coerce so
+    // the sub-readers and parseResourceUri (which calls String#startsWith) work.
+    uri = typeof uri === 'string' ? uri : (uri?.href ?? String(uri));
     const parsed = parseResourceUri(uri);
     if (!parsed) {
       throw new Error(`Unknown resource URI: ${uri}`);

package/src/tools/advanced/ScrapeWithActionsTool.js CHANGED Viewed

@@ -619,6 +619,13 @@ export class ScrapeWithActionsTool extends EventEmitter {
         customSelectors: params.extractionOptions?.selectors
       };
+      // extractContent only emits content.markdown when explicitly asked; honor
+      // a requested "markdown" format so generateFormats doesn't fall back to a
+      // "Content not available in markdown format" placeholder.
+      if (params.formats?.includes('markdown')) {
+        options.outputFormat = 'markdown';
+      }
       // Prefer the post-action live page HTML captured during action execution.
       // This ensures the final content reflects clicks/typing/navigation rather
       // than re-fetching the original (pre-action) URL.

package/src/tools/extract/extractStructured.js CHANGED Viewed

@@ -15,6 +15,22 @@ const _pkg = _require('../../../package.json');
 const CRAWLFORGE_UA = `CrawlForge/${_pkg.version} (+https://crawlforge.dev)`;
 import { fetchAndParse } from './_fetchAndParse.js';
+// Semantic element selectors for well-known field names, tried as a last
+// resort in the CSS fallback so common fields (e.g. "title") still resolve when
+// no LLM provider and no selectorHints are available. Element/text selectors
+// only — meta tags are already handled separately above.
+const SEMANTIC_FIELD_SELECTORS = {
+  title: ['h1', 'title'],
+  name: ['h1', 'title'],
+  heading: ['h1', 'h2'],
+  headline: ['h1', 'h2'],
+  description: ['article p', 'main p', '.description', 'p'],
+  summary: ['article p', 'main p', 'p'],
+  author: ['[rel="author"]', '.author', '.byline'],
+  date: ['time', '.date'],
+  published: ['time', '.published', '.date']
+};
 const ExtractStructuredSchema = z.object({
   url: z.string().url(),
   schema: z.object({
@@ -245,6 +261,33 @@ export class ExtractStructuredTool {
           }
         }
       }
+      // Last resort: semantic element selectors for well-known field names
+      // (e.g. title -> <h1>/<title>) so common fields resolve without hints.
+      if (!(key in extracted)) {
+        const semanticSelectors = SEMANTIC_FIELD_SELECTORS[key.toLowerCase()];
+        if (semanticSelectors) {
+          for (const sel of semanticSelectors) {
+            const el = $(sel);
+            if (el.length === 0) continue;
+            if (isArrayField && el.length > 1) {
+              const values = el.map((_, item) => $(item).text().trim()).get().filter(Boolean);
+              if (values.length > 0) {
+                extracted[key] = values;
+                fieldsFound++;
+                break;
+              }
+            } else {
+              const rawValue = el.first().text().trim();
+              if (rawValue) {
+                extracted[key] = this._coerceValue(rawValue, fieldSchema);
+                fieldsFound++;
+                break;
+              }
+            }
+          }
+        }
+      }
     }
     if (fieldsFound === 0) {

package/src/tools/llmstxt/generateLLMsTxt.js CHANGED Viewed

@@ -391,7 +391,9 @@ export class GenerateLLMsTxtTool {
       lines.push('');
       lines.push('### Technical Justification');
       lines.push(`${analysis.rateLimit.reasoning}`);
-      lines.push(`Average response time: ${analysis.rateLimit.averageResponseTime}ms`);
+      if (analysis.rateLimit.averageResponseTime != null) {
+        lines.push(`Average response time: ${analysis.rateLimit.averageResponseTime}ms`);
+      }
       lines.push('');
     }

package/src/tools/research/deepResearch.js CHANGED Viewed

@@ -272,6 +272,10 @@ export class DeepResearchTool {
       maxUrls: params.maxUrls,
       timeLimit: params.timeLimit,
       concurrency: params.concurrency,
+      // Minimum credibility a source must clear in verifySourceCredibility.
+      // Must be on the orchestrator *constructor* config (not the
+      // conductResearch options) — that is the only place it is now read.
+      credibilityThreshold: params.credibilityThreshold,
       // The orchestrator tunes its query expansion to the approach (commercial
       // vs academic vs current-events); without this it always used academic
       // variations, which poisoned commercial/comparative searches.
@@ -356,7 +360,6 @@ export class DeepResearchTool {
   buildResearchOptions(params) {
     return {
       sourceTypes: params.sourceTypes,
-      credibilityThreshold: params.credibilityThreshold,
       includeRecentOnly: params.includeRecentOnly,
       queryExpansion: params.queryExpansion,
       enableConflictDetection: params.enableConflictDetection,