npm - crawlforge-mcp-server - Versions diffs - 3.0.16 → 3.0.18 - Mend

crawlforge-mcp-server 3.0.16 → 3.0.18

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/CLAUDE.md +5 -3
package/README.md +5 -2
package/package.json +17 -7
package/src/constants/config.js +2 -1
package/src/core/AuthManager.js +112 -27
package/src/core/ResearchOrchestrator.js +86 -5
package/src/core/endpointGuard.js +37 -0
package/src/tools/research/deepResearch.js +33 -8

package/CLAUDE.md CHANGED Viewed

@@ -60,7 +60,7 @@ These guidelines are working if: fewer unnecessary changes in diffs, fewer rewri
 ## Project Overview
-CrawlForge MCP Server - A professional MCP (Model Context Protocol) server providing 19 web scraping, crawling, and content processing tools.
+CrawlForge MCP Server - A professional MCP (Model Context Protocol) server providing 20 web scraping, crawling, and content processing tools.
 **Current Version:** 3.0.12
@@ -141,13 +141,13 @@ Tools are organized in subdirectories by category:
 - `tracking/` - trackChanges
 - `llmstxt/` - generateLLMsTxt
-### Available MCP Tools (19 total)
+### Available MCP Tools (20 total)
 **Basic Tools (server.js inline):**
 fetch_url, extract_text, extract_links, extract_metadata, scrape_structured
 **Advanced Tools:**
-search_web, crawl_deep, map_site, extract_content, process_document, summarize_content, analyze_content, batch_scrape, scrape_with_actions, deep_research, track_changes, generate_llms_txt, stealth_mode, localization
+search_web, crawl_deep, map_site, extract_content, process_document, summarize_content, analyze_content, extract_structured, batch_scrape, scrape_with_actions, deep_research, track_changes, generate_llms_txt, stealth_mode, localization
 ### MCP Server Entry Point
@@ -183,6 +183,8 @@ MAX_PAGES_PER_CRAWL=100
 RESPECT_ROBOTS_TXT=true
 ```
+`OPENAI_API_KEY` / `ANTHROPIC_API_KEY` are optional. They only affect `deep_research`: when set, it produces a fully synthesized report internally; when unset, it returns raw evidence for the calling LLM (e.g. Claude Code) to synthesize.
 ### Configuration Files
 - `~/.crawlforge/config.json` - User authentication and API key storage

package/README.md CHANGED Viewed

@@ -9,7 +9,7 @@ Professional web scraping and content extraction server implementing the Model C
 ## 🎯 Features
-- **18 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis
+- **20 Professional Tools**: Web scraping, deep research, stealth browsing, content analysis
 - **Free Tier**: 1,000 credits to get started instantly
 - **MCP Compatible**: Works with Claude, Cursor, and other MCP-enabled AI tools
 - **Enterprise Ready**: Scale up with paid plans for production use
@@ -111,6 +111,8 @@ Restart Cursor to activate.
 - `search_web` - Search the web using Google Search API
 - `summarize_content` - Generate intelligent summaries
 - `analyze_content` - Comprehensive content analysis
+- `extract_structured` - LLM-powered schema-driven extraction
+- `track_changes` - Monitor content changes over time
 ### Premium Tools (5-10 credits)
 - `crawl_deep` - Deep crawl entire websites
@@ -136,7 +138,7 @@ Restart Cursor to activate.
 | **Enterprise** | 250,000 | Large scale operations |
 **All plans include:**
-- Access to all 18 tools
+- Access to all 20 tools
 - Credits never expire and roll over month-to-month
 - API access and webhook notifications
@@ -152,6 +154,7 @@ export CRAWLFORGE_API_KEY="cf_live_your_api_key_here"
 # Optional: Custom API endpoint (for enterprise)
 export CRAWLFORGE_API_URL="https://api.crawlforge.dev"
+# As of v3.0.18, this variable is validated against an allow-list of CrawlForge backend hosts.
 ```
 ### Manual Configuration

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "crawlforge-mcp-server",
-  "version": "3.0.16",
-  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 19 comprehensive web scraping, crawling, and content processing tools.",
+  "version": "3.0.18",
+  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 20 comprehensive web scraping, crawling, and content processing tools.",
   "main": "server.js",
   "bin": {
     "crawlforge": "server.js",
@@ -13,6 +13,7 @@
     "setup": "node setup.js",
     "dev": "cross-env NODE_ENV=development node server.js",
     "test": "node tests/integration/mcp-protocol-compliance.test.js",
+    "test:unit": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/unit/*.test.js'",
     "test:tools": "node test-tools.js",
     "test:real-world": "node test-real-world.js",
     "test:all": "bash run-all-tests.sh",
@@ -90,15 +91,15 @@
   ],
   "dependencies": {
     "@googleapis/customsearch": "^5.0.1",
-    "@modelcontextprotocol/sdk": "^1.17.3",
+    "@modelcontextprotocol/sdk": "^1.29.0",
     "@mozilla/readability": "^0.6.0",
     "cheerio": "^1.1.2",
     "compromise": "^14.14.4",
     "diff": "^8.0.2",
     "dotenv": "^17.2.1",
     "franc": "^6.2.0",
-    "isomorphic-dompurify": "^2.26.0",
-    "jsdom": "^26.1.0",
+    "isomorphic-dompurify": "^3.9.0",
+    "jsdom": "^29.0.2",
     "lru-cache": "^11.1.0",
     "node-cron": "^3.0.3",
     "node-summarizer": "^1.0.7",
@@ -110,9 +111,18 @@
     "zod": "^3.23.8"
   },
   "devDependencies": {
-    "@jest/globals": "^30.0.5",
+    "@jest/globals": "^30.3.0",
     "cross-env": "^10.0.0",
-    "jest": "^30.0.5",
+    "jest": "^30.3.0",
     "shx": "^0.4.0"
+  },
+  "overrides": {
+    "undici": "^7.24.0",
+    "underscore": "^1.13.8",
+    "qs": "^6.14.2",
+    "path-to-regexp": "^8.4.2",
+    "@hono/node-server": "^1.19.13",
+    "hono": "^4.12.4",
+    "dompurify": "^3.4.0"
   }
 }

package/src/constants/config.js CHANGED Viewed

@@ -1,6 +1,7 @@
 import dotenv from 'dotenv';
 import { fileURLToPath } from 'url';
 import { dirname, join } from 'path';
+import { resolveApiEndpoint } from '../core/endpointGuard.js';
 // Load environment variables
 const __filename = fileURLToPath(import.meta.url);
@@ -11,7 +12,7 @@ export const config = {
   // CrawlForge API Configuration
   crawlforge: {
     apiKey: process.env.CRAWLFORGE_API_KEY || '',
-    apiBaseUrl: process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev'
+    apiBaseUrl: resolveApiEndpoint(process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev')
   },
   // Performance

package/src/core/AuthManager.js CHANGED Viewed

@@ -7,15 +7,18 @@
 import fs from 'fs/promises';
 import path from 'path';
 import { isCreatorModeVerified } from './creatorMode.js';
+import { resolveApiEndpoint } from './endpointGuard.js';
 class AuthManager {
   constructor() {
-    this.apiEndpoint = process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev';
+    this.apiEndpoint = resolveApiEndpoint(process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev');
     this.configPath = path.join(process.env.HOME || process.env.USERPROFILE, '.crawlforge', 'config.json');
+    this.pendingUsagePath = path.join(process.env.HOME || process.env.USERPROFILE, '.crawlforge', 'pending-usage.json');
     this.config = null;
     this.creditCache = new Map();
     this.lastCreditCheck = null;
-    this.CREDIT_CHECK_INTERVAL = 60000; // Check credits every minute max
+    this.lastSuccessfulCreditCheck = new Map();
+    this.CREDIT_CHECK_INTERVAL = 15000;
     this.initialized = false;
     // NOTE: Don't read creator mode in constructor - it's set dynamically in server.js
   }
@@ -48,6 +51,12 @@ class AuthManager {
       console.log('No existing CrawlForge configuration found. Run setup to configure.');
       this.initialized = true;
     }
+    try {
+      await this._flushPendingUsage();
+    } catch {
+      // Best-effort flush — do not block startup
+    }
   }
   /**
@@ -192,20 +201,16 @@ class AuthManager {
         const data = await response.json();
         this.creditCache.set(this.config.userId, data.creditsRemaining);
         this.lastCreditCheck = now;
+        this.lastSuccessfulCreditCheck.set(this.config.userId, now);
         return data.creditsRemaining >= estimatedCredits;
       }
     } catch (error) {
       console.error('Failed to check credits:', error.message);
-      // Grace period: allow stale cached credits during transient network failures
-      // This prevents outages from blocking authenticated users while still
-      // failing closed when there's no cached data (no free usage bypass)
+      const lastOk = this.lastSuccessfulCreditCheck.get(this.config.userId) ?? 0;
+      const withinGrace = Date.now() - lastOk < 30_000;
       const cached = this.creditCache.get(this.config.userId);
-      if (cached !== undefined && cached >= estimatedCredits) {
-        console.warn('Using cached credits due to network error — will re-verify on next call');
-        return true;
-      }
+      if (withinGrace && cached !== undefined && cached >= estimatedCredits) return true;
       throw new Error('Unable to verify credits. Please check your connection and try again.');
     }
   }
@@ -218,39 +223,119 @@ class AuthManager {
     if (this.isCreatorMode()) {
       return;
     }
     if (!this.config) {
       return; // Silently skip if not configured
     }
-    try {
-      const payload = {
-        tool,
-        creditsUsed,
-        requestData,
-        responseStatus,
-        processingTime,
-        timestamp: new Date().toISOString(),
-        version: '3.0.3'
-      };
+    const userId = this.config.userId;
+    // Pre-decrement cache before fetch so network failures still deplete credits
+    const cached = this.creditCache.get(userId);
+    if (cached !== undefined) {
+      this.creditCache.set(userId, Math.max(0, cached - creditsUsed));
+    }
+    const payload = {
+      tool,
+      creditsUsed,
+      requestData,
+      responseStatus,
+      processingTime,
+      timestamp: new Date().toISOString(),
+      version: '3.0.3'
+    };
+    try {
       await fetch(`${this.apiEndpoint}/api/v1/usage`, {
         method: 'POST',
         headers: {
           'Content-Type': 'application/json',
           'X-API-Key': this.config.apiKey
         },
-        body: JSON.stringify(payload)
+        body: JSON.stringify(payload),
+        signal: AbortSignal.timeout(5000)
       });
-      // Update cached credits
-      const cached = this.creditCache.get(this.config.userId);
-      if (cached !== undefined) {
-        this.creditCache.set(this.config.userId, Math.max(0, cached - creditsUsed));
-      }
+      await this._flushPendingUsage();
     } catch (error) {
       // Log but don't throw - usage reporting should not break tool execution
       console.error('Failed to report usage:', error.message);
+      await this._appendPendingUsage({ toolName: tool, creditsUsed, userId, timestamp: payload.timestamp });
+    }
+  }
+  async _appendPendingUsage(entry) {
+    try {
+      const configDir = path.dirname(this.pendingUsagePath);
+      await fs.mkdir(configDir, { recursive: true });
+      let entries = [];
+      try {
+        const raw = await fs.readFile(this.pendingUsagePath, 'utf-8');
+        entries = JSON.parse(raw);
+      } catch {
+        // File absent or corrupt — start fresh
+      }
+      entries.push(entry);
+      // Cap at 1 MB — drop oldest entries until serialized size fits
+      let serialized = JSON.stringify(entries);
+      while (serialized.length > 1_048_576 && entries.length > 1) {
+        entries.shift();
+        serialized = JSON.stringify(entries);
+      }
+      await fs.writeFile(this.pendingUsagePath, serialized, { mode: 0o600 });
+    } catch (error) {
+      console.error('Failed to append pending usage:', error.message);
+    }
+  }
+  async _flushPendingUsage() {
+    if (!this.config) return;
+    let entries;
+    try {
+      const raw = await fs.readFile(this.pendingUsagePath, 'utf-8');
+      entries = JSON.parse(raw);
+    } catch {
+      return; // Nothing to flush
+    }
+    if (!Array.isArray(entries) || entries.length === 0) return;
+    const remaining = [];
+    for (const entry of entries) {
+      try {
+        await fetch(`${this.apiEndpoint}/api/v1/usage`, {
+          method: 'POST',
+          headers: {
+            'Content-Type': 'application/json',
+            'X-API-Key': this.config.apiKey
+          },
+          body: JSON.stringify({
+            tool: entry.toolName,
+            creditsUsed: entry.creditsUsed,
+            timestamp: entry.timestamp,
+            version: '3.0.3'
+          }),
+          signal: AbortSignal.timeout(5000)
+        });
+      } catch {
+        remaining.push(entry);
+      }
+    }
+    try {
+      if (remaining.length === 0) {
+        await fs.unlink(this.pendingUsagePath);
+      } else {
+        await fs.writeFile(this.pendingUsagePath, JSON.stringify(remaining), { mode: 0o600 });
+      }
+    } catch (error) {
+      console.error('Failed to update pending usage file:', error.message);
     }
   }

package/src/core/ResearchOrchestrator.js CHANGED Viewed

@@ -120,30 +120,35 @@ export class ResearchOrchestrator extends EventEmitter {
       // Stage 1: Initial topic exploration and query expansion
       const expandedQueries = await this.expandResearchTopic(topic);
+      this.researchState.currentDepth = 1;
       this.logActivity('topic_expansion', { originalTopic: topic, expandedQueries });
       // Stage 2: Broad information gathering
       const initialSources = await this.gatherInitialSources(expandedQueries, options);
+      this.researchState.currentDepth = 2;
       this.logActivity('initial_gathering', { sourcesFound: initialSources.length });
       // Stage 3: Deep exploration of promising sources
       const detailedFindings = await this.exploreSourcesInDepth(initialSources, options);
+      this.researchState.currentDepth = 3;
       this.logActivity('deep_exploration', { findingsCount: detailedFindings.length });
       // Stage 4: Source credibility assessment
-      const verifiedSources = this.enableSourceVerification ?
+      const verifiedSources = this.enableSourceVerification ?
         await this.verifySourceCredibility(detailedFindings) : detailedFindings;
+      this.researchState.currentDepth = 4;
       this.logActivity('source_verification', { verifiedCount: verifiedSources.length });
       // Stage 5: Information synthesis and conflict detection
       const synthesizedResults = await this.synthesizeInformation(verifiedSources, topic);
+      this.researchState.currentDepth = 5;
       this.logActivity('information_synthesis', { conflictsFound: synthesizedResults.conflicts.length });
-      // Stage 6: Final result compilation
-      const finalResults = this.compileResearchResults(topic, synthesizedResults, options);
       const totalTime = Date.now() - startTime;
       this.metrics.totalProcessingTime = totalTime;
+      // Stage 6: Final result compilation
+      const finalResults = this.compileResearchResults(topic, synthesizedResults, options);
       this.logger.info('Research completed', {
         sessionId,
@@ -636,10 +641,22 @@ export class ResearchOrchestrator extends EventEmitter {
       consensus: [],
       gaps: [],
       recommendations: [],
-      llmSynthesis: null
+      llmSynthesis: null,
+      rawEvidence: null,
+      synthesisMode: this.enableLLMFeatures ? 'llm' : 'raw_evidence'
     };
     try {
+      // Without an LLM the keyword/frequency-based synthesis produces
+      // unreadable output. Skip it and return raw evidence for the calling
+      // LLM (e.g. Claude Code) to synthesize.
+      if (!this.enableLLMFeatures) {
+        synthesis.rawEvidence = this.buildRawEvidence(sources);
+        synthesis.supportingEvidence = this.compileSupportingEvidence(sources);
+        this.metrics.synthesisTime += Date.now() - startTime;
+        return synthesis;
+      }
       // Extract key claims and facts from each source
       const extractedClaims = await this.extractKeyClaims(sources);
@@ -1110,6 +1127,36 @@ export class ResearchOrchestrator extends EventEmitter {
       .slice(0, 15);
   }
+  buildRawEvidence(sources) {
+    return sources
+      .filter(s => s.extractedContent && s.extractedContent.length > 0)
+      .map(s => ({
+        title: s.title,
+        url: s.link,
+        credibility: s.overallCredibility ?? 0.5,
+        contentSnippet: s.extractedContent.substring(0, 4000),
+        topSentences: this.extractTopSentences(s.extractedContent, 5)
+      }))
+      .slice(0, 20);
+  }
+  extractTopSentences(text, n = 5) {
+    if (!text) return [];
+    const sentences = text
+      .split(/(?<=[.!?])\s+/)
+      .map(s => s.trim())
+      .filter(s => s.length >= 40 && s.length <= 500);
+    return sentences
+      .map(s => ({
+        text: s,
+        score: s.length * 0.5 + (s.match(/[A-Z][a-z]+/g)?.length || 0) * 5
+      }))
+      .sort((a, b) => b.score - a.score)
+      .slice(0, n)
+      .map(item => item.text);
+  }
   identifyResearchGaps(claimGroups, topic) {
     const gaps = [];
@@ -1158,6 +1205,40 @@ export class ResearchOrchestrator extends EventEmitter {
   }
   compileResearchResults(topic, synthesis, options) {
+    if (synthesis.synthesisMode === 'raw_evidence') {
+      const sources = synthesis.rawEvidence || [];
+      return {
+        sessionId: this.researchState.sessionId,
+        topic,
+        synthesisMode: 'raw_evidence',
+        note: "This response contains raw research evidence with no AI synthesis. The calling LLM (you) should synthesize these sources to answer the user's question. To enable internal LLM synthesis instead, set OPENAI_API_KEY or ANTHROPIC_API_KEY in the MCP server environment.",
+        sources,
+        findings: [],
+        researchSummary: {
+          totalSources: this.metrics.urlsProcessed,
+          verifiedSources: this.metrics.sourcesVerified,
+          sourcesReturned: sources.length,
+          llmEnhanced: false
+        },
+        activityLog: this.researchState.activityLog,
+        performance: {
+          ...this.metrics,
+          timeLimit: this.timeLimit,
+          completedWithinLimit: this.metrics.totalProcessingTime < this.timeLimit
+        },
+        metadata: {
+          generatedAt: new Date().toISOString(),
+          researchDepth: this.researchState.currentDepth,
+          configuration: {
+            maxDepth: this.maxDepth,
+            maxUrls: this.maxUrls,
+            timeLimit: this.timeLimit,
+            llmEnabled: false
+          }
+        }
+      };
+    }
     const baseResults = {
       sessionId: this.researchState.sessionId,
       topic,

package/src/core/endpointGuard.js ADDED Viewed

@@ -0,0 +1,37 @@
+import { isCreatorModeVerified } from './creatorMode.js';
+export const ALLOWED_HOSTS = ['www.crawlforge.dev', 'crawlforge.dev', 'api.crawlforge.dev'];
+const LOCALHOST_HOSTS = new Set(['localhost', '127.0.0.1', '::1']);
+export function resolveApiEndpoint(rawUrl) {
+  let parsed;
+  try {
+    parsed = new URL(rawUrl);
+  } catch {
+    throw new Error(`Invalid API endpoint URL: "${rawUrl}"`);
+  }
+  const hostname = parsed.hostname;
+  if (LOCALHOST_HOSTS.has(hostname)) {
+    if (!isCreatorModeVerified()) {
+      throw new Error(`Refusing to use API endpoint "${rawUrl}" — not in allow-list`);
+    }
+    // Strip trailing slash from pathname
+    parsed.pathname = parsed.pathname.replace(/\/+$/, '');
+    return parsed.toString();
+  }
+  if (parsed.protocol !== 'https:') {
+    throw new Error(`Refusing to use API endpoint "${rawUrl}" — not in allow-list`);
+  }
+  if (!ALLOWED_HOSTS.includes(hostname)) {
+    throw new Error(`Refusing to use API endpoint "${rawUrl}" — not in allow-list`);
+  }
+  // Strip trailing slash from pathname
+  parsed.pathname = parsed.pathname.replace(/\/+$/, '');
+  return parsed.toString();
+}

package/src/tools/research/deepResearch.js CHANGED Viewed

@@ -208,11 +208,20 @@ export class DeepResearchTool {
       baseConfig.llmConfig = params.llmConfig;
     }
-    // Adjust configuration based on research approach
+    // Every approach must propagate the user's scope params (maxUrls,
+    // timeLimit, concurrency) — only `broad` did before, so non-broad
+    // approaches silently fell back to orchestrator defaults.
+    const scopeConfig = {
+      maxUrls: params.maxUrls,
+      timeLimit: params.timeLimit,
+      concurrency: params.concurrency
+    };
     switch (params.researchApproach) {
       case 'academic':
         return {
           ...baseConfig,
+          ...scopeConfig,
           maxDepth: Math.min(params.maxDepth, 8),
           enableSourceVerification: true,
           searchConfig: {
@@ -225,10 +234,11 @@ export class DeepResearchTool {
             }
           }
         };
       case 'current_events':
         return {
           ...baseConfig,
+          ...scopeConfig,
           maxDepth: Math.min(params.maxDepth, 6),
           searchConfig: {
             enableRanking: true,
@@ -240,18 +250,20 @@ export class DeepResearchTool {
             }
           }
         };
       case 'focused':
         return {
           ...baseConfig,
+          ...scopeConfig,
           maxDepth: Math.min(params.maxDepth, 4),
           maxUrls: Math.min(params.maxUrls, 30),
           concurrency: Math.min(params.concurrency, 3)
         };
       case 'comparative':
         return {
           ...baseConfig,
+          ...scopeConfig,
           enableConflictDetection: true,
           maxDepth: params.maxDepth,
           searchConfig: {
@@ -263,14 +275,13 @@ export class DeepResearchTool {
             }
           }
         };
       case 'broad':
       default:
         return {
           ...baseConfig,
-          maxDepth: params.maxDepth,
-          maxUrls: params.maxUrls,
-          timeLimit: params.timeLimit
+          ...scopeConfig,
+          maxDepth: params.maxDepth
         };
     }
   }
@@ -334,6 +345,20 @@ export class DeepResearchTool {
    * Format research results according to output preferences
    */
   formatResults(results, params) {
+    // Raw evidence mode (no LLM configured): pass through the clean shape
+    // designed for the calling LLM to synthesize.
+    if (results.synthesisMode === 'raw_evidence') {
+      return {
+        synthesisMode: 'raw_evidence',
+        note: results.note,
+        sources: results.sources,
+        researchSummary: results.researchSummary,
+        metadata: results.metadata,
+        performance: results.performance,
+        activityLog: params.includeActivityLog ? results.activityLog : undefined
+      };
+    }
     const formatted = {
       researchSummary: results.researchSummary,
       metadata: results.metadata