npm - crawlforge-mcp-server - Versions diffs - 4.6.4 → 4.6.5 - Mend

crawlforge-mcp-server 4.6.4 → 4.6.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/package.json +2 -2
package/server.js +1 -1
package/src/core/ResearchOrchestrator.js +57 -26
package/src/tools/research/deepResearch.js +5 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "crawlforge-mcp-server",
-  "version": "4.6.4",
+  "version": "4.6.5",
   "mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
   "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
   "main": "server.js",
@@ -18,7 +18,7 @@
     "test": "node tests/integration/mcp-protocol-compliance.test.js",
     "test:unit": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/unit/*.test.js'",
     "test:integration": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/integration/tools/*.test.js'",
-    "test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
+    "test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test --test-force-exit 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
     "test:tools": "node test-tools.js",
     "test:real-world": "node test-real-world.js",
     "test:all": "bash run-all-tests.sh",

package/server.js CHANGED Viewed

@@ -90,7 +90,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
 // Create the server
 const server = new McpServer({
   name: "crawlforge",
-  version: "4.6.4",
+  version: "4.6.5",
   description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
   homepage: "https://www.crawlforge.dev",
   icon: "https://www.crawlforge.dev/icon.png"

package/src/core/ResearchOrchestrator.js CHANGED Viewed

@@ -34,12 +34,14 @@ export class ResearchOrchestrator extends EventEmitter {
       enableConflictDetection = true,
       cacheEnabled = true,
       cacheTTL = 1800000, // 30 minutes
+      researchApproach = 'broad',
       searchConfig = {},
       crawlConfig = {},
       extractConfig = {},
       summarizeConfig = {}
     } = options;
+    this.researchApproach = researchApproach;
     this.maxDepth = Math.min(Math.max(1, maxDepth), 10);
     this.maxUrls = Math.min(Math.max(1, maxUrls), 1000);
     this.timeLimit = Math.min(Math.max(30000, timeLimit), 300000);
@@ -269,32 +271,50 @@ export class ResearchOrchestrator extends EventEmitter {
   }
   /**
-   * Generate research-specific query variations
+   * Generate research-specific query variations, tuned to the research approach.
+   *
+   * Academic/scientific suffixes ("peer reviewed", "research paper", "what is")
+   * only help when the caller actually asked for an academic search. Appending
+   * them to commercial or comparative topics dragged web search toward
+   * irrelevant government/academic PDFs and long-tail noise — the cause of
+   * near-empty research runs on niche commercial topics.
    */
   generateResearchVariations(topic) {
-    const variations = [];
-    // Question-based variations
-    variations.push(`what is ${topic}`);
-    variations.push(`how does ${topic} work`);
-    variations.push(`${topic} explained`);
-    variations.push(`${topic} research`);
-    variations.push(`${topic} studies`);
-    variations.push(`${topic} analysis`);
-    // Academic and authoritative variations
-    variations.push(`${topic} academic`);
-    variations.push(`${topic} scientific`);
-    variations.push(`${topic} research paper`);
-    variations.push(`${topic} peer reviewed`);
-    // Current and historical context
-    variations.push(`latest ${topic}`);
-    variations.push(`current ${topic}`);
-    variations.push(`${topic} 2024`);
-    variations.push(`${topic} trends`);
-    return variations.slice(0, 10); // Limit variations
+    const approach = this.researchApproach || 'broad';
+    if (approach === 'academic') {
+      return [
+        `${topic} research`,
+        `${topic} study`,
+        `${topic} analysis`,
+        `${topic} academic`,
+        `${topic} scientific`,
+        `${topic} research paper`,
+        `${topic} peer reviewed`,
+        `${topic} explained`
+      ];
+    }
+    if (approach === 'current_events') {
+      return [
+        `latest ${topic}`,
+        `${topic} news`,
+        `recent ${topic}`,
+        `${topic} update`,
+        `${topic} announcement`
+      ];
+    }
+    // broad / focused / comparative — commercial & general intent
+    return [
+      `${topic} review`,
+      `${topic} reviews`,
+      `${topic} comparison`,
+      `${topic} vs alternatives`,
+      `${topic} pricing`,
+      `best ${topic}`,
+      `${topic} company`
+    ];
   }
   /**
@@ -644,8 +664,19 @@ export class ResearchOrchestrator extends EventEmitter {
           citationPotential: this.assessCitationPotential(source)
         };
-        const overallCredibility = this.calculateOverallCredibility(credibilityFactors);
+        let overallCredibility = this.calculateOverallCredibility(credibilityFactors);
+        // Down-weight topically-irrelevant sources so high-authority but
+        // off-topic pages (e.g. a .gov PDF unrelated to the query) don't
+        // dominate the results. relevanceScore is keyword-based here (no LLM):
+        // ~1 when the topic appears in the content, ~0 when it doesn't.
+        const relevance = typeof source.relevanceScore === 'number'
+          ? source.relevanceScore
+          : null;
+        if (relevance !== null) {
+          overallCredibility *= (0.4 + 0.6 * relevance);
+        }
         // Only include sources that meet minimum credibility threshold
         if (overallCredibility >= 0.3) {
           verifiedSources.push({

package/src/tools/research/deepResearch.js CHANGED Viewed

@@ -271,7 +271,11 @@ export class DeepResearchTool {
     const scopeConfig = {
       maxUrls: params.maxUrls,
       timeLimit: params.timeLimit,
-      concurrency: params.concurrency
+      concurrency: params.concurrency,
+      // The orchestrator tunes its query expansion to the approach (commercial
+      // vs academic vs current-events); without this it always used academic
+      // variations, which poisoned commercial/comparative searches.
+      researchApproach: params.researchApproach
     };
     switch (params.researchApproach) {