crawlforge-mcp-server 4.6.4 → 4.6.5

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "crawlforge-mcp-server",
3
- "version": "4.6.4",
3
+ "version": "4.6.5",
4
4
  "mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
5
5
  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
6
6
  "main": "server.js",
@@ -18,7 +18,7 @@
18
18
  "test": "node tests/integration/mcp-protocol-compliance.test.js",
19
19
  "test:unit": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/unit/*.test.js'",
20
20
  "test:integration": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/integration/tools/*.test.js'",
21
- "test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
21
+ "test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test --test-force-exit 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
22
22
  "test:tools": "node test-tools.js",
23
23
  "test:real-world": "node test-real-world.js",
24
24
  "test:all": "bash run-all-tests.sh",
package/server.js CHANGED
@@ -90,7 +90,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
90
90
  // Create the server
91
91
  const server = new McpServer({
92
92
  name: "crawlforge",
93
- version: "4.6.4",
93
+ version: "4.6.5",
94
94
  description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
95
95
  homepage: "https://www.crawlforge.dev",
96
96
  icon: "https://www.crawlforge.dev/icon.png"
@@ -34,12 +34,14 @@ export class ResearchOrchestrator extends EventEmitter {
34
34
  enableConflictDetection = true,
35
35
  cacheEnabled = true,
36
36
  cacheTTL = 1800000, // 30 minutes
37
+ researchApproach = 'broad',
37
38
  searchConfig = {},
38
39
  crawlConfig = {},
39
40
  extractConfig = {},
40
41
  summarizeConfig = {}
41
42
  } = options;
42
43
 
44
+ this.researchApproach = researchApproach;
43
45
  this.maxDepth = Math.min(Math.max(1, maxDepth), 10);
44
46
  this.maxUrls = Math.min(Math.max(1, maxUrls), 1000);
45
47
  this.timeLimit = Math.min(Math.max(30000, timeLimit), 300000);
@@ -269,32 +271,50 @@ export class ResearchOrchestrator extends EventEmitter {
269
271
  }
270
272
 
271
273
  /**
272
- * Generate research-specific query variations
274
+ * Generate research-specific query variations, tuned to the research approach.
275
+ *
276
+ * Academic/scientific query modifiers ("peer reviewed", "research paper", "what is")
277
+ * only help when the caller actually asked for an academic search. Appending
278
+ * them to commercial or comparative topics dragged web search toward
279
+ * irrelevant government/academic PDFs and long-tail noise — the cause of
280
+ * near-empty research runs on niche commercial topics.
273
281
  */
274
282
  generateResearchVariations(topic) {
275
- const variations = [];
276
-
277
- // Question-based variations
278
- variations.push(`what is ${topic}`);
279
- variations.push(`how does ${topic} work`);
280
- variations.push(`${topic} explained`);
281
- variations.push(`${topic} research`);
282
- variations.push(`${topic} studies`);
283
- variations.push(`${topic} analysis`);
284
-
285
- // Academic and authoritative variations
286
- variations.push(`${topic} academic`);
287
- variations.push(`${topic} scientific`);
288
- variations.push(`${topic} research paper`);
289
- variations.push(`${topic} peer reviewed`);
290
-
291
- // Current and historical context
292
- variations.push(`latest ${topic}`);
293
- variations.push(`current ${topic}`);
294
- variations.push(`${topic} 2024`);
295
- variations.push(`${topic} trends`);
296
-
297
- return variations.slice(0, 10); // Limit variations
283
+ const approach = this.researchApproach || 'broad';
284
+
285
+ if (approach === 'academic') {
286
+ return [
287
+ `${topic} research`,
288
+ `${topic} study`,
289
+ `${topic} analysis`,
290
+ `${topic} academic`,
291
+ `${topic} scientific`,
292
+ `${topic} research paper`,
293
+ `${topic} peer reviewed`,
294
+ `${topic} explained`
295
+ ];
296
+ }
297
+
298
+ if (approach === 'current_events') {
299
+ return [
300
+ `latest ${topic}`,
301
+ `${topic} news`,
302
+ `recent ${topic}`,
303
+ `${topic} update`,
304
+ `${topic} announcement`
305
+ ];
306
+ }
307
+
308
+ // broad / focused / comparative — commercial & general intent
309
+ return [
310
+ `${topic} review`,
311
+ `${topic} reviews`,
312
+ `${topic} comparison`,
313
+ `${topic} vs alternatives`,
314
+ `${topic} pricing`,
315
+ `best ${topic}`,
316
+ `${topic} company`
317
+ ];
298
318
  }
299
319
 
300
320
  /**
@@ -644,8 +664,19 @@ export class ResearchOrchestrator extends EventEmitter {
644
664
  citationPotential: this.assessCitationPotential(source)
645
665
  };
646
666
 
647
- const overallCredibility = this.calculateOverallCredibility(credibilityFactors);
648
-
667
+ let overallCredibility = this.calculateOverallCredibility(credibilityFactors);
668
+
669
+ // Down-weight topically-irrelevant sources so high-authority but
670
+ // off-topic pages (e.g. a .gov PDF unrelated to the query) don't
671
+ // dominate the results. relevanceScore is keyword-based here (no LLM):
672
+ // ~1 when the topic appears in the content, ~0 when it doesn't.
673
+ const relevance = typeof source.relevanceScore === 'number'
674
+ ? source.relevanceScore
675
+ : null;
676
+ if (relevance !== null) {
677
+ overallCredibility *= (0.4 + 0.6 * relevance);
678
+ }
679
+
649
680
  // Only include sources that meet minimum credibility threshold
650
681
  if (overallCredibility >= 0.3) {
651
682
  verifiedSources.push({
@@ -271,7 +271,11 @@ export class DeepResearchTool {
271
271
  const scopeConfig = {
272
272
  maxUrls: params.maxUrls,
273
273
  timeLimit: params.timeLimit,
274
- concurrency: params.concurrency
274
+ concurrency: params.concurrency,
275
+ // The orchestrator tunes its query expansion to the approach (commercial
276
+ // vs academic vs current-events); without this it always used academic
277
+ // variations, which poisoned commercial/comparative searches.
278
+ researchApproach: params.researchApproach
275
279
  };
276
280
 
277
281
  switch (params.researchApproach) {