crawlforge-mcp-server 4.6.4 → 4.6.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "4.6.4",
|
|
3
|
+
"version": "4.6.5",
|
|
4
4
|
"mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
|
|
5
5
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
|
|
6
6
|
"main": "server.js",
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
"test": "node tests/integration/mcp-protocol-compliance.test.js",
|
|
19
19
|
"test:unit": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/unit/*.test.js'",
|
|
20
20
|
"test:integration": "CRAWLFORGE_CREATOR_SECRET= node --test 'tests/integration/tools/*.test.js'",
|
|
21
|
-
"test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
|
|
21
|
+
"test:coverage": "CRAWLFORGE_CREATOR_SECRET= c8 --reporter=text --reporter=lcov --include='src/**/*.js' --exclude='src/**/_*.js' --lines=60 --statements=60 --functions=55 --branches=45 node --test --test-force-exit 'tests/unit/*.test.js' 'tests/integration/tools/*.test.js'",
|
|
22
22
|
"test:tools": "node test-tools.js",
|
|
23
23
|
"test:real-world": "node test-real-world.js",
|
|
24
24
|
"test:all": "bash run-all-tests.sh",
|
package/server.js
CHANGED
|
@@ -90,7 +90,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
|
|
|
90
90
|
// Create the server
|
|
91
91
|
const server = new McpServer({
|
|
92
92
|
name: "crawlforge",
|
|
93
|
-
version: "4.6.4",
|
|
93
|
+
version: "4.6.5",
|
|
94
94
|
description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
|
|
95
95
|
homepage: "https://www.crawlforge.dev",
|
|
96
96
|
icon: "https://www.crawlforge.dev/icon.png"
|
|
@@ -34,12 +34,14 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
34
34
|
enableConflictDetection = true,
|
|
35
35
|
cacheEnabled = true,
|
|
36
36
|
cacheTTL = 1800000, // 30 minutes
|
|
37
|
+
researchApproach = 'broad',
|
|
37
38
|
searchConfig = {},
|
|
38
39
|
crawlConfig = {},
|
|
39
40
|
extractConfig = {},
|
|
40
41
|
summarizeConfig = {}
|
|
41
42
|
} = options;
|
|
42
43
|
|
|
44
|
+
this.researchApproach = researchApproach;
|
|
43
45
|
this.maxDepth = Math.min(Math.max(1, maxDepth), 10);
|
|
44
46
|
this.maxUrls = Math.min(Math.max(1, maxUrls), 1000);
|
|
45
47
|
this.timeLimit = Math.min(Math.max(30000, timeLimit), 300000);
|
|
@@ -269,32 +271,50 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
269
271
|
}
|
|
270
272
|
|
|
271
273
|
/**
|
|
272
|
-
* Generate research-specific query variations
|
|
274
|
+
* Generate research-specific query variations, tuned to the research approach.
|
|
275
|
+
*
|
|
276
|
+
* Academic/scientific suffixes ("peer reviewed", "research paper", "what is")
|
|
277
|
+
* only help when the caller actually asked for an academic search. Appending
|
|
278
|
+
* them to commercial or comparative topics dragged web search toward
|
|
279
|
+
* irrelevant government/academic PDFs and long-tail noise — the cause of
|
|
280
|
+
* near-empty research runs on niche commercial topics.
|
|
273
281
|
*/
|
|
274
282
|
generateResearchVariations(topic) {
|
|
275
|
-
const
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
283
|
+
const approach = this.researchApproach || 'broad';
|
|
284
|
+
|
|
285
|
+
if (approach === 'academic') {
|
|
286
|
+
return [
|
|
287
|
+
`${topic} research`,
|
|
288
|
+
`${topic} study`,
|
|
289
|
+
`${topic} analysis`,
|
|
290
|
+
`${topic} academic`,
|
|
291
|
+
`${topic} scientific`,
|
|
292
|
+
`${topic} research paper`,
|
|
293
|
+
`${topic} peer reviewed`,
|
|
294
|
+
`${topic} explained`
|
|
295
|
+
];
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
if (approach === 'current_events') {
|
|
299
|
+
return [
|
|
300
|
+
`latest ${topic}`,
|
|
301
|
+
`${topic} news`,
|
|
302
|
+
`recent ${topic}`,
|
|
303
|
+
`${topic} update`,
|
|
304
|
+
`${topic} announcement`
|
|
305
|
+
];
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
// broad / focused / comparative — commercial & general intent
|
|
309
|
+
return [
|
|
310
|
+
`${topic} review`,
|
|
311
|
+
`${topic} reviews`,
|
|
312
|
+
`${topic} comparison`,
|
|
313
|
+
`${topic} vs alternatives`,
|
|
314
|
+
`${topic} pricing`,
|
|
315
|
+
`best ${topic}`,
|
|
316
|
+
`${topic} company`
|
|
317
|
+
];
|
|
298
318
|
}
|
|
299
319
|
|
|
300
320
|
/**
|
|
@@ -644,8 +664,19 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
644
664
|
citationPotential: this.assessCitationPotential(source)
|
|
645
665
|
};
|
|
646
666
|
|
|
647
|
-
|
|
648
|
-
|
|
667
|
+
let overallCredibility = this.calculateOverallCredibility(credibilityFactors);
|
|
668
|
+
|
|
669
|
+
// Down-weight topically-irrelevant sources so high-authority but
|
|
670
|
+
// off-topic pages (e.g. a .gov PDF unrelated to the query) don't
|
|
671
|
+
// dominate the results. relevanceScore is keyword-based here (no LLM):
|
|
672
|
+
// ~1 when the topic appears in the content, ~0 when it doesn't.
|
|
673
|
+
const relevance = typeof source.relevanceScore === 'number'
|
|
674
|
+
? source.relevanceScore
|
|
675
|
+
: null;
|
|
676
|
+
if (relevance !== null) {
|
|
677
|
+
overallCredibility *= (0.4 + 0.6 * relevance);
|
|
678
|
+
}
|
|
679
|
+
|
|
649
680
|
// Only include sources that meet minimum credibility threshold
|
|
650
681
|
if (overallCredibility >= 0.3) {
|
|
651
682
|
verifiedSources.push({
|
|
@@ -271,7 +271,11 @@ export class DeepResearchTool {
|
|
|
271
271
|
const scopeConfig = {
|
|
272
272
|
maxUrls: params.maxUrls,
|
|
273
273
|
timeLimit: params.timeLimit,
|
|
274
|
-
concurrency: params.concurrency
|
|
274
|
+
concurrency: params.concurrency,
|
|
275
|
+
// The orchestrator tunes its query expansion to the approach (commercial
|
|
276
|
+
// vs academic vs current-events); without this it always used academic
|
|
277
|
+
// variations, which poisoned commercial/comparative searches.
|
|
278
|
+
researchApproach: params.researchApproach
|
|
275
279
|
};
|
|
276
280
|
|
|
277
281
|
switch (params.researchApproach) {
|