crawlforge-mcp-server 4.7.0 → 4.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/package.json +1 -1
- package/server.js +14 -1
- package/src/core/ActionExecutor.js +30 -6
- package/src/core/LLMsTxtAnalyzer.js +10 -1
- package/src/core/ResearchOrchestrator.js +5 -3
- package/src/resources/ResourceRegistry.js +3 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +7 -0
- package/src/tools/extract/extractStructured.js +43 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +3 -1
- package/src/tools/research/deepResearch.js +4 -1
package/README.md
CHANGED
|
@@ -199,7 +199,7 @@ For the full canonical capabilities reference (all tools, CLI commands, stealth
|
|
|
199
199
|
| **Business** ($399) | 250,000 | Large scale operations |
|
|
200
200
|
|
|
201
201
|
**All plans include:**
|
|
202
|
-
- Access to all 26 tools
|
|
202
|
+
- Access to all 26 tools
|
|
203
203
|
- Credits never expire and roll over month-to-month
|
|
204
204
|
- API access and webhook notifications
|
|
205
205
|
|
|
@@ -298,7 +298,7 @@ Once configured, use these tools in your AI assistant:
|
|
|
298
298
|
|
|
299
299
|
## 🔒 Security & Privacy
|
|
300
300
|
|
|
301
|
-
- **Secure Authentication**: API keys required for all metered
|
|
301
|
+
- **Secure Authentication**: API keys required for all metered tools
|
|
302
302
|
- **Local Storage**: API keys stored securely at `~/.crawlforge/config.json`
|
|
303
303
|
- **HTTPS Only**: All connections use encrypted HTTPS
|
|
304
304
|
- **No Data Retention**: We don't store scraped data, only usage logs
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "4.7.0",
|
|
3
|
+
"version": "4.7.2",
|
|
4
4
|
"mcpName": "io.github.mysleekdesigns/crawlforge-mcp-server",
|
|
5
5
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 26 web scraping, crawling, deep-research, and autonomous-extraction tools. Returns clean Markdown and structured JSON for Claude, Cursor, and any MCP client. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. Includes a unified multi-format scrape tool, an autonomous agent, pre-built site templates, and Camoufox stealth browsing.",
|
|
6
6
|
"main": "server.js",
|
package/server.js
CHANGED
|
@@ -89,7 +89,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
|
|
|
89
89
|
// Create the server
|
|
90
90
|
const server = new McpServer({
|
|
91
91
|
name: "crawlforge",
|
|
92
|
-
version: "4.7.0",
|
|
92
|
+
version: "4.7.2",
|
|
93
93
|
description: "Production-ready MCP server with 26 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, local-LLM extraction via Ollama, unified multi-format scrape, and autonomous agent tool.",
|
|
94
94
|
homepage: "https://www.crawlforge.dev",
|
|
95
95
|
icon: "https://www.crawlforge.dev/icon.png"
|
|
@@ -736,6 +736,19 @@ server.registerTool("scrape_with_actions", {
|
|
|
736
736
|
}, withAuth("scrape_with_actions", async (params) => {
|
|
737
737
|
try {
|
|
738
738
|
const result = await scrapeWithActionsTool.execute(params);
|
|
739
|
+
|
|
740
|
+
// Publish captured screenshots as crawlforge://screenshot/{actionId}
|
|
741
|
+
// resources (the documented contract) and annotate each with its URI.
|
|
742
|
+
if (Array.isArray(result.screenshots)) {
|
|
743
|
+
result.screenshots = result.screenshots.map((shot) => {
|
|
744
|
+
if (shot?.actionId && shot?.data) {
|
|
745
|
+
resourceRegistry.storeScreenshot(shot.actionId, shot.data);
|
|
746
|
+
return { ...shot, resourceUri: `crawlforge://screenshot/${shot.actionId}` };
|
|
747
|
+
}
|
|
748
|
+
return shot;
|
|
749
|
+
});
|
|
750
|
+
}
|
|
751
|
+
|
|
739
752
|
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
740
753
|
} catch (error) {
|
|
741
754
|
return { content: [{ type: "text", text: `Scrape with actions failed: ${error.message}` }], isError: true };
|
|
package/src/core/ActionExecutor.js
CHANGED
|
@@ -23,8 +23,8 @@ const WaitActionSchema = BaseActionSchema.extend({
|
|
|
23
23
|
selector: z.string().optional(),
|
|
24
24
|
condition: z.enum(['visible', 'hidden', 'enabled', 'disabled', 'stable']).optional(),
|
|
25
25
|
text: z.string().optional()
|
|
26
|
-
}).refine(data => data.duration || data.milliseconds || data.selector || data.text, {
|
|
27
|
-
message: 'Wait action requires duration/milliseconds, selector, or text'
|
|
26
|
+
}).refine(data => data.duration || data.milliseconds || data.timeout || data.selector || data.text, {
|
|
27
|
+
message: 'Wait action requires duration/milliseconds/timeout, selector, or text'
|
|
28
28
|
});
|
|
29
29
|
|
|
30
30
|
const ClickActionSchema = BaseActionSchema.extend({
|
|
@@ -329,6 +329,18 @@ export class ActionExecutor extends EventEmitter {
|
|
|
329
329
|
executionContext.results.push(actionResult);
|
|
330
330
|
this.stats.totalActions++;
|
|
331
331
|
|
|
332
|
+
// Collect screenshots produced by successful screenshot actions so
|
|
333
|
+
// they surface in the tool result (not just error screenshots).
|
|
334
|
+
if (actionResult.success && action.type === 'screenshot' && actionResult.result?.data) {
|
|
335
|
+
executionContext.screenshots.push({
|
|
336
|
+
actionId: actionResult.id,
|
|
337
|
+
data: actionResult.result.data,
|
|
338
|
+
format: actionResult.result.format,
|
|
339
|
+
fullPage: actionResult.result.fullPage,
|
|
340
|
+
timestamp: actionResult.timestamp
|
|
341
|
+
});
|
|
342
|
+
}
|
|
343
|
+
|
|
332
344
|
if (actionResult.success) {
|
|
333
345
|
this.stats.successfulActions++;
|
|
334
346
|
} else {
|
|
@@ -382,7 +394,16 @@ export class ActionExecutor extends EventEmitter {
|
|
|
382
394
|
this.emit('actionStarted', { actionId, action, chainId: executionContext.id });
|
|
383
395
|
|
|
384
396
|
let result;
|
|
385
|
-
|
|
397
|
+
let timeout = action.timeout || this.defaultTimeout;
|
|
398
|
+
|
|
399
|
+
// A `wait` action that uses `timeout` as its pause duration (no
|
|
400
|
+
// duration/milliseconds/selector/text) must not also use that same value
|
|
401
|
+
// as its abort deadline, or the abort would race the wait. Give headroom.
|
|
402
|
+
if (action.type === 'wait' &&
|
|
403
|
+
!action.duration && !action.milliseconds && !action.selector && !action.text &&
|
|
404
|
+
action.timeout) {
|
|
405
|
+
timeout = Math.max(this.defaultTimeout, action.timeout + 5000);
|
|
406
|
+
}
|
|
386
407
|
|
|
387
408
|
// Execute based on action type with timeout
|
|
388
409
|
const executionPromise = this.executeActionByType(page, action);
|
|
@@ -467,8 +488,11 @@ export class ActionExecutor extends EventEmitter {
|
|
|
467
488
|
* @returns {Promise<Object>} Wait result
|
|
468
489
|
*/
|
|
469
490
|
async executeWaitAction(page, action) {
|
|
470
|
-
// Handle
|
|
471
|
-
|
|
491
|
+
// Handle 'duration'/'milliseconds' (and 'timeout' as a pause duration only
|
|
492
|
+
// when no selector/text is given — selector/text waits use 'timeout' as
|
|
493
|
+
// their abort deadline instead).
|
|
494
|
+
const waitTime = action.duration || action.milliseconds ||
|
|
495
|
+
(!action.selector && !action.text ? action.timeout : undefined);
|
|
472
496
|
if (waitTime) {
|
|
473
497
|
await this.delay(waitTime);
|
|
474
498
|
return { waited: waitTime };
|
|
@@ -492,7 +516,7 @@ export class ActionExecutor extends EventEmitter {
|
|
|
492
516
|
return { text: action.text };
|
|
493
517
|
}
|
|
494
518
|
|
|
495
|
-
throw new Error('Wait action requires duration, selector, or text');
|
|
519
|
+
throw new Error('Wait action requires duration/milliseconds/timeout, selector, or text');
|
|
496
520
|
}
|
|
497
521
|
|
|
498
522
|
/**
|
|
package/src/core/LLMsTxtAnalyzer.js
CHANGED
|
@@ -50,7 +50,16 @@ export class LLMsTxtAnalyzer {
|
|
|
50
50
|
apis: [],
|
|
51
51
|
contentTypes: {},
|
|
52
52
|
securityAreas: [],
|
|
53
|
-
|
|
53
|
+
// Conservative defaults so output never renders `undefined` when live
|
|
54
|
+
// rate-limit probing is skipped (analyzeRateLimiting only runs with
|
|
55
|
+
// probeRateLimit:true). Overwritten with measured values when probed.
|
|
56
|
+
rateLimit: {
|
|
57
|
+
recommendedDelay: 1000,
|
|
58
|
+
maxConcurrency: 5,
|
|
59
|
+
recommendedRPM: 30,
|
|
60
|
+
reasoning: 'Conservative defaults applied; live rate-limit probing was not performed (pass probeRateLimit:true to measure actual response times).',
|
|
61
|
+
averageResponseTime: null
|
|
62
|
+
},
|
|
54
63
|
guidelines: {},
|
|
55
64
|
metadata: {},
|
|
56
65
|
errors: []
|
|
package/src/core/ResearchOrchestrator.js
CHANGED
|
@@ -32,6 +32,7 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
32
32
|
concurrency = 5,
|
|
33
33
|
enableSourceVerification = true,
|
|
34
34
|
enableConflictDetection = true,
|
|
35
|
+
credibilityThreshold = 0.3,
|
|
35
36
|
cacheEnabled = true,
|
|
36
37
|
cacheTTL = 1800000, // 30 minutes
|
|
37
38
|
researchApproach = 'broad',
|
|
@@ -61,6 +62,7 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
61
62
|
this.concurrency = Math.min(Math.max(1, concurrency), 20);
|
|
62
63
|
this.enableSourceVerification = enableSourceVerification;
|
|
63
64
|
this.enableConflictDetection = enableConflictDetection;
|
|
65
|
+
this.credibilityThreshold = Math.min(Math.max(0, credibilityThreshold), 1);
|
|
64
66
|
|
|
65
67
|
// Stealth fallback config + lazy state (browser launched only on first block)
|
|
66
68
|
this.enableStealthFallback = enableStealthFallback;
|
|
@@ -859,7 +861,7 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
859
861
|
}
|
|
860
862
|
|
|
861
863
|
// Only include sources that meet minimum credibility threshold
|
|
862
|
-
if (overallCredibility >=
|
|
864
|
+
if (overallCredibility >= this.credibilityThreshold) {
|
|
863
865
|
verifiedSources.push({
|
|
864
866
|
...source,
|
|
865
867
|
credibilityFactors,
|
|
@@ -1360,7 +1362,7 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
1360
1362
|
|
|
1361
1363
|
generateKeyFindings(claimGroups, sources) {
|
|
1362
1364
|
return claimGroups
|
|
1363
|
-
.filter(group => group.avgCredibility >=
|
|
1365
|
+
.filter(group => group.avgCredibility >= this.credibilityThreshold)
|
|
1364
1366
|
.sort((a, b) => b.consensusStrength - a.consensusStrength)
|
|
1365
1367
|
.slice(0, 10)
|
|
1366
1368
|
.map(group => ({
|
|
@@ -1373,7 +1375,7 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
1373
1375
|
|
|
1374
1376
|
compileSupportingEvidence(sources) {
|
|
1375
1377
|
return sources
|
|
1376
|
-
.filter(source => source.overallCredibility >=
|
|
1378
|
+
.filter(source => source.overallCredibility >= this.credibilityThreshold)
|
|
1377
1379
|
.map(source => ({
|
|
1378
1380
|
title: source.title,
|
|
1379
1381
|
url: source.link,
|
|
package/src/resources/ResourceRegistry.js
CHANGED
|
@@ -167,6 +167,9 @@ export class ResourceRegistry {
|
|
|
167
167
|
* @returns {{ contents: Array<{ uri: string, mimeType: string, text?: string, blob?: string }> }}
|
|
168
168
|
*/
|
|
169
169
|
async readResource(uri) {
|
|
170
|
+
// The MCP SDK hands the read callback a URL object, not a string; coerce so
|
|
171
|
+
// the sub-readers and parseResourceUri (which calls String#startsWith) work.
|
|
172
|
+
uri = typeof uri === 'string' ? uri : (uri?.href ?? String(uri));
|
|
170
173
|
const parsed = parseResourceUri(uri);
|
|
171
174
|
if (!parsed) {
|
|
172
175
|
throw new Error(`Unknown resource URI: ${uri}`);
|
|
package/src/tools/advanced/ScrapeWithActionsTool.js
CHANGED
|
@@ -619,6 +619,13 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
619
619
|
customSelectors: params.extractionOptions?.selectors
|
|
620
620
|
};
|
|
621
621
|
|
|
622
|
+
// extractContent only emits content.markdown when explicitly asked; honor
|
|
623
|
+
// a requested "markdown" format so generateFormats doesn't fall back to a
|
|
624
|
+
// "Content not available in markdown format" placeholder.
|
|
625
|
+
if (params.formats?.includes('markdown')) {
|
|
626
|
+
options.outputFormat = 'markdown';
|
|
627
|
+
}
|
|
628
|
+
|
|
622
629
|
// Prefer the post-action live page HTML captured during action execution.
|
|
623
630
|
// This ensures the final content reflects clicks/typing/navigation rather
|
|
624
631
|
// than re-fetching the original (pre-action) URL.
|
|
package/src/tools/extract/extractStructured.js
CHANGED
|
@@ -15,6 +15,22 @@ const _pkg = _require('../../../package.json');
|
|
|
15
15
|
const CRAWLFORGE_UA = `CrawlForge/${_pkg.version} (+https://crawlforge.dev)`;
|
|
16
16
|
import { fetchAndParse } from './_fetchAndParse.js';
|
|
17
17
|
|
|
18
|
+
// Semantic element selectors for well-known field names, tried as a last
|
|
19
|
+
// resort in the CSS fallback so common fields (e.g. "title") still resolve when
|
|
20
|
+
// no LLM provider and no selectorHints are available. Element/text selectors
|
|
21
|
+
// only — meta tags are already handled separately above.
|
|
22
|
+
const SEMANTIC_FIELD_SELECTORS = {
|
|
23
|
+
title: ['h1', 'title'],
|
|
24
|
+
name: ['h1', 'title'],
|
|
25
|
+
heading: ['h1', 'h2'],
|
|
26
|
+
headline: ['h1', 'h2'],
|
|
27
|
+
description: ['article p', 'main p', '.description', 'p'],
|
|
28
|
+
summary: ['article p', 'main p', 'p'],
|
|
29
|
+
author: ['[rel="author"]', '.author', '.byline'],
|
|
30
|
+
date: ['time', '.date'],
|
|
31
|
+
published: ['time', '.published', '.date']
|
|
32
|
+
};
|
|
33
|
+
|
|
18
34
|
const ExtractStructuredSchema = z.object({
|
|
19
35
|
url: z.string().url(),
|
|
20
36
|
schema: z.object({
|
|
@@ -245,6 +261,33 @@ export class ExtractStructuredTool {
|
|
|
245
261
|
}
|
|
246
262
|
}
|
|
247
263
|
}
|
|
264
|
+
|
|
265
|
+
// Last resort: semantic element selectors for well-known field names
|
|
266
|
+
// (e.g. title -> <h1>/<title>) so common fields resolve without hints.
|
|
267
|
+
if (!(key in extracted)) {
|
|
268
|
+
const semanticSelectors = SEMANTIC_FIELD_SELECTORS[key.toLowerCase()];
|
|
269
|
+
if (semanticSelectors) {
|
|
270
|
+
for (const sel of semanticSelectors) {
|
|
271
|
+
const el = $(sel);
|
|
272
|
+
if (el.length === 0) continue;
|
|
273
|
+
if (isArrayField && el.length > 1) {
|
|
274
|
+
const values = el.map((_, item) => $(item).text().trim()).get().filter(Boolean);
|
|
275
|
+
if (values.length > 0) {
|
|
276
|
+
extracted[key] = values;
|
|
277
|
+
fieldsFound++;
|
|
278
|
+
break;
|
|
279
|
+
}
|
|
280
|
+
} else {
|
|
281
|
+
const rawValue = el.first().text().trim();
|
|
282
|
+
if (rawValue) {
|
|
283
|
+
extracted[key] = this._coerceValue(rawValue, fieldSchema);
|
|
284
|
+
fieldsFound++;
|
|
285
|
+
break;
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
}
|
|
248
291
|
}
|
|
249
292
|
|
|
250
293
|
if (fieldsFound === 0) {
|
|
package/src/tools/llmstxt/generateLLMsTxt.js
CHANGED
|
@@ -391,7 +391,9 @@ export class GenerateLLMsTxtTool {
|
|
|
391
391
|
lines.push('');
|
|
392
392
|
lines.push('### Technical Justification');
|
|
393
393
|
lines.push(`${analysis.rateLimit.reasoning}`);
|
|
394
|
-
|
|
394
|
+
if (analysis.rateLimit.averageResponseTime != null) {
|
|
395
|
+
lines.push(`Average response time: ${analysis.rateLimit.averageResponseTime}ms`);
|
|
396
|
+
}
|
|
395
397
|
lines.push('');
|
|
396
398
|
}
|
|
397
399
|
|
|
package/src/tools/research/deepResearch.js
CHANGED
|
@@ -272,6 +272,10 @@ export class DeepResearchTool {
|
|
|
272
272
|
maxUrls: params.maxUrls,
|
|
273
273
|
timeLimit: params.timeLimit,
|
|
274
274
|
concurrency: params.concurrency,
|
|
275
|
+
// Minimum credibility a source must clear in verifySourceCredibility.
|
|
276
|
+
// Must be on the orchestrator *constructor* config (not the
|
|
277
|
+
// conductResearch options) — that is the only place it is now read.
|
|
278
|
+
credibilityThreshold: params.credibilityThreshold,
|
|
275
279
|
// The orchestrator tunes its query expansion to the approach (commercial
|
|
276
280
|
// vs academic vs current-events); without this it always used academic
|
|
277
281
|
// variations, which poisoned commercial/comparative searches.
|
|
@@ -356,7 +360,6 @@ export class DeepResearchTool {
|
|
|
356
360
|
buildResearchOptions(params) {
|
|
357
361
|
return {
|
|
358
362
|
sourceTypes: params.sourceTypes,
|
|
359
|
-
credibilityThreshold: params.credibilityThreshold,
|
|
360
363
|
includeRecentOnly: params.includeRecentOnly,
|
|
361
364
|
queryExpansion: params.queryExpansion,
|
|
362
365
|
enableConflictDetection: params.enableConflictDetection,
|