crawlforge-mcp-server 4.2.12 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/server.js +138 -20
- package/src/constants/config.js +5 -0
- package/src/core/ActionExecutor.js +13 -1
- package/src/core/ChangeTracker.js +8 -5
- package/src/core/LLMsTxtAnalyzer.js +71 -47
- package/src/core/LocalizationManager.js +7 -4
- package/src/core/ResearchOrchestrator.js +10 -6
- package/src/core/StealthBrowserManager.js +52 -13
- package/src/core/analysis/ContentAnalyzer.js +2 -2
- package/src/core/crawlers/BFSCrawler.js +23 -12
- package/src/core/processing/ContentProcessor.js +19 -3
- package/src/core/processing/PDFProcessor.js +72 -23
- package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
- package/src/tools/advanced/batchScrape/index.js +3 -1
- package/src/tools/advanced/batchScrape/reporter.js +5 -1
- package/src/tools/advanced/batchScrape/worker.js +6 -1
- package/src/tools/basic/_fetch.js +78 -5
- package/src/tools/basic/extractLinks.js +1 -1
- package/src/tools/basic/extractMetadata.js +65 -1
- package/src/tools/basic/extractText.js +61 -5
- package/src/tools/basic/scrapeStructured.js +48 -10
- package/src/tools/crawl/crawlDeep.js +13 -5
- package/src/tools/crawl/mapSite.js +24 -51
- package/src/tools/extract/analyzeContent.js +11 -6
- package/src/tools/extract/extractContent.js +23 -5
- package/src/tools/extract/extractStructured.js +65 -16
- package/src/tools/extract/extractWithLlm.js +192 -11
- package/src/tools/extract/listOllamaModels.js +19 -8
- package/src/tools/extract/processDocument.js +10 -4
- package/src/tools/extract/summarizeContent.js +58 -1
- package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
- package/src/tools/research/deepResearch.js +43 -4
- package/src/tools/search/providers/searxng.js +2 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
- package/src/tools/search/ranking/ResultRanker.js +13 -4
- package/src/tools/search/searchWeb.js +5 -5
- package/src/tools/templates/TemplateRegistry.js +3 -2
- package/src/tools/tracking/trackChanges/differ.js +33 -1
- package/src/utils/htmlToMarkdown.js +5 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlforge-mcp-server",
|
|
3
|
-
"version": "4.
|
|
3
|
+
"version": "4.5.0",
|
|
4
4
|
"description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
|
|
5
5
|
"main": "server.js",
|
|
6
6
|
"bin": {
|
|
@@ -113,6 +113,7 @@
|
|
|
113
113
|
"playwright": "^1.54.2",
|
|
114
114
|
"robots-parser": "^3.0.1",
|
|
115
115
|
"turndown": "^7.2.4",
|
|
116
|
+
"turndown-plugin-gfm": "^1.0.2",
|
|
116
117
|
"undici": "^7.24.0",
|
|
117
118
|
"winston": "^3.11.0",
|
|
118
119
|
"zod": "^3.23.8"
|
package/server.js
CHANGED
|
@@ -96,8 +96,8 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
|
|
|
96
96
|
// Create the server
|
|
97
97
|
const server = new McpServer({
|
|
98
98
|
name: "crawlforge",
|
|
99
|
-
version: "4.
|
|
100
|
-
description: "Production-ready MCP server with
|
|
99
|
+
version: "4.5.0",
|
|
100
|
+
description: "Production-ready MCP server with 24 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
|
|
101
101
|
homepage: "https://www.crawlforge.dev",
|
|
102
102
|
icon: "https://www.crawlforge.dev/icon.png"
|
|
103
103
|
});
|
|
@@ -299,7 +299,8 @@ server.registerTool("scrape_structured", {
|
|
|
299
299
|
annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
300
300
|
inputSchema: {
|
|
301
301
|
url: z.string().url().describe("The URL to scrape"),
|
|
302
|
-
selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors")
|
|
302
|
+
selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors. Append @attr to extract an attribute instead of text (e.g. \"a.link@href\", \"img@src\")"),
|
|
303
|
+
max_results: z.number().int().min(1).optional().describe("Maximum number of matches to return per field when a selector matches multiple elements")
|
|
303
304
|
}
|
|
304
305
|
}, withAuth("scrape_structured", scrapeStructuredHandler));
|
|
305
306
|
|
|
@@ -315,14 +316,50 @@ server.registerTool("search_web", {
|
|
|
315
316
|
safe_search: z.boolean().optional().describe("Enable safe search filtering"),
|
|
316
317
|
time_range: z.enum(["day", "week", "month", "year", "all"]).optional().describe("Filter results by time range"),
|
|
317
318
|
site: z.string().optional().describe("Limit results to a specific domain"),
|
|
318
|
-
file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')")
|
|
319
|
+
file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')"),
|
|
320
|
+
provider: z.enum(["crawlforge", "searxng"]).optional().describe("Search backend to use"),
|
|
321
|
+
expand_query: z.boolean().optional().describe("Expand the query with synonyms/stemming/etc."),
|
|
322
|
+
expansion_options: z.object({
|
|
323
|
+
enableSynonyms: z.boolean().optional(),
|
|
324
|
+
enableSpellCheck: z.boolean().optional(),
|
|
325
|
+
enableStemming: z.boolean().optional(),
|
|
326
|
+
enablePhraseDetection: z.boolean().optional(),
|
|
327
|
+
enableBooleanOperators: z.boolean().optional(),
|
|
328
|
+
maxExpansions: z.number().min(1).max(10).optional()
|
|
329
|
+
}).optional().describe("Query-expansion tuning"),
|
|
330
|
+
enable_ranking: z.boolean().optional().describe("Re-rank results (BM25 + signals)"),
|
|
331
|
+
ranking_weights: z.object({
|
|
332
|
+
bm25: z.number().min(0).max(1).optional(),
|
|
333
|
+
semantic: z.number().min(0).max(1).optional(),
|
|
334
|
+
authority: z.number().min(0).max(1).optional(),
|
|
335
|
+
freshness: z.number().min(0).max(1).optional()
|
|
336
|
+
}).optional().describe("Relative weights for ranking signals"),
|
|
337
|
+
enable_deduplication: z.boolean().optional().describe("Remove near-duplicate results"),
|
|
338
|
+
deduplication_thresholds: z.object({
|
|
339
|
+
url: z.number().min(0).max(1).optional(),
|
|
340
|
+
title: z.number().min(0).max(1).optional(),
|
|
341
|
+
content: z.number().min(0).max(1).optional(),
|
|
342
|
+
combined: z.number().min(0).max(1).optional()
|
|
343
|
+
}).optional().describe("Similarity thresholds for dedup"),
|
|
344
|
+
include_ranking_details: z.boolean().optional().describe("Include per-result ranking breakdown"),
|
|
345
|
+
include_deduplication_details: z.boolean().optional().describe("Include dedup decision details"),
|
|
346
|
+
localization: z.object({
|
|
347
|
+
countryCode: z.string().length(2).optional(),
|
|
348
|
+
language: z.string().optional(),
|
|
349
|
+
timezone: z.string().optional(),
|
|
350
|
+
enableGeoTargeting: z.boolean().optional(),
|
|
351
|
+
customLocation: z.object({
|
|
352
|
+
latitude: z.number().min(-90).max(90),
|
|
353
|
+
longitude: z.number().min(-180).max(180)
|
|
354
|
+
}).optional()
|
|
355
|
+
}).optional().describe("Geo/locale targeting for results")
|
|
319
356
|
}
|
|
320
|
-
}, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
|
|
357
|
+
}, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type, provider, expand_query, expansion_options, enable_ranking, ranking_weights, enable_deduplication, deduplication_thresholds, include_ranking_details, include_deduplication_details, localization }) => {
|
|
321
358
|
try {
|
|
322
359
|
if (!query) {
|
|
323
360
|
return { content: [{ type: "text", text: "Query parameter is required" }], isError: true };
|
|
324
361
|
}
|
|
325
|
-
const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type });
|
|
362
|
+
const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type, provider, expand_query, expansion_options, enable_ranking, ranking_weights, enable_deduplication, deduplication_thresholds, include_ranking_details, include_deduplication_details, localization });
|
|
326
363
|
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
327
364
|
} catch (error) {
|
|
328
365
|
return { content: [{ type: "text", text: `Search failed: ${error.message}` }], isError: true };
|
|
@@ -342,14 +379,38 @@ server.registerTool("crawl_deep", {
|
|
|
342
379
|
follow_external: z.boolean().optional().describe("Follow links to external domains"),
|
|
343
380
|
respect_robots: z.boolean().optional().describe("Respect robots.txt directives"),
|
|
344
381
|
extract_content: z.boolean().optional().describe("Extract page content during crawl"),
|
|
345
|
-
|
|
382
|
+
content_max_length: z.number().min(1).max(100000).optional().describe("Maximum characters of page content to include per page (default 500); sets a truncated flag when trimmed"),
|
|
383
|
+
concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests"),
|
|
384
|
+
enable_link_analysis: z.boolean().optional().describe("Compute PageRank/link-graph analysis over crawled pages"),
|
|
385
|
+
link_analysis_options: z.object({
|
|
386
|
+
dampingFactor: z.number().min(0).max(1).optional(),
|
|
387
|
+
maxIterations: z.number().min(1).max(1000).optional(),
|
|
388
|
+
enableCaching: z.boolean().optional()
|
|
389
|
+
}).optional().describe("PageRank tuning options"),
|
|
390
|
+
domain_filter: z.object({
|
|
391
|
+
whitelist: z.array(z.any()).optional(),
|
|
392
|
+
blacklist: z.array(z.any()).optional(),
|
|
393
|
+
domain_rules: z.record(z.any()).optional()
|
|
394
|
+
}).optional().describe("Per-domain allow/deny lists and crawl rules"),
|
|
395
|
+
import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config"),
|
|
396
|
+
session: z.object({
|
|
397
|
+
enabled: z.boolean(),
|
|
398
|
+
persistCookies: z.boolean().optional(),
|
|
399
|
+
headers: z.record(z.string()).optional(),
|
|
400
|
+
initialRequest: z.object({
|
|
401
|
+
url: z.string().url(),
|
|
402
|
+
method: z.string().optional(),
|
|
403
|
+
headers: z.record(z.string()).optional(),
|
|
404
|
+
body: z.string().optional()
|
|
405
|
+
}).optional()
|
|
406
|
+
}).optional().describe("Shared cookie-jar/session for login-then-crawl workflows")
|
|
346
407
|
}
|
|
347
|
-
}, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
|
|
408
|
+
}, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, content_max_length, concurrency, enable_link_analysis, link_analysis_options, domain_filter, import_filter_config, session }) => {
|
|
348
409
|
try {
|
|
349
410
|
if (!url) {
|
|
350
411
|
return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
|
|
351
412
|
}
|
|
352
|
-
const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency });
|
|
413
|
+
const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, content_max_length, concurrency, enable_link_analysis, link_analysis_options, domain_filter, import_filter_config, session });
|
|
353
414
|
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
354
415
|
} catch (error) {
|
|
355
416
|
return { content: [{ type: "text", text: `Crawl failed: ${error.message}` }], isError: true };
|
|
@@ -365,14 +426,21 @@ server.registerTool("map_site", {
|
|
|
365
426
|
include_sitemap: z.boolean().optional().describe("Include sitemap.xml data in results"),
|
|
366
427
|
max_urls: z.number().min(1).max(10000).optional().describe("Maximum number of URLs to discover"),
|
|
367
428
|
group_by_path: z.boolean().optional().describe("Group URLs by path segments"),
|
|
368
|
-
include_metadata: z.boolean().optional().describe("Include page metadata for each URL")
|
|
429
|
+
include_metadata: z.boolean().optional().describe("Include page metadata for each URL"),
|
|
430
|
+
domain_filter: z.object({
|
|
431
|
+
whitelist: z.array(z.string()).optional(),
|
|
432
|
+
blacklist: z.array(z.string()).optional(),
|
|
433
|
+
include_patterns: z.array(z.string()).optional(),
|
|
434
|
+
exclude_patterns: z.array(z.string()).optional()
|
|
435
|
+
}).optional().describe("Per-domain allow/deny lists and URL include/exclude patterns"),
|
|
436
|
+
import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config")
|
|
369
437
|
}
|
|
370
|
-
}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
|
|
438
|
+
}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config }) => {
|
|
371
439
|
try {
|
|
372
440
|
if (!url) {
|
|
373
441
|
return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
|
|
374
442
|
}
|
|
375
|
-
const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata });
|
|
443
|
+
const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config });
|
|
376
444
|
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
377
445
|
} catch (error) {
|
|
378
446
|
return { content: [{ type: "text", text: `Site mapping failed: ${error.message}` }], isError: true };
|
|
@@ -406,7 +474,9 @@ server.registerTool("process_document", {
|
|
|
406
474
|
inputSchema: {
|
|
407
475
|
source: z.string().describe("Document source - URL or file path"),
|
|
408
476
|
sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional().describe("Type of document source"),
|
|
409
|
-
|
|
477
|
+
// C3: passthrough so granular options (maxPages, pageRange:{start,end},
|
|
478
|
+
// extractText, outputFormat, etc.) reach the tool instead of being stripped.
|
|
479
|
+
options: z.object({}).passthrough().optional().describe("Additional processing options (maxPages, pageRange:{start,end}, extractText, extractMetadata, password, outputFormat, ...)")
|
|
410
480
|
}
|
|
411
481
|
}, withAuth("process_document", async ({ source, sourceType, options }) => {
|
|
412
482
|
try {
|
|
@@ -572,6 +642,27 @@ server.registerTool("batch_scrape", {
|
|
|
572
642
|
}
|
|
573
643
|
}));
|
|
574
644
|
|
|
645
|
+
// Tool: get_batch_results — C3: retrieve paginated results for a completed batch
|
|
646
|
+
server.registerTool("get_batch_results", {
|
|
647
|
+
description: "Retrieve paginated results for a completed or in-progress batch_scrape job. Use the batchId returned by batch_scrape. Example: get_batch_results({batchId: \"batch_1234567890_abc\", page: 2, pageSize: 25})",
|
|
648
|
+
annotations: { title: "Get Batch Results", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
|
|
649
|
+
inputSchema: {
|
|
650
|
+
batchId: z.string().describe("The batch ID returned by batch_scrape"),
|
|
651
|
+
page: z.number().min(1).default(1).describe("Page number (1-based)"),
|
|
652
|
+
pageSize: z.number().min(1).max(100).default(25).describe("Number of results per page")
|
|
653
|
+
}
|
|
654
|
+
}, withAuth("get_batch_results", async ({ batchId, page = 1, pageSize = 25 }) => {
|
|
655
|
+
try {
|
|
656
|
+
if (!batchId) {
|
|
657
|
+
return { content: [{ type: "text", text: "batchId parameter is required" }], isError: true };
|
|
658
|
+
}
|
|
659
|
+
const result = await batchScrapeTool.getBatchResults(batchId, page, pageSize);
|
|
660
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
661
|
+
} catch (error) {
|
|
662
|
+
return { content: [{ type: "text", text: `get_batch_results failed: ${error.message}` }], isError: true };
|
|
663
|
+
}
|
|
664
|
+
}));
|
|
665
|
+
|
|
575
666
|
// Tool: scrape_with_actions
|
|
576
667
|
server.registerTool("scrape_with_actions", {
|
|
577
668
|
description: "Use this when you need to interact with a page before scraping — login, click buttons, fill forms, scroll, or wait for dynamic content to load. Use for SPAs, login-gated content, or multi-step flows. Screenshots from this tool are stored as crawlforge://screenshot/{actionId} resources. Example: scrape_with_actions({url: \"https://app.com/dashboard\", actions: [{type:\"click\",selector:\"#login\"},{type:\"type\",selector:\"#email\",text:\"user@a.com\"}]})",
|
|
@@ -586,8 +677,34 @@ server.registerTool("scrape_with_actions", {
|
|
|
586
677
|
script: z.string().optional(),
|
|
587
678
|
timeout: z.number().optional(),
|
|
588
679
|
description: z.string().optional(),
|
|
589
|
-
continueOnError: z.boolean().
|
|
590
|
-
retries: z.number().min(0).max(5).
|
|
680
|
+
continueOnError: z.boolean().optional(),
|
|
681
|
+
retries: z.number().min(0).max(5).optional(),
|
|
682
|
+
captureAfter: z.boolean().optional().describe("Capture page content after this action"),
|
|
683
|
+
// wait
|
|
684
|
+
duration: z.number().min(0).max(30000).optional().describe("wait: milliseconds to wait"),
|
|
685
|
+
condition: z.enum(['visible', 'hidden', 'enabled', 'disabled', 'stable']).optional().describe("wait: condition on selector"),
|
|
686
|
+
// click
|
|
687
|
+
button: z.enum(['left', 'right', 'middle']).optional().describe("click: mouse button"),
|
|
688
|
+
clickCount: z.number().min(1).max(3).optional().describe("click: number of clicks"),
|
|
689
|
+
delay: z.number().min(0).max(1000).optional().describe("click/type: delay in ms"),
|
|
690
|
+
force: z.boolean().optional().describe("click: bypass actionability checks"),
|
|
691
|
+
position: z.object({ x: z.number(), y: z.number() }).optional().describe("click: relative position"),
|
|
692
|
+
// type
|
|
693
|
+
clear: z.boolean().optional().describe("type: clear field before typing"),
|
|
694
|
+
// press
|
|
695
|
+
modifiers: z.array(z.enum(['Alt', 'Control', 'Meta', 'Shift'])).optional().describe("press: modifier keys"),
|
|
696
|
+
// scroll
|
|
697
|
+
direction: z.enum(['up', 'down', 'left', 'right']).optional().describe("scroll: direction"),
|
|
698
|
+
distance: z.number().min(0).optional().describe("scroll: pixels to scroll"),
|
|
699
|
+
smooth: z.boolean().optional().describe("scroll: smooth scrolling"),
|
|
700
|
+
toElement: z.string().optional().describe("scroll: selector to scroll to"),
|
|
701
|
+
// screenshot
|
|
702
|
+
fullPage: z.boolean().optional().describe("screenshot: capture full page"),
|
|
703
|
+
quality: z.number().min(0).max(100).optional().describe("screenshot: jpeg quality"),
|
|
704
|
+
format: z.enum(['png', 'jpeg']).optional().describe("screenshot: image format"),
|
|
705
|
+
// executeJavaScript
|
|
706
|
+
args: z.array(z.any()).optional().describe("executeJavaScript: arguments passed to the script"),
|
|
707
|
+
returnResult: z.boolean().optional().describe("executeJavaScript: return the script result")
|
|
591
708
|
})).min(1).max(20).describe("Browser actions to perform before scraping"),
|
|
592
709
|
formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']).describe("Output formats for scraped content"),
|
|
593
710
|
captureIntermediateStates: z.boolean().default(false).describe("Capture page state after each action"),
|
|
@@ -1012,8 +1129,9 @@ server.registerTool("localization", {
|
|
|
1012
1129
|
};
|
|
1013
1130
|
break;
|
|
1014
1131
|
case 'handle_geo_blocking':
|
|
1015
|
-
|
|
1016
|
-
|
|
1132
|
+
case 'detect_geo_blocking':
|
|
1133
|
+
if (!params.url || !params.response) throw new Error('url and response are required for detect_geo_blocking operation');
|
|
1134
|
+
result = await localizationManager.detectGeoBlocking(params.url, params.response);
|
|
1017
1135
|
break;
|
|
1018
1136
|
case 'auto_detect':
|
|
1019
1137
|
if (!params.content || !params.url) throw new Error('content and url are required for auto_detect operation');
|
|
@@ -1103,12 +1221,12 @@ async function runServer() {
|
|
|
1103
1221
|
"fetch_url", "extract_text", "extract_links", "extract_metadata", "scrape_structured",
|
|
1104
1222
|
"search_web", "crawl_deep", "map_site",
|
|
1105
1223
|
"extract_content", "process_document", "summarize_content", "analyze_content",
|
|
1106
|
-
"batch_scrape", "scrape_with_actions",
|
|
1224
|
+
"batch_scrape", "get_batch_results", "scrape_with_actions",
|
|
1107
1225
|
"deep_research", "track_changes", "generate_llms_txt",
|
|
1108
1226
|
"stealth_mode", "localization", "extract_structured", "extract_with_llm",
|
|
1109
|
-
"scrape_template" // D3.3
|
|
1227
|
+
"list_ollama_models", "scrape_template" // D3.3
|
|
1110
1228
|
];
|
|
1111
|
-
console.error(`Tools available (
|
|
1229
|
+
console.error(`Tools available (24): ${allTools.join(", ")}`);
|
|
1112
1230
|
|
|
1113
1231
|
// Start memory monitoring in development
|
|
1114
1232
|
if (config.server.nodeEnv === "development") {
|
package/src/constants/config.js
CHANGED
|
@@ -15,6 +15,11 @@ export const config = {
|
|
|
15
15
|
apiBaseUrl: resolveApiEndpoint(process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev')
|
|
16
16
|
},
|
|
17
17
|
|
|
18
|
+
// Fetch body-size cap
|
|
19
|
+
fetch: {
|
|
20
|
+
maxBodySize: parseInt(process.env.MAX_FETCH_BODY_SIZE || String(25 * 1024 * 1024)) // 25 MB
|
|
21
|
+
},
|
|
22
|
+
|
|
18
23
|
// Performance
|
|
19
24
|
performance: {
|
|
20
25
|
maxWorkers: parseInt(process.env.MAX_WORKERS || '10'),
|
|
@@ -213,7 +213,17 @@ export class ActionExecutor extends EventEmitter {
|
|
|
213
213
|
|
|
214
214
|
// Execute chain with potential retries
|
|
215
215
|
chainResult = await this.executeChainWithRetries(executionContext);
|
|
216
|
-
|
|
216
|
+
|
|
217
|
+
// Capture the LIVE post-action page state before the page is closed,
|
|
218
|
+
// so callers can extract final content reflecting all actions
|
|
219
|
+
// (instead of re-fetching the original URL).
|
|
220
|
+
try {
|
|
221
|
+
executionContext.finalHtml = await page.content();
|
|
222
|
+
executionContext.finalUrl = page.url();
|
|
223
|
+
} catch (captureErr) {
|
|
224
|
+
this.log('warn', 'Failed to capture final page content: ' + captureErr.message);
|
|
225
|
+
}
|
|
226
|
+
|
|
217
227
|
this.stats.successfulChains++;
|
|
218
228
|
executionContext.success = true;
|
|
219
229
|
|
|
@@ -268,6 +278,8 @@ export class ActionExecutor extends EventEmitter {
|
|
|
268
278
|
success: true,
|
|
269
279
|
chainId,
|
|
270
280
|
url,
|
|
281
|
+
finalUrl: executionContext.finalUrl || url,
|
|
282
|
+
finalHtml: executionContext.finalHtml,
|
|
271
283
|
executionTime: Date.now() - startTime,
|
|
272
284
|
results: executionContext.results,
|
|
273
285
|
screenshots: executionContext.screenshots,
|
|
@@ -173,12 +173,15 @@ export class ChangeTracker extends EventEmitter {
|
|
|
173
173
|
*/
|
|
174
174
|
async compareWithBaseline(url, currentContent, options = {}) {
|
|
175
175
|
const startTime = Date.now();
|
|
176
|
-
|
|
176
|
+
|
|
177
|
+
// Expected no-baseline case: return a clean error WITHOUT emitting an
|
|
178
|
+
// unhandled 'error' event (which would crash callers with no 'error' listener).
|
|
179
|
+
if (!this.snapshots.has(url)) {
|
|
180
|
+
throw new Error(`No baseline found for ${url} — run create_baseline first`);
|
|
181
|
+
}
|
|
182
|
+
|
|
177
183
|
try {
|
|
178
|
-
|
|
179
|
-
throw new Error(`No baseline found for URL: ${url}`);
|
|
180
|
-
}
|
|
181
|
-
|
|
184
|
+
|
|
182
185
|
const snapshots = this.snapshots.get(url);
|
|
183
186
|
const baseline = snapshots[snapshots.length - 1]; // Get latest baseline
|
|
184
187
|
|
|
@@ -28,7 +28,10 @@ export class LLMsTxtAnalyzer {
|
|
|
28
28
|
respectRobots: options.respectRobots !== false,
|
|
29
29
|
detectAPIs: options.detectAPIs !== false,
|
|
30
30
|
analyzeContent: options.analyzeContent !== false,
|
|
31
|
-
|
|
31
|
+
// C1: intrusive probing is now opt-in (default false) to avoid hammering
|
|
32
|
+
// security-sensitive and rate-probe paths on every generation run.
|
|
33
|
+
checkSecurity: options.checkSecurity === true,
|
|
34
|
+
probeRateLimit: options.probeRateLimit === true,
|
|
32
35
|
...options
|
|
33
36
|
};
|
|
34
37
|
|
|
@@ -70,26 +73,31 @@ export class LLMsTxtAnalyzer {
|
|
|
70
73
|
analysisOptions: { ...this.options, ...options }
|
|
71
74
|
};
|
|
72
75
|
|
|
73
|
-
// Phase 1: Site Structure Analysis
|
|
76
|
+
// Phase 1: Site Structure Analysis (must run first — subsequent phases
|
|
77
|
+
// depend on the URL list it produces)
|
|
74
78
|
await this.analyzeSiteStructure(url, options);
|
|
75
79
|
|
|
76
|
-
//
|
|
80
|
+
// Phases 2-5 run in parallel where they are independent of each other.
|
|
81
|
+
// detectAPIEndpoints and analyzeSecurity each fire a bounded set of probe
|
|
82
|
+
// fetches (capped at PROBE_CONCURRENCY concurrent requests per phase).
|
|
83
|
+
// analyzeRateLimiting is only executed when the caller opts in via
|
|
84
|
+
// probeRateLimit:true — its 5 sequential requests are intrusive.
|
|
85
|
+
const parallelTasks = [];
|
|
86
|
+
|
|
77
87
|
if (this.options.detectAPIs) {
|
|
78
|
-
|
|
88
|
+
parallelTasks.push(this.detectAPIEndpoints(url));
|
|
79
89
|
}
|
|
80
|
-
|
|
81
|
-
// Phase 3: Content Classification
|
|
82
90
|
if (this.options.analyzeContent) {
|
|
83
|
-
|
|
91
|
+
parallelTasks.push(this.classifyContent());
|
|
84
92
|
}
|
|
85
|
-
|
|
86
|
-
// Phase 4: Security Analysis
|
|
87
93
|
if (this.options.checkSecurity) {
|
|
88
|
-
|
|
94
|
+
parallelTasks.push(this.analyzeSecurity(url));
|
|
95
|
+
}
|
|
96
|
+
if (this.options.probeRateLimit) {
|
|
97
|
+
parallelTasks.push(this.analyzeRateLimiting(url));
|
|
89
98
|
}
|
|
90
99
|
|
|
91
|
-
|
|
92
|
-
await this.analyzeRateLimiting(url);
|
|
100
|
+
await Promise.all(parallelTasks);
|
|
93
101
|
|
|
94
102
|
// Phase 6: Generate Guidelines
|
|
95
103
|
await this.generateUsageGuidelines();
|
|
@@ -160,35 +168,43 @@ export class LLMsTxtAnalyzer {
|
|
|
160
168
|
|
|
161
169
|
/**
|
|
162
170
|
* Detect API endpoints and data sources
|
|
171
|
+
* C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
|
|
163
172
|
*/
|
|
164
173
|
async detectAPIEndpoints(baseUrl) {
|
|
165
174
|
logger.info('Detecting API endpoints...');
|
|
166
175
|
|
|
176
|
+
const PROBE_CONCURRENCY = 6;
|
|
177
|
+
|
|
167
178
|
try {
|
|
168
|
-
const apis = [];
|
|
169
179
|
const commonPaths = [
|
|
170
180
|
'/api', '/v1', '/v2', '/v3', '/rest', '/graphql',
|
|
171
181
|
'/data', '/feed', '/json', '/xml', '/rss',
|
|
172
182
|
'/.well-known', '/openapi', '/swagger'
|
|
173
183
|
];
|
|
174
184
|
|
|
175
|
-
//
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
const
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
185
|
+
// Run path probes in parallel batches
|
|
186
|
+
const apis = [];
|
|
187
|
+
for (let i = 0; i < commonPaths.length; i += PROBE_CONCURRENCY) {
|
|
188
|
+
const batch = commonPaths.slice(i, i + PROBE_CONCURRENCY);
|
|
189
|
+
const results = await Promise.allSettled(
|
|
190
|
+
batch.map(async (path) => {
|
|
191
|
+
const apiUrl = `${baseUrl}${path}`;
|
|
192
|
+
const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
|
|
193
|
+
if (response.ok) {
|
|
194
|
+
const contentType = response.headers.get('content-type') || '';
|
|
195
|
+
return {
|
|
196
|
+
url: apiUrl,
|
|
197
|
+
type: this.determineAPIType(apiUrl, contentType),
|
|
198
|
+
status: response.status,
|
|
199
|
+
contentType,
|
|
200
|
+
accessible: true
|
|
201
|
+
};
|
|
202
|
+
}
|
|
203
|
+
return null;
|
|
204
|
+
})
|
|
205
|
+
);
|
|
206
|
+
for (const r of results) {
|
|
207
|
+
if (r.status === 'fulfilled' && r.value) apis.push(r.value);
|
|
192
208
|
}
|
|
193
209
|
}
|
|
194
210
|
|
|
@@ -278,13 +294,14 @@ export class LLMsTxtAnalyzer {
|
|
|
278
294
|
|
|
279
295
|
/**
|
|
280
296
|
* Analyze security boundaries and sensitive areas
|
|
297
|
+
* C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
|
|
281
298
|
*/
|
|
282
299
|
async analyzeSecurity(baseUrl) {
|
|
283
300
|
logger.info('Analyzing security boundaries...');
|
|
284
301
|
|
|
285
|
-
|
|
286
|
-
const securityAreas = [];
|
|
302
|
+
const PROBE_CONCURRENCY = 6;
|
|
287
303
|
|
|
304
|
+
try {
|
|
288
305
|
// Check for common sensitive paths
|
|
289
306
|
const sensitivePaths = [
|
|
290
307
|
'/admin', '/administrator', '/wp-admin', '/cms',
|
|
@@ -294,21 +311,28 @@ export class LLMsTxtAnalyzer {
|
|
|
294
311
|
'/config', '/settings', '/env'
|
|
295
312
|
];
|
|
296
313
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
314
|
+
// Run path probes in parallel batches
|
|
315
|
+
const securityAreas = [];
|
|
316
|
+
for (let i = 0; i < sensitivePaths.length; i += PROBE_CONCURRENCY) {
|
|
317
|
+
const batch = sensitivePaths.slice(i, i + PROBE_CONCURRENCY);
|
|
318
|
+
const results = await Promise.allSettled(
|
|
319
|
+
batch.map(async (path) => {
|
|
320
|
+
const testUrl = `${baseUrl}${path}`;
|
|
321
|
+
const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
|
|
322
|
+
if (response.status === 200 || response.status === 302 || response.status === 401) {
|
|
323
|
+
return {
|
|
324
|
+
path,
|
|
325
|
+
url: testUrl,
|
|
326
|
+
status: response.status,
|
|
327
|
+
type: this.classifySecurityArea(path),
|
|
328
|
+
recommendation: 'restrict'
|
|
329
|
+
};
|
|
330
|
+
}
|
|
331
|
+
return null;
|
|
332
|
+
})
|
|
333
|
+
);
|
|
334
|
+
for (const r of results) {
|
|
335
|
+
if (r.status === 'fulfilled' && r.value) securityAreas.push(r.value);
|
|
312
336
|
}
|
|
313
337
|
}
|
|
314
338
|
|
|
@@ -499,12 +499,14 @@ export class LocalizationManager extends EventEmitter {
|
|
|
499
499
|
}
|
|
500
500
|
|
|
501
501
|
/**
|
|
502
|
-
* Detect
|
|
502
|
+
* Detect geo-blocked content and return suggestions.
|
|
503
|
+
* C3: renamed from handleGeoBlocking — no bypass is actually applied here;
|
|
504
|
+
* the returned bypassStrategies are recommendations only.
|
|
503
505
|
* @param {string} url - URL to check
|
|
504
506
|
* @param {Object} response - HTTP response object
|
|
505
|
-
* @returns {Object} -
|
|
507
|
+
* @returns {Object} - Detection result and bypass suggestions
|
|
506
508
|
*/
|
|
507
|
-
async
|
|
509
|
+
async detectGeoBlocking(url, response) {
|
|
508
510
|
const geoBlockingIndicators = [
|
|
509
511
|
/not available in your country/i,
|
|
510
512
|
/access denied/i,
|
|
@@ -1386,8 +1388,9 @@ export class LocalizationManager extends EventEmitter {
|
|
|
1386
1388
|
}
|
|
1387
1389
|
|
|
1388
1390
|
// Phone number pattern analysis
|
|
1391
|
+
// C3: fix US pattern — was using \\d (literal backslash-d) instead of \d
|
|
1389
1392
|
const phonePatterns = {
|
|
1390
|
-
'US': /\+1[\s.-]?\(
|
|
1393
|
+
'US': /\+1[\s.-]?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/,
|
|
1391
1394
|
'GB': /\+44[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
|
|
1392
1395
|
'DE': /\+49[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
|
|
1393
1396
|
'FR': /\+33[\s.-]?\d{1}[\s.-]?\d{8}/
|
|
@@ -519,14 +519,18 @@ export class ResearchOrchestrator extends EventEmitter {
|
|
|
519
519
|
}
|
|
520
520
|
}
|
|
521
521
|
|
|
522
|
-
|
|
522
|
+
// Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
|
|
523
|
+
const contentText = contentData && contentData.content
|
|
524
|
+
? (typeof contentData.content === 'string'
|
|
525
|
+
? contentData.content
|
|
526
|
+
: (contentData.content.text || ''))
|
|
527
|
+
: '';
|
|
528
|
+
|
|
529
|
+
// Only count and enhance sources that actually produced non-empty content.
|
|
530
|
+
// Skip failed extractions and empty {text:""} results.
|
|
531
|
+
if (contentData && contentData.success !== false && contentText.trim().length > 0) {
|
|
523
532
|
this.metrics.contentExtracted++;
|
|
524
533
|
|
|
525
|
-
// Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
|
|
526
|
-
const contentText = typeof contentData.content === 'string'
|
|
527
|
-
? contentData.content
|
|
528
|
-
: (contentData.content.text || JSON.stringify(contentData.content));
|
|
529
|
-
|
|
530
534
|
// Enhance source with extracted content
|
|
531
535
|
let enhancedSource = {
|
|
532
536
|
...source,
|