crawlforge-mcp-server 4.2.11 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/package.json +2 -1
  2. package/server.js +152 -21
  3. package/src/constants/config.js +5 -0
  4. package/src/core/ActionExecutor.js +13 -1
  5. package/src/core/ChangeTracker.js +8 -5
  6. package/src/core/LLMsTxtAnalyzer.js +71 -47
  7. package/src/core/LocalizationManager.js +7 -4
  8. package/src/core/ResearchOrchestrator.js +10 -6
  9. package/src/core/StealthBrowserManager.js +111 -40
  10. package/src/core/analysis/ContentAnalyzer.js +2 -2
  11. package/src/core/crawlers/BFSCrawler.js +23 -12
  12. package/src/core/processing/ContentProcessor.js +19 -3
  13. package/src/core/processing/PDFProcessor.js +72 -23
  14. package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
  15. package/src/tools/advanced/batchScrape/index.js +3 -1
  16. package/src/tools/advanced/batchScrape/reporter.js +5 -1
  17. package/src/tools/advanced/batchScrape/worker.js +6 -1
  18. package/src/tools/basic/_fetch.js +78 -5
  19. package/src/tools/basic/extractLinks.js +1 -1
  20. package/src/tools/basic/extractMetadata.js +65 -1
  21. package/src/tools/basic/extractText.js +61 -5
  22. package/src/tools/basic/scrapeStructured.js +48 -10
  23. package/src/tools/crawl/crawlDeep.js +13 -5
  24. package/src/tools/crawl/mapSite.js +24 -51
  25. package/src/tools/extract/analyzeContent.js +11 -6
  26. package/src/tools/extract/extractContent.js +23 -5
  27. package/src/tools/extract/extractStructured.js +65 -16
  28. package/src/tools/extract/extractWithLlm.js +192 -11
  29. package/src/tools/extract/listOllamaModels.js +19 -8
  30. package/src/tools/extract/processDocument.js +10 -4
  31. package/src/tools/extract/summarizeContent.js +58 -1
  32. package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
  33. package/src/tools/research/deepResearch.js +43 -4
  34. package/src/tools/search/providers/searxng.js +2 -2
  35. package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
  36. package/src/tools/search/ranking/ResultRanker.js +13 -4
  37. package/src/tools/search/searchWeb.js +5 -5
  38. package/src/tools/templates/TemplateRegistry.js +3 -2
  39. package/src/tools/tracking/trackChanges/differ.js +33 -1
  40. package/src/utils/htmlToMarkdown.js +5 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "crawlforge-mcp-server",
3
- "version": "4.2.11",
3
+ "version": "4.5.0",
4
4
  "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
5
5
  "main": "server.js",
6
6
  "bin": {
@@ -113,6 +113,7 @@
113
113
  "playwright": "^1.54.2",
114
114
  "robots-parser": "^3.0.1",
115
115
  "turndown": "^7.2.4",
116
+ "turndown-plugin-gfm": "^1.0.2",
116
117
  "undici": "^7.24.0",
117
118
  "winston": "^3.11.0",
118
119
  "zod": "^3.23.8"
package/server.js CHANGED
@@ -96,8 +96,8 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
96
96
  // Create the server
97
97
  const server = new McpServer({
98
98
  name: "crawlforge",
99
- version: "4.2.6",
100
- description: "Production-ready MCP server with 23 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
99
+ version: "4.5.0",
100
+ description: "Production-ready MCP server with 24 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
101
101
  homepage: "https://www.crawlforge.dev",
102
102
  icon: "https://www.crawlforge.dev/icon.png"
103
103
  });
@@ -299,7 +299,8 @@ server.registerTool("scrape_structured", {
299
299
  annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
300
300
  inputSchema: {
301
301
  url: z.string().url().describe("The URL to scrape"),
302
- selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors")
302
+ selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors. Append @attr to extract an attribute instead of text (e.g. \"a.link@href\", \"img@src\")"),
303
+ max_results: z.number().int().min(1).optional().describe("Maximum number of matches to return per field when a selector matches multiple elements")
303
304
  }
304
305
  }, withAuth("scrape_structured", scrapeStructuredHandler));
305
306
 
@@ -315,14 +316,50 @@ server.registerTool("search_web", {
315
316
  safe_search: z.boolean().optional().describe("Enable safe search filtering"),
316
317
  time_range: z.enum(["day", "week", "month", "year", "all"]).optional().describe("Filter results by time range"),
317
318
  site: z.string().optional().describe("Limit results to a specific domain"),
318
- file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')")
319
+ file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')"),
320
+ provider: z.enum(["crawlforge", "searxng"]).optional().describe("Search backend to use"),
321
+ expand_query: z.boolean().optional().describe("Expand the query with synonyms/stemming/etc."),
322
+ expansion_options: z.object({
323
+ enableSynonyms: z.boolean().optional(),
324
+ enableSpellCheck: z.boolean().optional(),
325
+ enableStemming: z.boolean().optional(),
326
+ enablePhraseDetection: z.boolean().optional(),
327
+ enableBooleanOperators: z.boolean().optional(),
328
+ maxExpansions: z.number().min(1).max(10).optional()
329
+ }).optional().describe("Query-expansion tuning"),
330
+ enable_ranking: z.boolean().optional().describe("Re-rank results (BM25 + signals)"),
331
+ ranking_weights: z.object({
332
+ bm25: z.number().min(0).max(1).optional(),
333
+ semantic: z.number().min(0).max(1).optional(),
334
+ authority: z.number().min(0).max(1).optional(),
335
+ freshness: z.number().min(0).max(1).optional()
336
+ }).optional().describe("Relative weights for ranking signals"),
337
+ enable_deduplication: z.boolean().optional().describe("Remove near-duplicate results"),
338
+ deduplication_thresholds: z.object({
339
+ url: z.number().min(0).max(1).optional(),
340
+ title: z.number().min(0).max(1).optional(),
341
+ content: z.number().min(0).max(1).optional(),
342
+ combined: z.number().min(0).max(1).optional()
343
+ }).optional().describe("Similarity thresholds for dedup"),
344
+ include_ranking_details: z.boolean().optional().describe("Include per-result ranking breakdown"),
345
+ include_deduplication_details: z.boolean().optional().describe("Include dedup decision details"),
346
+ localization: z.object({
347
+ countryCode: z.string().length(2).optional(),
348
+ language: z.string().optional(),
349
+ timezone: z.string().optional(),
350
+ enableGeoTargeting: z.boolean().optional(),
351
+ customLocation: z.object({
352
+ latitude: z.number().min(-90).max(90),
353
+ longitude: z.number().min(-180).max(180)
354
+ }).optional()
355
+ }).optional().describe("Geo/locale targeting for results")
319
356
  }
320
- }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
357
+ }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type, provider, expand_query, expansion_options, enable_ranking, ranking_weights, enable_deduplication, deduplication_thresholds, include_ranking_details, include_deduplication_details, localization }) => {
321
358
  try {
322
359
  if (!query) {
323
360
  return { content: [{ type: "text", text: "Query parameter is required" }], isError: true };
324
361
  }
325
- const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type });
362
+ const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type, provider, expand_query, expansion_options, enable_ranking, ranking_weights, enable_deduplication, deduplication_thresholds, include_ranking_details, include_deduplication_details, localization });
326
363
  return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
327
364
  } catch (error) {
328
365
  return { content: [{ type: "text", text: `Search failed: ${error.message}` }], isError: true };
@@ -342,14 +379,38 @@ server.registerTool("crawl_deep", {
342
379
  follow_external: z.boolean().optional().describe("Follow links to external domains"),
343
380
  respect_robots: z.boolean().optional().describe("Respect robots.txt directives"),
344
381
  extract_content: z.boolean().optional().describe("Extract page content during crawl"),
345
- concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests")
382
+ content_max_length: z.number().min(1).max(100000).optional().describe("Maximum characters of page content to include per page (default 500); sets a truncated flag when trimmed"),
383
+ concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests"),
384
+ enable_link_analysis: z.boolean().optional().describe("Compute PageRank/link-graph analysis over crawled pages"),
385
+ link_analysis_options: z.object({
386
+ dampingFactor: z.number().min(0).max(1).optional(),
387
+ maxIterations: z.number().min(1).max(1000).optional(),
388
+ enableCaching: z.boolean().optional()
389
+ }).optional().describe("PageRank tuning options"),
390
+ domain_filter: z.object({
391
+ whitelist: z.array(z.any()).optional(),
392
+ blacklist: z.array(z.any()).optional(),
393
+ domain_rules: z.record(z.any()).optional()
394
+ }).optional().describe("Per-domain allow/deny lists and crawl rules"),
395
+ import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config"),
396
+ session: z.object({
397
+ enabled: z.boolean(),
398
+ persistCookies: z.boolean().optional(),
399
+ headers: z.record(z.string()).optional(),
400
+ initialRequest: z.object({
401
+ url: z.string().url(),
402
+ method: z.string().optional(),
403
+ headers: z.record(z.string()).optional(),
404
+ body: z.string().optional()
405
+ }).optional()
406
+ }).optional().describe("Shared cookie-jar/session for login-then-crawl workflows")
346
407
  }
347
- }, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
408
+ }, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, content_max_length, concurrency, enable_link_analysis, link_analysis_options, domain_filter, import_filter_config, session }) => {
348
409
  try {
349
410
  if (!url) {
350
411
  return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
351
412
  }
352
- const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency });
413
+ const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, content_max_length, concurrency, enable_link_analysis, link_analysis_options, domain_filter, import_filter_config, session });
353
414
  return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
354
415
  } catch (error) {
355
416
  return { content: [{ type: "text", text: `Crawl failed: ${error.message}` }], isError: true };
@@ -365,14 +426,21 @@ server.registerTool("map_site", {
365
426
  include_sitemap: z.boolean().optional().describe("Include sitemap.xml data in results"),
366
427
  max_urls: z.number().min(1).max(10000).optional().describe("Maximum number of URLs to discover"),
367
428
  group_by_path: z.boolean().optional().describe("Group URLs by path segments"),
368
- include_metadata: z.boolean().optional().describe("Include page metadata for each URL")
429
+ include_metadata: z.boolean().optional().describe("Include page metadata for each URL"),
430
+ domain_filter: z.object({
431
+ whitelist: z.array(z.string()).optional(),
432
+ blacklist: z.array(z.string()).optional(),
433
+ include_patterns: z.array(z.string()).optional(),
434
+ exclude_patterns: z.array(z.string()).optional()
435
+ }).optional().describe("Per-domain allow/deny lists and URL include/exclude patterns"),
436
+ import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config")
369
437
  }
370
- }, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
438
+ }, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config }) => {
371
439
  try {
372
440
  if (!url) {
373
441
  return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
374
442
  }
375
- const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata });
443
+ const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config });
376
444
  return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
377
445
  } catch (error) {
378
446
  return { content: [{ type: "text", text: `Site mapping failed: ${error.message}` }], isError: true };
@@ -406,7 +474,9 @@ server.registerTool("process_document", {
406
474
  inputSchema: {
407
475
  source: z.string().describe("Document source - URL or file path"),
408
476
  sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional().describe("Type of document source"),
409
- options: z.object({}).optional().describe("Additional processing options")
477
+ // C3: passthrough so granular options (maxPages, pageRange:{start,end},
478
+ // extractText, outputFormat, etc.) reach the tool instead of being stripped.
479
+ options: z.object({}).passthrough().optional().describe("Additional processing options (maxPages, pageRange:{start,end}, extractText, extractMetadata, password, outputFormat, ...)")
410
480
  }
411
481
  }, withAuth("process_document", async ({ source, sourceType, options }) => {
412
482
  try {
@@ -572,6 +642,27 @@ server.registerTool("batch_scrape", {
572
642
  }
573
643
  }));
574
644
 
645
+ // Tool: get_batch_results — C3: retrieve paginated results for a completed batch
646
+ server.registerTool("get_batch_results", {
647
+ description: "Retrieve paginated results for a completed or in-progress batch_scrape job. Use the batchId returned by batch_scrape. Example: get_batch_results({batchId: \"batch_1234567890_abc\", page: 2, pageSize: 25})",
648
+ annotations: { title: "Get Batch Results", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
649
+ inputSchema: {
650
+ batchId: z.string().describe("The batch ID returned by batch_scrape"),
651
+ page: z.number().min(1).default(1).describe("Page number (1-based)"),
652
+ pageSize: z.number().min(1).max(100).default(25).describe("Number of results per page")
653
+ }
654
+ }, withAuth("get_batch_results", async ({ batchId, page = 1, pageSize = 25 }) => {
655
+ try {
656
+ if (!batchId) {
657
+ return { content: [{ type: "text", text: "batchId parameter is required" }], isError: true };
658
+ }
659
+ const result = await batchScrapeTool.getBatchResults(batchId, page, pageSize);
660
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
661
+ } catch (error) {
662
+ return { content: [{ type: "text", text: `get_batch_results failed: ${error.message}` }], isError: true };
663
+ }
664
+ }));
665
+
575
666
  // Tool: scrape_with_actions
576
667
  server.registerTool("scrape_with_actions", {
577
668
  description: "Use this when you need to interact with a page before scraping — login, click buttons, fill forms, scroll, or wait for dynamic content to load. Use for SPAs, login-gated content, or multi-step flows. Screenshots from this tool are stored as crawlforge://screenshot/{actionId} resources. Example: scrape_with_actions({url: \"https://app.com/dashboard\", actions: [{type:\"click\",selector:\"#login\"},{type:\"type\",selector:\"#email\",text:\"user@a.com\"}]})",
@@ -586,8 +677,34 @@ server.registerTool("scrape_with_actions", {
586
677
  script: z.string().optional(),
587
678
  timeout: z.number().optional(),
588
679
  description: z.string().optional(),
589
- continueOnError: z.boolean().default(false),
590
- retries: z.number().min(0).max(5).default(0)
680
+ continueOnError: z.boolean().optional(),
681
+ retries: z.number().min(0).max(5).optional(),
682
+ captureAfter: z.boolean().optional().describe("Capture page content after this action"),
683
+ // wait
684
+ duration: z.number().min(0).max(30000).optional().describe("wait: milliseconds to wait"),
685
+ condition: z.enum(['visible', 'hidden', 'enabled', 'disabled', 'stable']).optional().describe("wait: condition on selector"),
686
+ // click
687
+ button: z.enum(['left', 'right', 'middle']).optional().describe("click: mouse button"),
688
+ clickCount: z.number().min(1).max(3).optional().describe("click: number of clicks"),
689
+ delay: z.number().min(0).max(1000).optional().describe("click/type: delay in ms"),
690
+ force: z.boolean().optional().describe("click: bypass actionability checks"),
691
+ position: z.object({ x: z.number(), y: z.number() }).optional().describe("click: relative position"),
692
+ // type
693
+ clear: z.boolean().optional().describe("type: clear field before typing"),
694
+ // press
695
+ modifiers: z.array(z.enum(['Alt', 'Control', 'Meta', 'Shift'])).optional().describe("press: modifier keys"),
696
+ // scroll
697
+ direction: z.enum(['up', 'down', 'left', 'right']).optional().describe("scroll: direction"),
698
+ distance: z.number().min(0).optional().describe("scroll: pixels to scroll"),
699
+ smooth: z.boolean().optional().describe("scroll: smooth scrolling"),
700
+ toElement: z.string().optional().describe("scroll: selector to scroll to"),
701
+ // screenshot
702
+ fullPage: z.boolean().optional().describe("screenshot: capture full page"),
703
+ quality: z.number().min(0).max(100).optional().describe("screenshot: jpeg quality"),
704
+ format: z.enum(['png', 'jpeg']).optional().describe("screenshot: image format"),
705
+ // executeJavaScript
706
+ args: z.array(z.any()).optional().describe("executeJavaScript: arguments passed to the script"),
707
+ returnResult: z.boolean().optional().describe("executeJavaScript: return the script result")
591
708
  })).min(1).max(20).describe("Browser actions to perform before scraping"),
592
709
  formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']).describe("Output formats for scraped content"),
593
710
  captureIntermediateStates: z.boolean().default(false).describe("Capture page state after each action"),
@@ -899,7 +1016,20 @@ server.registerTool("stealth_mode", {
899
1016
  case 'create_page': {
900
1017
  if (!contextId) throw new Error('contextId is required for create_page operation');
901
1018
  const page = await stealthBrowserManager.createStealthPage(contextId);
902
- result = { pageCreated: true, contextId, url: urlToTest ? await page.goto(urlToTest) : null };
1019
+ let navigation = null;
1020
+ if (urlToTest) {
1021
+ // page.goto returns a Playwright Response handle, which is not
1022
+ // JSON-serializable — extract just the useful navigation details.
1023
+ const response = await page.goto(urlToTest);
1024
+ navigation = {
1025
+ requestedUrl: urlToTest,
1026
+ finalUrl: page.url(),
1027
+ status: response ? response.status() : null,
1028
+ ok: response ? response.ok() : null,
1029
+ title: await page.title().catch(() => null)
1030
+ };
1031
+ }
1032
+ result = { pageCreated: true, contextId, navigation };
903
1033
  break;
904
1034
  }
905
1035
  case 'get_stats':
@@ -999,8 +1129,9 @@ server.registerTool("localization", {
999
1129
  };
1000
1130
  break;
1001
1131
  case 'handle_geo_blocking':
1002
- if (!params.url || !params.response) throw new Error('url and response are required for handle_geo_blocking operation');
1003
- result = await localizationManager.handleGeoBlocking(params.url, params.response);
1132
+ case 'detect_geo_blocking':
1133
+ if (!params.url || !params.response) throw new Error('url and response are required for detect_geo_blocking operation');
1134
+ result = await localizationManager.detectGeoBlocking(params.url, params.response);
1004
1135
  break;
1005
1136
  case 'auto_detect':
1006
1137
  if (!params.content || !params.url) throw new Error('content and url are required for auto_detect operation');
@@ -1090,12 +1221,12 @@ async function runServer() {
1090
1221
  "fetch_url", "extract_text", "extract_links", "extract_metadata", "scrape_structured",
1091
1222
  "search_web", "crawl_deep", "map_site",
1092
1223
  "extract_content", "process_document", "summarize_content", "analyze_content",
1093
- "batch_scrape", "scrape_with_actions",
1224
+ "batch_scrape", "get_batch_results", "scrape_with_actions",
1094
1225
  "deep_research", "track_changes", "generate_llms_txt",
1095
1226
  "stealth_mode", "localization", "extract_structured", "extract_with_llm",
1096
- "scrape_template" // D3.3
1227
+ "list_ollama_models", "scrape_template" // D3.3
1097
1228
  ];
1098
- console.error(`Tools available (23): ${allTools.join(", ")}`);
1229
+ console.error(`Tools available (24): ${allTools.join(", ")}`);
1099
1230
 
1100
1231
  // Start memory monitoring in development
1101
1232
  if (config.server.nodeEnv === "development") {
package/src/constants/config.js CHANGED
@@ -15,6 +15,11 @@ export const config = {
15
15
  apiBaseUrl: resolveApiEndpoint(process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev')
16
16
  },
17
17
 
18
+ // Fetch body-size cap
19
+ fetch: {
20
+ maxBodySize: parseInt(process.env.MAX_FETCH_BODY_SIZE || String(25 * 1024 * 1024)) // 25 MB
21
+ },
22
+
18
23
  // Performance
19
24
  performance: {
20
25
  maxWorkers: parseInt(process.env.MAX_WORKERS || '10'),
package/src/core/ActionExecutor.js CHANGED
@@ -213,7 +213,17 @@ export class ActionExecutor extends EventEmitter {
213
213
 
214
214
  // Execute chain with potential retries
215
215
  chainResult = await this.executeChainWithRetries(executionContext);
216
-
216
+
217
+ // Capture the LIVE post-action page state before the page is closed,
218
+ // so callers can extract final content reflecting all actions
219
+ // (instead of re-fetching the original URL).
220
+ try {
221
+ executionContext.finalHtml = await page.content();
222
+ executionContext.finalUrl = page.url();
223
+ } catch (captureErr) {
224
+ this.log('warn', 'Failed to capture final page content: ' + captureErr.message);
225
+ }
226
+
217
227
  this.stats.successfulChains++;
218
228
  executionContext.success = true;
219
229
 
@@ -268,6 +278,8 @@ export class ActionExecutor extends EventEmitter {
268
278
  success: true,
269
279
  chainId,
270
280
  url,
281
+ finalUrl: executionContext.finalUrl || url,
282
+ finalHtml: executionContext.finalHtml,
271
283
  executionTime: Date.now() - startTime,
272
284
  results: executionContext.results,
273
285
  screenshots: executionContext.screenshots,
package/src/core/ChangeTracker.js CHANGED
@@ -173,12 +173,15 @@ export class ChangeTracker extends EventEmitter {
173
173
  */
174
174
  async compareWithBaseline(url, currentContent, options = {}) {
175
175
  const startTime = Date.now();
176
-
176
+
177
+ // Expected no-baseline case: return a clean error WITHOUT emitting an
178
+ // unhandled 'error' event (which would crash callers with no 'error' listener).
179
+ if (!this.snapshots.has(url)) {
180
+ throw new Error(`No baseline found for ${url} — run create_baseline first`);
181
+ }
182
+
177
183
  try {
178
- if (!this.snapshots.has(url)) {
179
- throw new Error(`No baseline found for URL: ${url}`);
180
- }
181
-
184
+
182
185
  const snapshots = this.snapshots.get(url);
183
186
  const baseline = snapshots[snapshots.length - 1]; // Get latest baseline
184
187
 
package/src/core/LLMsTxtAnalyzer.js CHANGED
@@ -28,7 +28,10 @@ export class LLMsTxtAnalyzer {
28
28
  respectRobots: options.respectRobots !== false,
29
29
  detectAPIs: options.detectAPIs !== false,
30
30
  analyzeContent: options.analyzeContent !== false,
31
- checkSecurity: options.checkSecurity !== false,
31
+ // C1: intrusive probing is now opt-in (default false) to avoid hammering
32
+ // security-sensitive and rate-probe paths on every generation run.
33
+ checkSecurity: options.checkSecurity === true,
34
+ probeRateLimit: options.probeRateLimit === true,
32
35
  ...options
33
36
  };
34
37
 
@@ -70,26 +73,31 @@ export class LLMsTxtAnalyzer {
70
73
  analysisOptions: { ...this.options, ...options }
71
74
  };
72
75
 
73
- // Phase 1: Site Structure Analysis
76
+ // Phase 1: Site Structure Analysis (must run first — subsequent phases
77
+ // depend on the URL list it produces)
74
78
  await this.analyzeSiteStructure(url, options);
75
79
 
76
- // Phase 2: API Detection
80
+ // Phases 2-5 run in parallel where they are independent of each other.
81
+ // detectAPIEndpoints and analyzeSecurity each fire a bounded set of probe
82
+ // fetches (capped at PROBE_CONCURRENCY concurrent requests per phase).
83
+ // analyzeRateLimiting is only executed when the caller opts in via
84
+ // probeRateLimit:true — its 5 sequential requests are intrusive.
85
+ const parallelTasks = [];
86
+
77
87
  if (this.options.detectAPIs) {
78
- await this.detectAPIEndpoints(url);
88
+ parallelTasks.push(this.detectAPIEndpoints(url));
79
89
  }
80
-
81
- // Phase 3: Content Classification
82
90
  if (this.options.analyzeContent) {
83
- await this.classifyContent();
91
+ parallelTasks.push(this.classifyContent());
84
92
  }
85
-
86
- // Phase 4: Security Analysis
87
93
  if (this.options.checkSecurity) {
88
- await this.analyzeSecurity(url);
94
+ parallelTasks.push(this.analyzeSecurity(url));
95
+ }
96
+ if (this.options.probeRateLimit) {
97
+ parallelTasks.push(this.analyzeRateLimiting(url));
89
98
  }
90
99
 
91
- // Phase 5: Rate Limiting Analysis
92
- await this.analyzeRateLimiting(url);
100
+ await Promise.all(parallelTasks);
93
101
 
94
102
  // Phase 6: Generate Guidelines
95
103
  await this.generateUsageGuidelines();
@@ -160,35 +168,43 @@ export class LLMsTxtAnalyzer {
160
168
 
161
169
  /**
162
170
  * Detect API endpoints and data sources
171
+ * C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
163
172
  */
164
173
  async detectAPIEndpoints(baseUrl) {
165
174
  logger.info('Detecting API endpoints...');
166
175
 
176
+ const PROBE_CONCURRENCY = 6;
177
+
167
178
  try {
168
- const apis = [];
169
179
  const commonPaths = [
170
180
  '/api', '/v1', '/v2', '/v3', '/rest', '/graphql',
171
181
  '/data', '/feed', '/json', '/xml', '/rss',
172
182
  '/.well-known', '/openapi', '/swagger'
173
183
  ];
174
184
 
175
- // Check common API paths
176
- for (const path of commonPaths) {
177
- const apiUrl = `${baseUrl}${path}`;
178
- try {
179
- const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
180
- if (response.ok) {
181
- const contentType = response.headers.get('content-type') || '';
182
- apis.push({
183
- url: apiUrl,
184
- type: this.determineAPIType(apiUrl, contentType),
185
- status: response.status,
186
- contentType,
187
- accessible: true
188
- });
189
- }
190
- } catch {
191
- // API endpoint not accessible or doesn't exist
185
+ // Run path probes in parallel batches
186
+ const apis = [];
187
+ for (let i = 0; i < commonPaths.length; i += PROBE_CONCURRENCY) {
188
+ const batch = commonPaths.slice(i, i + PROBE_CONCURRENCY);
189
+ const results = await Promise.allSettled(
190
+ batch.map(async (path) => {
191
+ const apiUrl = `${baseUrl}${path}`;
192
+ const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
193
+ if (response.ok) {
194
+ const contentType = response.headers.get('content-type') || '';
195
+ return {
196
+ url: apiUrl,
197
+ type: this.determineAPIType(apiUrl, contentType),
198
+ status: response.status,
199
+ contentType,
200
+ accessible: true
201
+ };
202
+ }
203
+ return null;
204
+ })
205
+ );
206
+ for (const r of results) {
207
+ if (r.status === 'fulfilled' && r.value) apis.push(r.value);
192
208
  }
193
209
  }
194
210
 
@@ -278,13 +294,14 @@ export class LLMsTxtAnalyzer {
278
294
 
279
295
  /**
280
296
  * Analyze security boundaries and sensitive areas
297
+ * C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
281
298
  */
282
299
  async analyzeSecurity(baseUrl) {
283
300
  logger.info('Analyzing security boundaries...');
284
301
 
285
- try {
286
- const securityAreas = [];
302
+ const PROBE_CONCURRENCY = 6;
287
303
 
304
+ try {
288
305
  // Check for common sensitive paths
289
306
  const sensitivePaths = [
290
307
  '/admin', '/administrator', '/wp-admin', '/cms',
@@ -294,21 +311,28 @@ export class LLMsTxtAnalyzer {
294
311
  '/config', '/settings', '/env'
295
312
  ];
296
313
 
297
- for (const path of sensitivePaths) {
298
- const testUrl = `${baseUrl}${path}`;
299
- try {
300
- const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
301
- if (response.status === 200 || response.status === 302 || response.status === 401) {
302
- securityAreas.push({
303
- path,
304
- url: testUrl,
305
- status: response.status,
306
- type: this.classifySecurityArea(path),
307
- recommendation: 'restrict'
308
- });
309
- }
310
- } catch {
311
- // Area not accessible
314
+ // Run path probes in parallel batches
315
+ const securityAreas = [];
316
+ for (let i = 0; i < sensitivePaths.length; i += PROBE_CONCURRENCY) {
317
+ const batch = sensitivePaths.slice(i, i + PROBE_CONCURRENCY);
318
+ const results = await Promise.allSettled(
319
+ batch.map(async (path) => {
320
+ const testUrl = `${baseUrl}${path}`;
321
+ const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
322
+ if (response.status === 200 || response.status === 302 || response.status === 401) {
323
+ return {
324
+ path,
325
+ url: testUrl,
326
+ status: response.status,
327
+ type: this.classifySecurityArea(path),
328
+ recommendation: 'restrict'
329
+ };
330
+ }
331
+ return null;
332
+ })
333
+ );
334
+ for (const r of results) {
335
+ if (r.status === 'fulfilled' && r.value) securityAreas.push(r.value);
312
336
  }
313
337
  }
314
338
 
package/src/core/LocalizationManager.js CHANGED
@@ -499,12 +499,14 @@ export class LocalizationManager extends EventEmitter {
499
499
  }
500
500
 
501
501
  /**
502
- * Detect and handle geo-blocked content
502
+ * Detect geo-blocked content and return suggestions.
503
+ * C3: renamed from handleGeoBlocking — no bypass is actually applied here;
504
+ * the returned bypassStrategies are recommendations only.
503
505
  * @param {string} url - URL to check
504
506
  * @param {Object} response - HTTP response object
505
- * @returns {Object} - Analysis and bypass suggestions
507
+ * @returns {Object} - Detection result and bypass suggestions
506
508
  */
507
- async handleGeoBlocking(url, response) {
509
+ async detectGeoBlocking(url, response) {
508
510
  const geoBlockingIndicators = [
509
511
  /not available in your country/i,
510
512
  /access denied/i,
@@ -1386,8 +1388,9 @@ export class LocalizationManager extends EventEmitter {
1386
1388
  }
1387
1389
 
1388
1390
  // Phone number pattern analysis
1391
+ // C3: fix US pattern — was using \\d (literal backslash-d) instead of \d
1389
1392
  const phonePatterns = {
1390
- 'US': /\+1[\s.-]?\(?\\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/,
1393
+ 'US': /\+1[\s.-]?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/,
1391
1394
  'GB': /\+44[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
1392
1395
  'DE': /\+49[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
1393
1396
  'FR': /\+33[\s.-]?\d{1}[\s.-]?\d{8}/
package/src/core/ResearchOrchestrator.js CHANGED
@@ -519,14 +519,18 @@ export class ResearchOrchestrator extends EventEmitter {
519
519
  }
520
520
  }
521
521
 
522
- if (contentData && contentData.content) {
522
+ // Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
523
+ const contentText = contentData && contentData.content
524
+ ? (typeof contentData.content === 'string'
525
+ ? contentData.content
526
+ : (contentData.content.text || ''))
527
+ : '';
528
+
529
+ // Only count and enhance sources that actually produced non-empty content.
530
+ // Skip failed extractions and empty {text:""} results.
531
+ if (contentData && contentData.success !== false && contentText.trim().length > 0) {
523
532
  this.metrics.contentExtracted++;
524
533
 
525
- // Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
526
- const contentText = typeof contentData.content === 'string'
527
- ? contentData.content
528
- : (contentData.content.text || JSON.stringify(contentData.content));
529
-
530
534
  // Enhance source with extracted content
531
535
  let enhancedSource = {
532
536
  ...source,