npm - crawlforge-mcp-server - Versions diffs - 4.2.12 → 4.5.0 - Mend

crawlforge-mcp-server 4.2.12 → 4.5.0

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (40)

package/package.json +2 -1
package/server.js +138 -20
package/src/constants/config.js +5 -0
package/src/core/ActionExecutor.js +13 -1
package/src/core/ChangeTracker.js +8 -5
package/src/core/LLMsTxtAnalyzer.js +71 -47
package/src/core/LocalizationManager.js +7 -4
package/src/core/ResearchOrchestrator.js +10 -6
package/src/core/StealthBrowserManager.js +52 -13
package/src/core/analysis/ContentAnalyzer.js +2 -2
package/src/core/crawlers/BFSCrawler.js +23 -12
package/src/core/processing/ContentProcessor.js +19 -3
package/src/core/processing/PDFProcessor.js +72 -23
package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
package/src/tools/advanced/batchScrape/index.js +3 -1
package/src/tools/advanced/batchScrape/reporter.js +5 -1
package/src/tools/advanced/batchScrape/worker.js +6 -1
package/src/tools/basic/_fetch.js +78 -5
package/src/tools/basic/extractLinks.js +1 -1
package/src/tools/basic/extractMetadata.js +65 -1
package/src/tools/basic/extractText.js +61 -5
package/src/tools/basic/scrapeStructured.js +48 -10
package/src/tools/crawl/crawlDeep.js +13 -5
package/src/tools/crawl/mapSite.js +24 -51
package/src/tools/extract/analyzeContent.js +11 -6
package/src/tools/extract/extractContent.js +23 -5
package/src/tools/extract/extractStructured.js +65 -16
package/src/tools/extract/extractWithLlm.js +192 -11
package/src/tools/extract/listOllamaModels.js +19 -8
package/src/tools/extract/processDocument.js +10 -4
package/src/tools/extract/summarizeContent.js +58 -1
package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
package/src/tools/research/deepResearch.js +43 -4
package/src/tools/search/providers/searxng.js +2 -2
package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
package/src/tools/search/ranking/ResultRanker.js +13 -4
package/src/tools/search/searchWeb.js +5 -5
package/src/tools/templates/TemplateRegistry.js +3 -2
package/src/tools/tracking/trackChanges/differ.js +33 -1
package/src/utils/htmlToMarkdown.js +5 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "crawlforge-mcp-server",
-  "version": "4.2.12",
+  "version": "4.5.0",
   "description": "CrawlForge MCP Server - Professional Model Context Protocol server with 23 web scraping, crawling, and content processing tools. Defaults to local Ollama for LLM extraction (no API key needed); OpenAI/Anthropic available as opt-in. v4.0 adds Markdown-first output, pre-built site templates, Camoufox stealth engine, and cost transparency.",
   "main": "server.js",
   "bin": {
@@ -113,6 +113,7 @@
     "playwright": "^1.54.2",
     "robots-parser": "^3.0.1",
     "turndown": "^7.2.4",
+    "turndown-plugin-gfm": "^1.0.2",
     "undici": "^7.24.0",
     "winston": "^3.11.0",
     "zod": "^3.23.8"

package/server.js CHANGED Viewed

@@ -96,8 +96,8 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
 // Create the server
 const server = new McpServer({
   name: "crawlforge",
-  version: "4.2.6",
-  description: "Production-ready MCP server with 23 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
+  version: "4.5.0",
+  description: "Production-ready MCP server with 24 web scraping, crawling, and content processing tools. Features MCP Resources (crawlforge://), Prompts, Sampling fallback, Elicitation, stealth browsing, deep research, structured extraction, change tracking, and local-LLM extraction via Ollama.",
   homepage: "https://www.crawlforge.dev",
   icon: "https://www.crawlforge.dev/icon.png"
 });
@@ -299,7 +299,8 @@ server.registerTool("scrape_structured", {
   annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
   inputSchema: {
     url: z.string().url().describe("The URL to scrape"),
-    selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors")
+    selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors. Append @attr to extract an attribute instead of text (e.g. \"a.link@href\", \"img@src\")"),
+    max_results: z.number().int().min(1).optional().describe("Maximum number of matches to return per field when a selector matches multiple elements")
   }
 }, withAuth("scrape_structured", scrapeStructuredHandler));
@@ -315,14 +316,50 @@ server.registerTool("search_web", {
     safe_search: z.boolean().optional().describe("Enable safe search filtering"),
     time_range: z.enum(["day", "week", "month", "year", "all"]).optional().describe("Filter results by time range"),
     site: z.string().optional().describe("Limit results to a specific domain"),
-    file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')")
+    file_type: z.string().optional().describe("Filter by file type (e.g. 'pdf', 'doc')"),
+    provider: z.enum(["crawlforge", "searxng"]).optional().describe("Search backend to use"),
+    expand_query: z.boolean().optional().describe("Expand the query with synonyms/stemming/etc."),
+    expansion_options: z.object({
+      enableSynonyms: z.boolean().optional(),
+      enableSpellCheck: z.boolean().optional(),
+      enableStemming: z.boolean().optional(),
+      enablePhraseDetection: z.boolean().optional(),
+      enableBooleanOperators: z.boolean().optional(),
+      maxExpansions: z.number().min(1).max(10).optional()
+    }).optional().describe("Query-expansion tuning"),
+    enable_ranking: z.boolean().optional().describe("Re-rank results (BM25 + signals)"),
+    ranking_weights: z.object({
+      bm25: z.number().min(0).max(1).optional(),
+      semantic: z.number().min(0).max(1).optional(),
+      authority: z.number().min(0).max(1).optional(),
+      freshness: z.number().min(0).max(1).optional()
+    }).optional().describe("Relative weights for ranking signals"),
+    enable_deduplication: z.boolean().optional().describe("Remove near-duplicate results"),
+    deduplication_thresholds: z.object({
+      url: z.number().min(0).max(1).optional(),
+      title: z.number().min(0).max(1).optional(),
+      content: z.number().min(0).max(1).optional(),
+      combined: z.number().min(0).max(1).optional()
+    }).optional().describe("Similarity thresholds for dedup"),
+    include_ranking_details: z.boolean().optional().describe("Include per-result ranking breakdown"),
+    include_deduplication_details: z.boolean().optional().describe("Include dedup decision details"),
+    localization: z.object({
+      countryCode: z.string().length(2).optional(),
+      language: z.string().optional(),
+      timezone: z.string().optional(),
+      enableGeoTargeting: z.boolean().optional(),
+      customLocation: z.object({
+        latitude: z.number().min(-90).max(90),
+        longitude: z.number().min(-180).max(180)
+      }).optional()
+    }).optional().describe("Geo/locale targeting for results")
   }
-}, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
+}, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type, provider, expand_query, expansion_options, enable_ranking, ranking_weights, enable_deduplication, deduplication_thresholds, include_ranking_details, include_deduplication_details, localization }) => {
   try {
     if (!query) {
       return { content: [{ type: "text", text: "Query parameter is required" }], isError: true };
     }
-    const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type });
+    const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type, provider, expand_query, expansion_options, enable_ranking, ranking_weights, enable_deduplication, deduplication_thresholds, include_ranking_details, include_deduplication_details, localization });
     return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
   } catch (error) {
     return { content: [{ type: "text", text: `Search failed: ${error.message}` }], isError: true };
@@ -342,14 +379,38 @@ server.registerTool("crawl_deep", {
     follow_external: z.boolean().optional().describe("Follow links to external domains"),
     respect_robots: z.boolean().optional().describe("Respect robots.txt directives"),
     extract_content: z.boolean().optional().describe("Extract page content during crawl"),
-    concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests")
+    content_max_length: z.number().min(1).max(100000).optional().describe("Maximum characters of page content to include per page (default 500); sets a truncated flag when trimmed"),
+    concurrency: z.number().min(1).max(20).optional().describe("Number of concurrent requests"),
+    enable_link_analysis: z.boolean().optional().describe("Compute PageRank/link-graph analysis over crawled pages"),
+    link_analysis_options: z.object({
+      dampingFactor: z.number().min(0).max(1).optional(),
+      maxIterations: z.number().min(1).max(1000).optional(),
+      enableCaching: z.boolean().optional()
+    }).optional().describe("PageRank tuning options"),
+    domain_filter: z.object({
+      whitelist: z.array(z.any()).optional(),
+      blacklist: z.array(z.any()).optional(),
+      domain_rules: z.record(z.any()).optional()
+    }).optional().describe("Per-domain allow/deny lists and crawl rules"),
+    import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config"),
+    session: z.object({
+      enabled: z.boolean(),
+      persistCookies: z.boolean().optional(),
+      headers: z.record(z.string()).optional(),
+      initialRequest: z.object({
+        url: z.string().url(),
+        method: z.string().optional(),
+        headers: z.record(z.string()).optional(),
+        body: z.string().optional()
+      }).optional()
+    }).optional().describe("Shared cookie-jar/session for login-then-crawl workflows")
   }
-}, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
+}, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, content_max_length, concurrency, enable_link_analysis, link_analysis_options, domain_filter, import_filter_config, session }) => {
   try {
     if (!url) {
       return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
     }
-    const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency });
+    const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, content_max_length, concurrency, enable_link_analysis, link_analysis_options, domain_filter, import_filter_config, session });
     return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
   } catch (error) {
     return { content: [{ type: "text", text: `Crawl failed: ${error.message}` }], isError: true };
@@ -365,14 +426,21 @@ server.registerTool("map_site", {
     include_sitemap: z.boolean().optional().describe("Include sitemap.xml data in results"),
     max_urls: z.number().min(1).max(10000).optional().describe("Maximum number of URLs to discover"),
     group_by_path: z.boolean().optional().describe("Group URLs by path segments"),
-    include_metadata: z.boolean().optional().describe("Include page metadata for each URL")
+    include_metadata: z.boolean().optional().describe("Include page metadata for each URL"),
+    domain_filter: z.object({
+      whitelist: z.array(z.string()).optional(),
+      blacklist: z.array(z.string()).optional(),
+      include_patterns: z.array(z.string()).optional(),
+      exclude_patterns: z.array(z.string()).optional()
+    }).optional().describe("Per-domain allow/deny lists and URL include/exclude patterns"),
+    import_filter_config: z.string().optional().describe("JSON string of a previously exported domain-filter config")
   }
-}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
+}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config }) => {
   try {
     if (!url) {
       return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
     }
-    const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata });
+    const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata, domain_filter, import_filter_config });
     return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
   } catch (error) {
     return { content: [{ type: "text", text: `Site mapping failed: ${error.message}` }], isError: true };
@@ -406,7 +474,9 @@ server.registerTool("process_document", {
   inputSchema: {
     source: z.string().describe("Document source - URL or file path"),
     sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional().describe("Type of document source"),
-    options: z.object({}).optional().describe("Additional processing options")
+    // C3: passthrough so granular options (maxPages, pageRange:{start,end},
+    // extractText, outputFormat, etc.) reach the tool instead of being stripped.
+    options: z.object({}).passthrough().optional().describe("Additional processing options (maxPages, pageRange:{start,end}, extractText, extractMetadata, password, outputFormat, ...)")
   }
 }, withAuth("process_document", async ({ source, sourceType, options }) => {
   try {
@@ -572,6 +642,27 @@ server.registerTool("batch_scrape", {
   }
 }));
+// Tool: get_batch_results — C3: retrieve paginated results for a completed batch
+server.registerTool("get_batch_results", {
+  description: "Retrieve paginated results for a completed or in-progress batch_scrape job. Use the batchId returned by batch_scrape. Example: get_batch_results({batchId: \"batch_1234567890_abc\", page: 2, pageSize: 25})",
+  annotations: { title: "Get Batch Results", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
+  inputSchema: {
+    batchId: z.string().describe("The batch ID returned by batch_scrape"),
+    page: z.number().min(1).default(1).describe("Page number (1-based)"),
+    pageSize: z.number().min(1).max(100).default(25).describe("Number of results per page")
+  }
+}, withAuth("get_batch_results", async ({ batchId, page = 1, pageSize = 25 }) => {
+  try {
+    if (!batchId) {
+      return { content: [{ type: "text", text: "batchId parameter is required" }], isError: true };
+    }
+    const result = await batchScrapeTool.getBatchResults(batchId, page, pageSize);
+    return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
+  } catch (error) {
+    return { content: [{ type: "text", text: `get_batch_results failed: ${error.message}` }], isError: true };
+  }
+}));
 // Tool: scrape_with_actions
 server.registerTool("scrape_with_actions", {
   description: "Use this when you need to interact with a page before scraping — login, click buttons, fill forms, scroll, or wait for dynamic content to load. Use for SPAs, login-gated content, or multi-step flows. Screenshots from this tool are stored as crawlforge://screenshot/{actionId} resources. Example: scrape_with_actions({url: \"https://app.com/dashboard\", actions: [{type:\"click\",selector:\"#login\"},{type:\"type\",selector:\"#email\",text:\"user@a.com\"}]})",
@@ -586,8 +677,34 @@ server.registerTool("scrape_with_actions", {
       script: z.string().optional(),
       timeout: z.number().optional(),
       description: z.string().optional(),
-      continueOnError: z.boolean().default(false),
-      retries: z.number().min(0).max(5).default(0)
+      continueOnError: z.boolean().optional(),
+      retries: z.number().min(0).max(5).optional(),
+      captureAfter: z.boolean().optional().describe("Capture page content after this action"),
+      // wait
+      duration: z.number().min(0).max(30000).optional().describe("wait: milliseconds to wait"),
+      condition: z.enum(['visible', 'hidden', 'enabled', 'disabled', 'stable']).optional().describe("wait: condition on selector"),
+      // click
+      button: z.enum(['left', 'right', 'middle']).optional().describe("click: mouse button"),
+      clickCount: z.number().min(1).max(3).optional().describe("click: number of clicks"),
+      delay: z.number().min(0).max(1000).optional().describe("click/type: delay in ms"),
+      force: z.boolean().optional().describe("click: bypass actionability checks"),
+      position: z.object({ x: z.number(), y: z.number() }).optional().describe("click: relative position"),
+      // type
+      clear: z.boolean().optional().describe("type: clear field before typing"),
+      // press
+      modifiers: z.array(z.enum(['Alt', 'Control', 'Meta', 'Shift'])).optional().describe("press: modifier keys"),
+      // scroll
+      direction: z.enum(['up', 'down', 'left', 'right']).optional().describe("scroll: direction"),
+      distance: z.number().min(0).optional().describe("scroll: pixels to scroll"),
+      smooth: z.boolean().optional().describe("scroll: smooth scrolling"),
+      toElement: z.string().optional().describe("scroll: selector to scroll to"),
+      // screenshot
+      fullPage: z.boolean().optional().describe("screenshot: capture full page"),
+      quality: z.number().min(0).max(100).optional().describe("screenshot: jpeg quality"),
+      format: z.enum(['png', 'jpeg']).optional().describe("screenshot: image format"),
+      // executeJavaScript
+      args: z.array(z.any()).optional().describe("executeJavaScript: arguments passed to the script"),
+      returnResult: z.boolean().optional().describe("executeJavaScript: return the script result")
     })).min(1).max(20).describe("Browser actions to perform before scraping"),
     formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']).describe("Output formats for scraped content"),
     captureIntermediateStates: z.boolean().default(false).describe("Capture page state after each action"),
@@ -1012,8 +1129,9 @@ server.registerTool("localization", {
         };
         break;
       case 'handle_geo_blocking':
-        if (!params.url || !params.response) throw new Error('url and response are required for handle_geo_blocking operation');
-        result = await localizationManager.handleGeoBlocking(params.url, params.response);
+      case 'detect_geo_blocking':
+        if (!params.url || !params.response) throw new Error('url and response are required for detect_geo_blocking operation');
+        result = await localizationManager.detectGeoBlocking(params.url, params.response);
         break;
       case 'auto_detect':
         if (!params.content || !params.url) throw new Error('content and url are required for auto_detect operation');
@@ -1103,12 +1221,12 @@ async function runServer() {
     "fetch_url", "extract_text", "extract_links", "extract_metadata", "scrape_structured",
     "search_web", "crawl_deep", "map_site",
     "extract_content", "process_document", "summarize_content", "analyze_content",
-    "batch_scrape", "scrape_with_actions",
+    "batch_scrape", "get_batch_results", "scrape_with_actions",
     "deep_research", "track_changes", "generate_llms_txt",
     "stealth_mode", "localization", "extract_structured", "extract_with_llm",
-    "scrape_template"  // D3.3
+    "list_ollama_models", "scrape_template"  // D3.3
   ];
-  console.error(`Tools available (23): ${allTools.join(", ")}`);
+  console.error(`Tools available (24): ${allTools.join(", ")}`);
   // Start memory monitoring in development
   if (config.server.nodeEnv === "development") {

package/src/constants/config.js CHANGED Viewed

@@ -15,6 +15,11 @@ export const config = {
     apiBaseUrl: resolveApiEndpoint(process.env.CRAWLFORGE_API_URL || 'https://www.crawlforge.dev')
   },
+  // Fetch body-size cap
+  fetch: {
+    maxBodySize: parseInt(process.env.MAX_FETCH_BODY_SIZE || String(25 * 1024 * 1024)) // 25 MB
+  },
   // Performance
   performance: {
     maxWorkers: parseInt(process.env.MAX_WORKERS || '10'),

package/src/core/ActionExecutor.js CHANGED Viewed

@@ -213,7 +213,17 @@ export class ActionExecutor extends EventEmitter {
         // Execute chain with potential retries
         chainResult = await this.executeChainWithRetries(executionContext);
+        // Capture the LIVE post-action page state before the page is closed,
+        // so callers can extract final content reflecting all actions
+        // (instead of re-fetching the original URL).
+        try {
+          executionContext.finalHtml = await page.content();
+          executionContext.finalUrl = page.url();
+        } catch (captureErr) {
+          this.log('warn', 'Failed to capture final page content: ' + captureErr.message);
+        }
         this.stats.successfulChains++;
         executionContext.success = true;
@@ -268,6 +278,8 @@ export class ActionExecutor extends EventEmitter {
         success: true,
         chainId,
         url,
+        finalUrl: executionContext.finalUrl || url,
+        finalHtml: executionContext.finalHtml,
         executionTime: Date.now() - startTime,
         results: executionContext.results,
         screenshots: executionContext.screenshots,

package/src/core/ChangeTracker.js CHANGED Viewed

@@ -173,12 +173,15 @@ export class ChangeTracker extends EventEmitter {
    */
   async compareWithBaseline(url, currentContent, options = {}) {
     const startTime = Date.now();
+    // Expected no-baseline case: return a clean error WITHOUT emitting an
+    // unhandled 'error' event (which would crash callers with no 'error' listener).
+    if (!this.snapshots.has(url)) {
+      throw new Error(`No baseline found for ${url} — run create_baseline first`);
+    }
     try {
-      if (!this.snapshots.has(url)) {
-        throw new Error(`No baseline found for URL: ${url}`);
-      }
       const snapshots = this.snapshots.get(url);
       const baseline = snapshots[snapshots.length - 1]; // Get latest baseline

package/src/core/LLMsTxtAnalyzer.js CHANGED Viewed

@@ -28,7 +28,10 @@ export class LLMsTxtAnalyzer {
       respectRobots: options.respectRobots !== false,
       detectAPIs: options.detectAPIs !== false,
       analyzeContent: options.analyzeContent !== false,
-      checkSecurity: options.checkSecurity !== false,
+      // C1: intrusive probing is now opt-in (default false) to avoid hammering
+      // security-sensitive and rate-probe paths on every generation run.
+      checkSecurity: options.checkSecurity === true,
+      probeRateLimit: options.probeRateLimit === true,
       ...options
     };
@@ -70,26 +73,31 @@ export class LLMsTxtAnalyzer {
         analysisOptions: { ...this.options, ...options }
       };
-      // Phase 1: Site Structure Analysis
+      // Phase 1: Site Structure Analysis (must run first — subsequent phases
+      // depend on the URL list it produces)
       await this.analyzeSiteStructure(url, options);
-      // Phase 2: API Detection
+      // Phases 2-5 run in parallel where they are independent of each other.
+      // detectAPIEndpoints and analyzeSecurity each fire a bounded set of probe
+      // fetches (capped at PROBE_CONCURRENCY concurrent requests per phase).
+      // analyzeRateLimiting is only executed when the caller opts in via
+      // probeRateLimit:true — its 5 sequential requests are intrusive.
+      const parallelTasks = [];
       if (this.options.detectAPIs) {
-        await this.detectAPIEndpoints(url);
+        parallelTasks.push(this.detectAPIEndpoints(url));
       }
-      // Phase 3: Content Classification
       if (this.options.analyzeContent) {
-        await this.classifyContent();
+        parallelTasks.push(this.classifyContent());
       }
-      // Phase 4: Security Analysis
       if (this.options.checkSecurity) {
-        await this.analyzeSecurity(url);
+        parallelTasks.push(this.analyzeSecurity(url));
+      }
+      if (this.options.probeRateLimit) {
+        parallelTasks.push(this.analyzeRateLimiting(url));
       }
-      // Phase 5: Rate Limiting Analysis
-      await this.analyzeRateLimiting(url);
+      await Promise.all(parallelTasks);
       // Phase 6: Generate Guidelines
       await this.generateUsageGuidelines();
@@ -160,35 +168,43 @@ export class LLMsTxtAnalyzer {
   /**
    * Detect API endpoints and data sources
+   * C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
    */
   async detectAPIEndpoints(baseUrl) {
     logger.info('Detecting API endpoints...');
+    const PROBE_CONCURRENCY = 6;
     try {
-      const apis = [];
       const commonPaths = [
         '/api', '/v1', '/v2', '/v3', '/rest', '/graphql',
         '/data', '/feed', '/json', '/xml', '/rss',
         '/.well-known', '/openapi', '/swagger'
       ];
-      // Check common API paths
-      for (const path of commonPaths) {
-        const apiUrl = `${baseUrl}${path}`;
-        try {
-          const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
-          if (response.ok) {
-            const contentType = response.headers.get('content-type') || '';
-            apis.push({
-              url: apiUrl,
-              type: this.determineAPIType(apiUrl, contentType),
-              status: response.status,
-              contentType,
-              accessible: true
-            });
-          }
-        } catch {
-          // API endpoint not accessible or doesn't exist
+      // Run path probes in parallel batches
+      const apis = [];
+      for (let i = 0; i < commonPaths.length; i += PROBE_CONCURRENCY) {
+        const batch = commonPaths.slice(i, i + PROBE_CONCURRENCY);
+        const results = await Promise.allSettled(
+          batch.map(async (path) => {
+            const apiUrl = `${baseUrl}${path}`;
+            const response = await this.fetchWithTimeout(apiUrl, { timeout: 5000 });
+            if (response.ok) {
+              const contentType = response.headers.get('content-type') || '';
+              return {
+                url: apiUrl,
+                type: this.determineAPIType(apiUrl, contentType),
+                status: response.status,
+                contentType,
+                accessible: true
+              };
+            }
+            return null;
+          })
+        );
+        for (const r of results) {
+          if (r.status === 'fulfilled' && r.value) apis.push(r.value);
         }
       }
@@ -278,13 +294,14 @@ export class LLMsTxtAnalyzer {
   /**
    * Analyze security boundaries and sensitive areas
+   * C1: probe fetches run in parallel (capped at PROBE_CONCURRENCY).
    */
   async analyzeSecurity(baseUrl) {
     logger.info('Analyzing security boundaries...');
-    try {
-      const securityAreas = [];
+    const PROBE_CONCURRENCY = 6;
+    try {
       // Check for common sensitive paths
       const sensitivePaths = [
         '/admin', '/administrator', '/wp-admin', '/cms',
@@ -294,21 +311,28 @@ export class LLMsTxtAnalyzer {
         '/config', '/settings', '/env'
       ];
-      for (const path of sensitivePaths) {
-        const testUrl = `${baseUrl}${path}`;
-        try {
-          const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
-          if (response.status === 200 || response.status === 302 || response.status === 401) {
-            securityAreas.push({
-              path,
-              url: testUrl,
-              status: response.status,
-              type: this.classifySecurityArea(path),
-              recommendation: 'restrict'
-            });
-          }
-        } catch {
-          // Area not accessible
+      // Run path probes in parallel batches
+      const securityAreas = [];
+      for (let i = 0; i < sensitivePaths.length; i += PROBE_CONCURRENCY) {
+        const batch = sensitivePaths.slice(i, i + PROBE_CONCURRENCY);
+        const results = await Promise.allSettled(
+          batch.map(async (path) => {
+            const testUrl = `${baseUrl}${path}`;
+            const response = await this.fetchWithTimeout(testUrl, { timeout: 3000 });
+            if (response.status === 200 || response.status === 302 || response.status === 401) {
+              return {
+                path,
+                url: testUrl,
+                status: response.status,
+                type: this.classifySecurityArea(path),
+                recommendation: 'restrict'
+              };
+            }
+            return null;
+          })
+        );
+        for (const r of results) {
+          if (r.status === 'fulfilled' && r.value) securityAreas.push(r.value);
         }
       }

package/src/core/LocalizationManager.js CHANGED Viewed

@@ -499,12 +499,14 @@ export class LocalizationManager extends EventEmitter {
   }
   /**
-   * Detect and handle geo-blocked content
+   * Detect geo-blocked content and return suggestions.
+   * C3: renamed from handleGeoBlocking — no bypass is actually applied here;
+   * the returned bypassStrategies are recommendations only.
    * @param {string} url - URL to check
    * @param {Object} response - HTTP response object
-   * @returns {Object} - Analysis and bypass suggestions
+   * @returns {Object} - Detection result and bypass suggestions
    */
-  async handleGeoBlocking(url, response) {
+  async detectGeoBlocking(url, response) {
     const geoBlockingIndicators = [
       /not available in your country/i,
       /access denied/i,
@@ -1386,8 +1388,9 @@ export class LocalizationManager extends EventEmitter {
     }
     // Phone number pattern analysis
+    // C3: fix US pattern — was using \\d (literal backslash-d) instead of \d
     const phonePatterns = {
-      'US': /\+1[\s.-]?\(?\\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/,
+      'US': /\+1[\s.-]?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/,
       'GB': /\+44[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
       'DE': /\+49[\s.-]?\d{2,4}[\s.-]?\d{6,8}/,
       'FR': /\+33[\s.-]?\d{1}[\s.-]?\d{8}/

package/src/core/ResearchOrchestrator.js CHANGED Viewed

@@ -519,14 +519,18 @@ export class ResearchOrchestrator extends EventEmitter {
               }
             }
-            if (contentData && contentData.content) {
+            // Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
+            const contentText = contentData && contentData.content
+              ? (typeof contentData.content === 'string'
+                  ? contentData.content
+                  : (contentData.content.text || ''))
+              : '';
+            // Only count and enhance sources that actually produced non-empty content.
+            // Skip failed extractions and empty {text:""} results.
+            if (contentData && contentData.success !== false && contentText.trim().length > 0) {
               this.metrics.contentExtracted++;
-              // Normalize content to string (extract_content returns {text: "..."}, fallback returns string)
-              const contentText = typeof contentData.content === 'string'
-                ? contentData.content
-                : (contentData.content.text || JSON.stringify(contentData.content));
               // Enhance source with extracted content
               let enhancedSource = {
                 ...source,