npm - crawlforge-mcp-server - Versions diffs - 4.2.11 → 4.5.0 - Mend

crawlforge-mcp-server 4.2.11 → 4.5.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (40)

package/package.json +2 -1
package/server.js +152 -21
package/src/constants/config.js +5 -0
package/src/core/ActionExecutor.js +13 -1
package/src/core/ChangeTracker.js +8 -5
package/src/core/LLMsTxtAnalyzer.js +71 -47
package/src/core/LocalizationManager.js +7 -4
package/src/core/ResearchOrchestrator.js +10 -6
package/src/core/StealthBrowserManager.js +111 -40
package/src/core/analysis/ContentAnalyzer.js +2 -2
package/src/core/crawlers/BFSCrawler.js +23 -12
package/src/core/processing/ContentProcessor.js +19 -3
package/src/core/processing/PDFProcessor.js +72 -23
package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
package/src/tools/advanced/batchScrape/index.js +3 -1
package/src/tools/advanced/batchScrape/reporter.js +5 -1
package/src/tools/advanced/batchScrape/worker.js +6 -1
package/src/tools/basic/_fetch.js +78 -5
package/src/tools/basic/extractLinks.js +1 -1
package/src/tools/basic/extractMetadata.js +65 -1
package/src/tools/basic/extractText.js +61 -5
package/src/tools/basic/scrapeStructured.js +48 -10
package/src/tools/crawl/crawlDeep.js +13 -5
package/src/tools/crawl/mapSite.js +24 -51
package/src/tools/extract/analyzeContent.js +11 -6
package/src/tools/extract/extractContent.js +23 -5
package/src/tools/extract/extractStructured.js +65 -16
package/src/tools/extract/extractWithLlm.js +192 -11
package/src/tools/extract/listOllamaModels.js +19 -8
package/src/tools/extract/processDocument.js +10 -4
package/src/tools/extract/summarizeContent.js +58 -1
package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
package/src/tools/research/deepResearch.js +43 -4
package/src/tools/search/providers/searxng.js +2 -2
package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
package/src/tools/search/ranking/ResultRanker.js +13 -4
package/src/tools/search/searchWeb.js +5 -5
package/src/tools/templates/TemplateRegistry.js +3 -2
package/src/tools/tracking/trackChanges/differ.js +33 -1
package/src/utils/htmlToMarkdown.js +5 -1

package/src/core/StealthBrowserManager.js CHANGED Viewed

@@ -47,14 +47,17 @@ const StealthConfigSchema = z.object({
     spoofMediaDevices: z.boolean().default(true),
     spoofBatteryAPI: z.boolean().default(true)
   }).optional(),
   fingerprinting: z.object({
     canvasNoise: z.boolean().default(true),
     webglSpoofing: z.boolean().default(true),
     audioContextSpoofing: z.boolean().default(true),
     fontSpoofing: z.boolean().default(true),
     hardwareSpoofing: z.boolean().default(true)
-  }).optional()
+  }).optional(),
+  // C2: browser engine selection — 'chromium' (default) or 'camoufox' (Firefox-based)
+  engine: z.enum(['chromium', 'camoufox']).optional().default('chromium')
 });
 export class StealthBrowserManager {
@@ -232,16 +235,41 @@ export class StealthBrowserManager {
   }
   /**
-   * Launch stealth browser with anti-detection configurations
+   * Launch stealth browser with anti-detection configurations.
+   * C2: honours config.engine — 'chromium' (default) or 'camoufox' (Firefox-based).
    */
   async launchStealthBrowser(config = {}) {
+    const validatedConfig = StealthConfigSchema.parse({ ...this.defaultConfig, ...config });
+    // C2: if the requested engine differs from the running browser, tear it down first.
+    if (this.browser && this._launchedEngine && this._launchedEngine !== validatedConfig.engine) {
+      await this.browser.close().catch(() => {});
+      this.browser = null;
+    }
     if (this.browser) {
       return this.browser;
     }
-    const validatedConfig = StealthConfigSchema.parse({ ...this.defaultConfig, ...config });
-    // Base browser args for stealth
+    // C2: delegate to CamoufoxAdapter when engine === 'camoufox'
+    if (validatedConfig.engine === 'camoufox') {
+      const adapter = new CamoufoxAdapter();
+      const available = await adapter.isAvailable();
+      if (!available) {
+        throw new Error(
+          'camoufox is not installed. Run: npm install camoufox to use the Firefox-based stealth engine.'
+        );
+      }
+      this.browser = await adapter.launch({
+        headless: true,
+        launchOptions: {}
+      });
+      this._launchedEngine = 'camoufox';
+      return this.browser;
+    }
+    this._launchedEngine = 'chromium';
+    // Base browser args for stealth (Chromium path)
     const stealthArgs = [
       '--no-sandbox',
       '--disable-dev-shm-usage',
@@ -390,8 +418,11 @@ export class StealthBrowserManager {
    * Generate advanced browser fingerprint with enhanced randomization
    */
   generateAdvancedFingerprint(config = {}) {
+    // Select the OS once and thread it through UA, headers, and hardware so
+    // navigator.platform / sec-ch-ua-platform / userAgent stay consistent.
+    const selectedOS = this.selectOS(config);
     const fingerprint = {
-      userAgent: this.selectRealisticUserAgent(config),
+      userAgent: this.selectRealisticUserAgent(config, selectedOS),
       viewport: config.customViewport || this.selectWeightedViewport(),
       timezone: config.timezone || this.selectTimezone(),
       deviceScaleFactor: this.randomFloat(1, 2, 1),
@@ -400,13 +431,13 @@ export class StealthBrowserManager {
       colorScheme: Math.random() < 0.3 ? 'dark' : 'light',
       reducedMotion: Math.random() < 0.1 ? 'reduce' : 'no-preference',
       forcedColors: Math.random() < 0.05 ? 'active' : 'none',
-      headers: this.generateAdvancedHeaders(config),
+      headers: this.generateAdvancedHeaders(config, selectedOS),
       webRTC: this.generateWebRTCConfig(config),
       canvas: this.generateAdvancedCanvasFingerprint(),
       webGL: this.generateAdvancedWebGLFingerprint(),
       audioContext: this.generateAudioContextFingerprint(),
       mediaDevices: this.generateMediaDevicesFingerprint(),
-      hardware: this.generateHardwareFingerprint(),
+      hardware: this.generateHardwareFingerprint(selectedOS),
       fonts: this.generateAdvancedFontList(),
       plugins: this.generateAdvancedPluginList(),
       geolocation: this.generateRealisticGeolocation(),
@@ -417,10 +448,34 @@ export class StealthBrowserManager {
     return fingerprint;
   }
+  /**
+   * Choose a single OS ('windows' | 'macos' | 'linux') for a fingerprint.
+   * A custom UA pins the OS to whatever that UA reports; a non-random UA pins
+   * to windows (the default pool below); otherwise weighted-random.
+   */
+  selectOS(config = {}) {
+    if (config.customUserAgent) {
+      return this.inferOSFromUserAgent(config.customUserAgent);
+    }
+    if (!config.useRandomUserAgent) {
+      return 'windows';
+    }
+    return this.weightedRandom(this.osDistribution);
+  }
+  /**
+   * Infer the OS key from a user-agent string.
+   */
+  inferOSFromUserAgent(ua = '') {
+    if (/Macintosh|Mac OS X/i.test(ua)) return 'macos';
+    if (/Linux|X11|CrOS/i.test(ua)) return 'linux';
+    return 'windows';
+  }
   /**
    * Select realistic user agent based on market distribution
    */
-  selectRealisticUserAgent(config) {
+  selectRealisticUserAgent(config, selectedOS) {
     if (config.customUserAgent) {
       return config.customUserAgent;
     }
@@ -429,9 +484,10 @@ export class StealthBrowserManager {
       return this.userAgentPools.chrome.windows[0];
     }
-    // Select OS based on distribution
-    const selectedOS = this.weightedRandom(this.osDistribution);
+    // Use the OS chosen once for this fingerprint (falls back to a fresh draw
+    // if called without one, preserving the original standalone behavior).
+    selectedOS = selectedOS || this.weightedRandom(this.osDistribution);
     // Select browser based on distribution and OS compatibility
     let availableBrowsers = { ...this.browserDistribution };
     if (selectedOS === 'linux' && availableBrowsers.safari) {
@@ -469,7 +525,10 @@ export class StealthBrowserManager {
   /**
    * Generate advanced HTTP headers with realistic patterns
    */
-  generateAdvancedHeaders(config) {
+  generateAdvancedHeaders(config, selectedOS) {
+    // Resolve the UA first so sec-ch-ua brand version can match.
+    const resolvedUA = this.selectRealisticUserAgent(config, selectedOS);
     const headers = {
       'Accept-Language': `${(config.locale || 'en-US').toLowerCase()},en;q=0.9`,
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -481,11 +540,11 @@ export class StealthBrowserManager {
       'Sec-Fetch-Site': 'none',
       'Sec-Fetch-User': '?1',
       'sec-ch-ua-mobile': '?0',
-      'sec-ch-ua-platform': this.generateSecChUaPlatform()
+      'sec-ch-ua-platform': this.generateSecChUaPlatform(selectedOS)
     };
-    // Add sec-ch-ua header
-    headers['sec-ch-ua'] = this.generateSecChUaHeader();
+    // C2: pass UA so sec-ch-ua brand version matches the Chrome major version.
+    headers['sec-ch-ua'] = this.generateSecChUaHeader(resolvedUA);
     // Randomize some headers
     if (Math.random() < 0.25) {
@@ -505,15 +564,23 @@ export class StealthBrowserManager {
   }
   /**
-   * Generate sec-ch-ua header
+   * Generate sec-ch-ua header.
+   * C2: brand versions are derived from the UA's Chrome major version so
+   * sec-ch-ua and the User-Agent header stay consistent.
+   * @param {string} [userAgent] — the selected user agent string
    */
-  generateSecChUaHeader() {
+  generateSecChUaHeader(userAgent = '') {
+    // Extract Chrome major version from the UA (e.g. "Chrome/121.0.0.0" → "121").
+    // Fall back to 121 if the UA is not a Chrome UA.
+    const match = userAgent.match(/Chrome\/(\d+)/i);
+    const version = match ? match[1] : '121';
     const brands = [
       { brand: 'Not_A Brand', version: '8' },
-      { brand: 'Chromium', version: '120' },
-      { brand: 'Google Chrome', version: '120' }
+      { brand: 'Chromium', version },
+      { brand: 'Google Chrome', version }
     ];
     return brands
       .map(b => `"${b.brand}";v="${b.version}"`)
       .join(', ');
@@ -522,14 +589,14 @@ export class StealthBrowserManager {
   /**
    * Generate sec-ch-ua-platform header
    */
-  generateSecChUaPlatform() {
+  generateSecChUaPlatform(selectedOS) {
     const platforms = {
       windows: '"Windows"',
       macos: '"macOS"',
       linux: '"Linux"'
     };
-    const selectedOS = this.weightedRandom(this.osDistribution);
+    selectedOS = selectedOS || this.weightedRandom(this.osDistribution);
     return platforms[selectedOS] || '"Windows"';
   }
@@ -746,7 +813,9 @@ export class StealthBrowserManager {
   /**
    * Generate realistic hardware fingerprint
    */
-  generateHardwareFingerprint() {
+  generateHardwareFingerprint(selectedOS) {
+    selectedOS = selectedOS || this.weightedRandom(this.osDistribution);
     const processors = [
       { cores: 4, threads: 8, name: 'Intel(R) Core(TM) i5-8250U CPU @ 1.60GHz' },
       { cores: 6, threads: 12, name: 'Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz' },
@@ -755,31 +824,33 @@ export class StealthBrowserManager {
       { cores: 6, threads: 6, name: 'AMD Ryzen 5 3600 6-Core Processor' },
       { cores: 8, threads: 16, name: 'AMD Ryzen 7 3700X 8-Core Processor' }
     ];
     const selectedProcessor = processors[Math.floor(Math.random() * processors.length)];
     return {
       hardwareConcurrency: selectedProcessor.threads,
       processor: selectedProcessor.name,
-      architecture: Math.random() < 0.9 ? 'x86_64' : 'arm64',
+      architecture: 'x86_64',
       memory: Math.floor(Math.random() * 24) + 8, // 8-32 GB
       deviceMemory: Math.pow(2, Math.floor(Math.random() * 3) + 3), // 8, 16, or 32 GB
-      platform: this.selectRealisticPlatform()
+      platform: this.selectRealisticPlatform(selectedOS)
     };
   }
   /**
-   * Select realistic platform based on distribution
+   * Map the chosen OS to its navigator.platform value so it stays consistent
+   * with the user-agent and sec-ch-ua-platform header.
    */
-  selectRealisticPlatform() {
-    const platforms = {
-      'Win32': 0.75,
-      'MacIntel': 0.15,
-      'Linux x86_64': 0.08,
-      'Linux armv7l': 0.02
-    };
-    return this.weightedRandom(platforms);
+  selectRealisticPlatform(selectedOS) {
+    switch (selectedOS) {
+      case 'macos':
+        return 'MacIntel';
+      case 'linux':
+        return 'Linux x86_64';
+      case 'windows':
+      default:
+        return 'Win32';
+    }
   }
   /**

package/src/core/analysis/ContentAnalyzer.js CHANGED Viewed

@@ -4,7 +4,7 @@
  */
 import { SummarizerManager } from 'node-summarizer';
-import { franc } from 'franc';
+import { franc, francAll } from 'franc';
 import nlp from 'compromise';
 import { z } from 'zod';
 import { splitSentences } from './sentenceUtils.js';
@@ -316,7 +316,7 @@ export class ContentAnalyzer {
       const confidence = Math.min(1, 0.5 + (text.length / 500) * 0.5);
       // Get alternative languages using franc.all
-      const alternatives = franc.all(text, {
+      const alternatives = francAll(text, {
         minLength: 10,
         whitelist: Object.keys(LANGUAGE_NAMES)
       })

package/src/core/crawlers/BFSCrawler.js CHANGED Viewed

@@ -6,6 +6,9 @@ import { RobotsChecker } from '../../utils/robotsChecker.js';
 import { DomainFilter } from '../../utils/domainFilter.js';
 import { LinkAnalyzer } from '../analysis/LinkAnalyzer.js';
 import { normalizeUrl, extractLinks, isValidUrl } from '../../utils/urlNormalizer.js';
+import { Logger } from '../../utils/Logger.js';
+const logger = new Logger('BFSCrawler');
 export class BFSCrawler {
   constructor(options = {}) {
@@ -43,7 +46,10 @@ export class BFSCrawler {
     this.queue = new QueueManager({ concurrency, timeout });
     this.cache = new CacheManager({ ttl: 3600000 }); // 1 hour cache
+    // C1: per-domain rate-limiter map — reuse existing limiter when
+    // effectiveRateLimit hasn't changed, rather than recreating it on every URL.
     this.rateLimiter = new RateLimiter({ requestsPerSecond: 10 });
+    this._domainRateLimiters = new Map();
     this.robotsChecker = respectRobots ? new RobotsChecker(userAgent) : null;
     // Initialize domain filter (create new if not provided)
@@ -142,13 +148,13 @@ export class BFSCrawler {
     });
     if (!filterDecision.allowed) {
-      console.error(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
+      logger.debug(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
       return;
     }
     // Backward compatibility: also check legacy patterns
     if (!this.shouldCrawlUrl(normalizedUrl)) {
-      console.error(`Legacy pattern blocks: ${normalizedUrl}`);
+      logger.debug(`Legacy pattern blocks: ${normalizedUrl}`);
       return;
     }
@@ -156,7 +162,7 @@ export class BFSCrawler {
     if (this.respectRobots && this.robotsChecker) {
       const canFetch = await this.robotsChecker.canFetch(normalizedUrl);
       if (!canFetch) {
-        console.error(`Robots.txt blocks: ${normalizedUrl}`);
+        logger.debug(`Robots.txt blocks: ${normalizedUrl}`);
         return;
       }
     }
@@ -171,17 +177,22 @@ export class BFSCrawler {
       if (!pageData) {
         // Apply domain-specific rate limiting
+        // C1: reuse per-domain limiter from the map to avoid recreating on each URL.
         const urlObj = new URL(normalizedUrl);
-        const domainRules = this.domainFilter.getDomainRules(urlObj.hostname);
-        // Use domain-specific rate limit if available
+        const domain = urlObj.hostname;
+        const domainRules = this.domainFilter.getDomainRules(domain);
         const effectiveRateLimit = domainRules.rateLimit || 10;
-        if (this.rateLimiter.requestsPerSecond !== effectiveRateLimit) {
-          // Update rate limiter for this domain
-          this.rateLimiter = new RateLimiter({ requestsPerSecond: effectiveRateLimit });
+        if (!this._domainRateLimiters.has(domain)) {
+          this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
+        } else {
+          const existing = this._domainRateLimiters.get(domain);
+          if (existing.requestsPerSecond !== effectiveRateLimit) {
+            this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
+          }
         }
-        await this.rateLimiter.checkLimit(normalizedUrl);
+        await this._domainRateLimiters.get(domain).checkLimit(normalizedUrl);
         // Fetch the page
         pageData = await this.fetchPage(normalizedUrl);

package/src/core/processing/ContentProcessor.js CHANGED Viewed

@@ -401,9 +401,10 @@ export class ContentProcessor {
     const avgWordsPerSentence = words.length / sentences.length;
     const avgCharsPerWord = charactersNoSpaces / words.length;
-    // Simple readability score (lower is better)
-    const readabilityScore = (avgWordsPerSentence * 1.015) + (avgCharsPerWord * 84.6) - 206.835;
+    const avgSyllablesPerWord = words.reduce((sum, w) => sum + this._countSyllables(w), 0) / words.length;
+    // Flesch Reading-Ease: higher score = easier to read
+    const readabilityScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);
     return {
       sentences: sentences.length,
@@ -412,6 +413,7 @@ export class ContentProcessor {
       charactersNoSpaces,
       avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
       avgCharsPerWord: Math.round(avgCharsPerWord * 100) / 100,
+      avgSyllablesPerWord: Math.round(avgSyllablesPerWord * 100) / 100,
       readabilityScore: Math.round(readabilityScore * 100) / 100,
       readabilityLevel: this.getReadabilityLevel(readabilityScore)
     };
@@ -432,6 +434,20 @@ export class ContentProcessor {
     return 'Very Difficult';
   }
+  /**
+   * Count syllables in a word (heuristic)
+   * @param {string} word
+   * @returns {number}
+   */
+  _countSyllables(word) {
+    const w = word.toLowerCase().replace(/[^a-z]/g, '');
+    if (w.length <= 3) return 1;
+    // Remove trailing silent e
+    const stripped = w.replace(/e$/, '');
+    const matches = stripped.match(/[aeiouy]+/g);
+    return Math.max(1, matches ? matches.length : 1);
+  }
   /**
    * Extract fallback content when Readability fails
    * @param {string} html - HTML content

package/src/core/processing/PDFProcessor.js CHANGED Viewed

@@ -16,6 +16,12 @@ const PDFProcessorSchema = z.object({
     extractText: z.boolean().default(true),
     password: z.string().optional(),
     maxPages: z.number().min(1).max(1000).default(100),
+    // C3: true page-range extraction (1-based, inclusive). When set, only the
+    // text from pages [start..end] is returned.
+    pageRange: z.object({
+      start: z.number().min(1).default(1),
+      end: z.number().min(1).optional()
+    }).optional(),
     parseOptions: z.object({
       normalizeWhitespace: z.boolean().default(true),
       disableCombineTextItems: z.boolean().default(false)
@@ -95,12 +101,29 @@ export class PDFProcessor {
         return result;
       }
+      // C3: when a page range is requested, capture per-page text so we can
+      // return exactly pages [start..end] (pdf-parse otherwise concatenates the
+      // whole document and its `max` option only caps the *upper* page bound).
+      const pageRange = processingOptions.pageRange;
+      const capturedPages = [];
       // Parse PDF with options
       const parseOptions = {
         ...processingOptions.parseOptions,
         max: processingOptions.maxPages
       };
+      // If extracting a range, raise `max` to at least the requested end page
+      // and install a pagerender that records each page's text.
+      if (pageRange) {
+        if (pageRange.end) {
+          parseOptions.max = Math.max(parseOptions.max, pageRange.end);
+        } else {
+          parseOptions.max = processingOptions.maxPages;
+        }
+        parseOptions.pagerender = (pageData) => this._renderPage(pageData, capturedPages);
+      }
       if (processingOptions.password) {
         parseOptions.password = processingOptions.password;
       }
@@ -118,7 +141,15 @@ export class PDFProcessor {
       // Extract text content
       if (processingOptions.extractText) {
-        result.text = this.cleanPDFText(pdfData.text);
+        if (pageRange) {
+          const start = pageRange.start || 1;
+          const end = pageRange.end || capturedPages.length;
+          const slice = capturedPages.slice(start - 1, end);
+          result.text = this.cleanPDFText(slice.join('\n\n'));
+          result.extractedPages = { start, end, count: slice.length };
+        } else {
+          result.text = this.cleanPDFText(pdfData.text);
+        }
       }
       // Extract metadata
@@ -414,34 +445,52 @@ export class PDFProcessor {
   }
   /**
-   * Extract specific pages from PDF
-   * @param {Object} params - Processing parameters with page range
-   * @returns {Promise<Object>} - Processing result for specified pages
+   * Render a single PDF page to text and record it.
+   * Mirrors pdf-parse's default render (newline on Y-position change) but
+   * accumulates per-page text so callers can slice a true page range.
+   * Note: like pdf-parse, this does not reconstruct multi-column / table
+   * layout — column order follows the PDF's text-item stream.
+   * @param {Object} pageData - pdf.js page proxy from pdf-parse
+   * @param {string[]} sink - array that receives this page's text
+   * @returns {Promise<string>}
+   */
+  async _renderPage(pageData, sink) {
+    const textContent = await pageData.getTextContent({
+      normalizeWhitespace: true,
+      disableCombineTextItems: false
+    });
+    let lastY;
+    let text = '';
+    for (const item of textContent.items) {
+      if (lastY === item.transform[5] || lastY === undefined) {
+        text += item.str;
+      } else {
+        text += '\n' + item.str;
+      }
+      lastY = item.transform[5];
+    }
+    sink.push(text);
+    // pdf-parse joins page renders with '\n\n' for pdfData.text
+    return text;
+  }
+  /**
+   * Extract a specific page range from a PDF (1-based, inclusive).
+   * @param {Object} params - Processing parameters
+   * @param {number} [params.startPage=1] - First page to include
+   * @param {number} [params.endPage] - Last page to include (defaults to end)
+   * @returns {Promise<Object>} - Processing result for the requested pages
    */
   async extractPDFPages(params) {
     const { startPage = 1, endPage, ...processingParams } = params;
-    // Override parse options to limit page range
-    const options = {
-      ...processingParams.options,
-      parseOptions: {
-        ...processingParams.options?.parseOptions,
-        max: endPage || processingParams.options?.maxPages || 100
-      }
-    };
-    const result = await this.processPDF({
+    return this.processPDF({
       ...processingParams,
-      options
+      options: {
+        ...processingParams.options,
+        pageRange: { start: startPage, ...(endPage ? { end: endPage } : {}) }
+      }
     });
-    if (result.success && result.text && startPage > 1) {
-      // This is a simplified approach - pdf-parse doesn't provide per-page text
-      // For proper page-by-page extraction, consider using pdf2pic or pdf-poppler
-      console.warn('Page-specific extraction is limited with current PDF parser');
-    }
-    return result;
   }
 }

package/src/tools/advanced/ScrapeWithActionsTool.js CHANGED Viewed

@@ -121,8 +121,16 @@ const ScrapeWithActionsSchema = z.object({
   captureIntermediateStates: z.boolean().default(false),
   captureScreenshots: z.boolean().default(true),
-  // Form auto-fill
-  formAutoFill: z.record(z.string()).optional(),
+  // Form auto-fill — structured shape ({fields:[{selector,value,...}], submitSelector, waitAfterSubmit}).
+  // A flat z.record(string) of selector→value is still accepted for backward compatibility.
+  formAutoFill: z.union([
+    z.object({
+      fields: z.array(FormFieldSchema),
+      submitSelector: z.string().optional(),
+      waitAfterSubmit: z.number().min(0).max(30000).default(2000)
+    }),
+    z.record(z.string())
+  ]).optional(),
   // Browser options
   browserOptions: z.object({
@@ -386,8 +394,9 @@ export class ScrapeWithActionsTool extends EventEmitter {
     const intermediateStates = params.captureIntermediateStates ?
       await this.extractIntermediateStates(actionResults, params) : [];
-    // Get final page content after all actions
-    const finalContent = await this.extractFinalContent(params);
+    // Get final page content after all actions (reads the post-action live page
+    // captured by ActionExecutor, falling back to a fresh fetch only if missing).
+    const finalContent = await this.extractFinalContent(params, chainResult);
     // Generate different formats
     const content = this.generateFormats(finalContent, params.formats, {
@@ -446,21 +455,37 @@ export class ScrapeWithActionsTool extends EventEmitter {
   insertFormAutoFillActions(actions, formAutoFill) {
     const fillActions = [];
-    // Convert object with key-value pairs to fill actions
-    for (const [selector, value] of Object.entries(formAutoFill)) {
-      if (selector === 'submitSelector' || selector === 'waitAfterSubmit') {
-        continue; // Skip special keys
+    if (Array.isArray(formAutoFill.fields)) {
+      // Structured shape: { fields: [{selector, value, type, waitAfter}], submitSelector, waitAfterSubmit }
+      for (const field of formAutoFill.fields) {
+        fillActions.push({
+          type: 'type',
+          selector: field.selector,
+          text: field.value,
+          description: `Auto-fill field: ${field.selector}`,
+          continueOnError: true,
+          retries: 1
+        });
+        if (field.waitAfter) {
+          fillActions.push({ type: 'wait', duration: field.waitAfter });
+        }
+      }
+    } else {
+      // Backward-compatible flat shape: { selector: value, ... }
+      for (const [selector, value] of Object.entries(formAutoFill)) {
+        if (selector === 'submitSelector' || selector === 'waitAfterSubmit' || selector === 'fields') {
+          continue; // Skip special keys
+        }
+        fillActions.push({
+          type: 'type',
+          selector,
+          text: value,
+          description: `Auto-fill field: ${selector}`,
+          continueOnError: true,
+          retries: 1
+        });
       }
-      fillActions.push({
-        type: 'type',
-        selector,
-        text: value,
-        description: `Auto-fill field: ${selector}`,
-        continueOnError: true,
-        retries: 1
-      });
     }
     // Add submit action if specified
@@ -585,16 +610,29 @@ export class ScrapeWithActionsTool extends EventEmitter {
     return states;
   }
-  async extractFinalContent(params) {
+  async extractFinalContent(params, chainResult = null) {
     try {
+      const options = {
+        includeMetadata: params.extractionOptions?.includeMetadata !== false,
+        includeLinks: params.extractionOptions?.includeLinks !== false,
+        includeImages: params.extractionOptions?.includeImages !== false,
+        customSelectors: params.extractionOptions?.selectors
+      };
+      // Prefer the post-action live page HTML captured during action execution.
+      // This ensures the final content reflects clicks/typing/navigation rather
+      // than re-fetching the original (pre-action) URL.
+      if (chainResult?.finalHtml) {
+        return await this.extractContentTool.execute({
+          url: chainResult.finalUrl || params.url,
+          html: chainResult.finalHtml,
+          options
+        });
+      }
       const extractResult = await this.extractContentTool.execute({
         url: params.url,
-        options: {
-          includeMetadata: params.extractionOptions?.includeMetadata !== false,
-          includeLinks: params.extractionOptions?.includeLinks !== false,
-          includeImages: params.extractionOptions?.includeImages !== false,
-          customSelectors: params.extractionOptions?.selectors
-        }
+        options
       });
       return extractResult;