npm - crawlforge-mcp-server - Versions diffs - 4.2.12 → 4.5.0 - Mend

crawlforge-mcp-server 4.2.12 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

package/package.json +2 -1
package/server.js +138 -20
package/src/constants/config.js +5 -0
package/src/core/ActionExecutor.js +13 -1
package/src/core/ChangeTracker.js +8 -5
package/src/core/LLMsTxtAnalyzer.js +71 -47
package/src/core/LocalizationManager.js +7 -4
package/src/core/ResearchOrchestrator.js +10 -6
package/src/core/StealthBrowserManager.js +52 -13
package/src/core/analysis/ContentAnalyzer.js +2 -2
package/src/core/crawlers/BFSCrawler.js +23 -12
package/src/core/processing/ContentProcessor.js +19 -3
package/src/core/processing/PDFProcessor.js +72 -23
package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
package/src/tools/advanced/batchScrape/index.js +3 -1
package/src/tools/advanced/batchScrape/reporter.js +5 -1
package/src/tools/advanced/batchScrape/worker.js +6 -1
package/src/tools/basic/_fetch.js +78 -5
package/src/tools/basic/extractLinks.js +1 -1
package/src/tools/basic/extractMetadata.js +65 -1
package/src/tools/basic/extractText.js +61 -5
package/src/tools/basic/scrapeStructured.js +48 -10
package/src/tools/crawl/crawlDeep.js +13 -5
package/src/tools/crawl/mapSite.js +24 -51
package/src/tools/extract/analyzeContent.js +11 -6
package/src/tools/extract/extractContent.js +23 -5
package/src/tools/extract/extractStructured.js +65 -16
package/src/tools/extract/extractWithLlm.js +192 -11
package/src/tools/extract/listOllamaModels.js +19 -8
package/src/tools/extract/processDocument.js +10 -4
package/src/tools/extract/summarizeContent.js +58 -1
package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
package/src/tools/research/deepResearch.js +43 -4
package/src/tools/search/providers/searxng.js +2 -2
package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
package/src/tools/search/ranking/ResultRanker.js +13 -4
package/src/tools/search/searchWeb.js +5 -5
package/src/tools/templates/TemplateRegistry.js +3 -2
package/src/tools/tracking/trackChanges/differ.js +33 -1
package/src/utils/htmlToMarkdown.js +5 -1

package/src/core/StealthBrowserManager.js CHANGED Viewed

@@ -47,14 +47,17 @@ const StealthConfigSchema = z.object({
     spoofMediaDevices: z.boolean().default(true),
     spoofBatteryAPI: z.boolean().default(true)
   }).optional(),
   fingerprinting: z.object({
     canvasNoise: z.boolean().default(true),
     webglSpoofing: z.boolean().default(true),
     audioContextSpoofing: z.boolean().default(true),
     fontSpoofing: z.boolean().default(true),
     hardwareSpoofing: z.boolean().default(true)
-  }).optional()
+  }).optional(),
+  // C2: browser engine selection — 'chromium' (default) or 'camoufox' (Firefox-based)
+  engine: z.enum(['chromium', 'camoufox']).optional().default('chromium')
 });
 export class StealthBrowserManager {
@@ -232,16 +235,41 @@ export class StealthBrowserManager {
   }
   /**
-   * Launch stealth browser with anti-detection configurations
+   * Launch stealth browser with anti-detection configurations.
+   * C2: honours config.engine — 'chromium' (default) or 'camoufox' (Firefox-based).
    */
   async launchStealthBrowser(config = {}) {
+    const validatedConfig = StealthConfigSchema.parse({ ...this.defaultConfig, ...config });
+    // C2: if the requested engine differs from the running browser, tear it down first.
+    if (this.browser && this._launchedEngine && this._launchedEngine !== validatedConfig.engine) {
+      await this.browser.close().catch(() => {});
+      this.browser = null;
+    }
     if (this.browser) {
       return this.browser;
     }
-    const validatedConfig = StealthConfigSchema.parse({ ...this.defaultConfig, ...config });
-    // Base browser args for stealth
+    // C2: delegate to CamoufoxAdapter when engine === 'camoufox'
+    if (validatedConfig.engine === 'camoufox') {
+      const adapter = new CamoufoxAdapter();
+      const available = await adapter.isAvailable();
+      if (!available) {
+        throw new Error(
+          'camoufox is not installed. Run: npm install camoufox to use the Firefox-based stealth engine.'
+        );
+      }
+      this.browser = await adapter.launch({
+        headless: true,
+        launchOptions: {}
+      });
+      this._launchedEngine = 'camoufox';
+      return this.browser;
+    }
+    this._launchedEngine = 'chromium';
+    // Base browser args for stealth (Chromium path)
     const stealthArgs = [
       '--no-sandbox',
       '--disable-dev-shm-usage',
@@ -498,6 +526,9 @@ export class StealthBrowserManager {
    * Generate advanced HTTP headers with realistic patterns
    */
   generateAdvancedHeaders(config, selectedOS) {
+    // Resolve the UA first so sec-ch-ua brand version can match.
+    const resolvedUA = this.selectRealisticUserAgent(config, selectedOS);
     const headers = {
       'Accept-Language': `${(config.locale || 'en-US').toLowerCase()},en;q=0.9`,
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -512,8 +543,8 @@ export class StealthBrowserManager {
       'sec-ch-ua-platform': this.generateSecChUaPlatform(selectedOS)
     };
-    // Add sec-ch-ua header
-    headers['sec-ch-ua'] = this.generateSecChUaHeader();
+    // C2: pass UA so sec-ch-ua brand version matches the Chrome major version.
+    headers['sec-ch-ua'] = this.generateSecChUaHeader(resolvedUA);
     // Randomize some headers
     if (Math.random() < 0.25) {
@@ -533,15 +564,23 @@ export class StealthBrowserManager {
   }
   /**
-   * Generate sec-ch-ua header
+   * Generate sec-ch-ua header.
+   * C2: brand versions are derived from the UA's Chrome major version so
+   * sec-ch-ua and the User-Agent header stay consistent.
+   * @param {string} [userAgent] — the selected user agent string
    */
-  generateSecChUaHeader() {
+  generateSecChUaHeader(userAgent = '') {
+    // Extract Chrome major version from the UA (e.g. "Chrome/121.0.0.0" → "121").
+    // Fall back to 121 if the UA is not a Chrome UA.
+    const match = userAgent.match(/Chrome\/(\d+)/i);
+    const version = match ? match[1] : '121';
     const brands = [
       { brand: 'Not_A Brand', version: '8' },
-      { brand: 'Chromium', version: '120' },
-      { brand: 'Google Chrome', version: '120' }
+      { brand: 'Chromium', version },
+      { brand: 'Google Chrome', version }
     ];
     return brands
       .map(b => `"${b.brand}";v="${b.version}"`)
       .join(', ');

package/src/core/analysis/ContentAnalyzer.js CHANGED Viewed

@@ -4,7 +4,7 @@
  */
 import { SummarizerManager } from 'node-summarizer';
-import { franc } from 'franc';
+import { franc, francAll } from 'franc';
 import nlp from 'compromise';
 import { z } from 'zod';
 import { splitSentences } from './sentenceUtils.js';
@@ -316,7 +316,7 @@ export class ContentAnalyzer {
       const confidence = Math.min(1, 0.5 + (text.length / 500) * 0.5);
       // Get alternative languages using franc.all
-      const alternatives = franc.all(text, {
+      const alternatives = francAll(text, {
         minLength: 10,
         whitelist: Object.keys(LANGUAGE_NAMES)
       })

package/src/core/crawlers/BFSCrawler.js CHANGED Viewed

@@ -6,6 +6,9 @@ import { RobotsChecker } from '../../utils/robotsChecker.js';
 import { DomainFilter } from '../../utils/domainFilter.js';
 import { LinkAnalyzer } from '../analysis/LinkAnalyzer.js';
 import { normalizeUrl, extractLinks, isValidUrl } from '../../utils/urlNormalizer.js';
+import { Logger } from '../../utils/Logger.js';
+const logger = new Logger('BFSCrawler');
 export class BFSCrawler {
   constructor(options = {}) {
@@ -43,7 +46,10 @@ export class BFSCrawler {
     this.queue = new QueueManager({ concurrency, timeout });
     this.cache = new CacheManager({ ttl: 3600000 }); // 1 hour cache
+    // C1: per-domain rate-limiter map — reuse existing limiter when
+    // effectiveRateLimit hasn't changed, rather than recreating it on every URL.
     this.rateLimiter = new RateLimiter({ requestsPerSecond: 10 });
+    this._domainRateLimiters = new Map();
     this.robotsChecker = respectRobots ? new RobotsChecker(userAgent) : null;
     // Initialize domain filter (create new if not provided)
@@ -142,13 +148,13 @@ export class BFSCrawler {
     });
     if (!filterDecision.allowed) {
-      console.error(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
+      logger.debug(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
       return;
     }
     // Backward compatibility: also check legacy patterns
     if (!this.shouldCrawlUrl(normalizedUrl)) {
-      console.error(`Legacy pattern blocks: ${normalizedUrl}`);
+      logger.debug(`Legacy pattern blocks: ${normalizedUrl}`);
       return;
     }
@@ -156,7 +162,7 @@ export class BFSCrawler {
     if (this.respectRobots && this.robotsChecker) {
       const canFetch = await this.robotsChecker.canFetch(normalizedUrl);
       if (!canFetch) {
-        console.error(`Robots.txt blocks: ${normalizedUrl}`);
+        logger.debug(`Robots.txt blocks: ${normalizedUrl}`);
         return;
       }
     }
@@ -171,17 +177,22 @@ export class BFSCrawler {
       if (!pageData) {
         // Apply domain-specific rate limiting
+        // C1: reuse per-domain limiter from the map to avoid recreating on each URL.
         const urlObj = new URL(normalizedUrl);
-        const domainRules = this.domainFilter.getDomainRules(urlObj.hostname);
-        // Use domain-specific rate limit if available
+        const domain = urlObj.hostname;
+        const domainRules = this.domainFilter.getDomainRules(domain);
         const effectiveRateLimit = domainRules.rateLimit || 10;
-        if (this.rateLimiter.requestsPerSecond !== effectiveRateLimit) {
-          // Update rate limiter for this domain
-          this.rateLimiter = new RateLimiter({ requestsPerSecond: effectiveRateLimit });
+        if (!this._domainRateLimiters.has(domain)) {
+          this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
+        } else {
+          const existing = this._domainRateLimiters.get(domain);
+          if (existing.requestsPerSecond !== effectiveRateLimit) {
+            this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
+          }
         }
-        await this.rateLimiter.checkLimit(normalizedUrl);
+        await this._domainRateLimiters.get(domain).checkLimit(normalizedUrl);
         // Fetch the page
         pageData = await this.fetchPage(normalizedUrl);

package/src/core/processing/ContentProcessor.js CHANGED Viewed

@@ -401,9 +401,10 @@ export class ContentProcessor {
     const avgWordsPerSentence = words.length / sentences.length;
     const avgCharsPerWord = charactersNoSpaces / words.length;
-    // Simple readability score (lower is better)
-    const readabilityScore = (avgWordsPerSentence * 1.015) + (avgCharsPerWord * 84.6) - 206.835;
+    const avgSyllablesPerWord = words.reduce((sum, w) => sum + this._countSyllables(w), 0) / words.length;
+    // Flesch Reading-Ease: higher score = easier to read
+    const readabilityScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);
     return {
       sentences: sentences.length,
@@ -412,6 +413,7 @@ export class ContentProcessor {
       charactersNoSpaces,
       avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
       avgCharsPerWord: Math.round(avgCharsPerWord * 100) / 100,
+      avgSyllablesPerWord: Math.round(avgSyllablesPerWord * 100) / 100,
       readabilityScore: Math.round(readabilityScore * 100) / 100,
       readabilityLevel: this.getReadabilityLevel(readabilityScore)
     };
@@ -432,6 +434,20 @@ export class ContentProcessor {
     return 'Very Difficult';
   }
+  /**
+   * Estimate the number of syllables in a word (heuristic).
+   *
+   * The word is lowercased and every character outside a-z is removed.
+   * Anything of three letters or fewer counts as one syllable. Otherwise each
+   * run of consecutive vowels (a, e, i, o, u, y) counts as one syllable after
+   * a silent trailing "e" is dropped. A trailing consonant + "le" ("table",
+   * "apple") is syllabic, so its "e" is kept.
+   * @param {string} word
+   * @returns {number} Estimated syllable count, always at least 1
+   */
+  _countSyllables(word) {
+    const w = word.toLowerCase().replace(/[^a-z]/g, '');
+    if (w.length <= 3) return 1;
+    // Drop a silent trailing "e" ("make" -> "mak"), but keep it in consonant + "le"
+    // endings, where it carries a syllable of its own ("ta-ble").
+    const stripped = /[^aeiouy]le$/.test(w) ? w : w.replace(/e$/, '');
+    const matches = stripped.match(/[aeiouy]+/g);
+    return Math.max(1, matches ? matches.length : 1);
+  }
   /**
    * Extract fallback content when Readability fails
    * @param {string} html - HTML content

package/src/core/processing/PDFProcessor.js CHANGED Viewed

@@ -16,6 +16,12 @@ const PDFProcessorSchema = z.object({
     extractText: z.boolean().default(true),
     password: z.string().optional(),
     maxPages: z.number().min(1).max(1000).default(100),
+    // C3: true page-range extraction (1-based, inclusive). When set, only the
+    // text from pages [start..end] is returned.
+    pageRange: z.object({
+      start: z.number().min(1).default(1),
+      end: z.number().min(1).optional()
+    }).optional(),
     parseOptions: z.object({
       normalizeWhitespace: z.boolean().default(true),
       disableCombineTextItems: z.boolean().default(false)
@@ -95,12 +101,29 @@ export class PDFProcessor {
         return result;
       }
+      // C3: when a page range is requested, capture per-page text so we can
+      // return exactly pages [start..end] (pdf-parse otherwise concatenates the
+      // whole document and its `max` option only caps the *upper* page bound).
+      const pageRange = processingOptions.pageRange;
+      const capturedPages = [];
       // Parse PDF with options
       const parseOptions = {
         ...processingOptions.parseOptions,
         max: processingOptions.maxPages
       };
+      // If extracting a range, raise `max` to at least the requested end page
+      // and install a pagerender that records each page's text.
+      if (pageRange) {
+        if (pageRange.end) {
+          parseOptions.max = Math.max(parseOptions.max, pageRange.end);
+        } else {
+          parseOptions.max = processingOptions.maxPages;
+        }
+        parseOptions.pagerender = (pageData) => this._renderPage(pageData, capturedPages);
+      }
       if (processingOptions.password) {
         parseOptions.password = processingOptions.password;
       }
@@ -118,7 +141,15 @@ export class PDFProcessor {
       // Extract text content
       if (processingOptions.extractText) {
-        result.text = this.cleanPDFText(pdfData.text);
+        if (pageRange) {
+          const start = pageRange.start || 1;
+          const end = pageRange.end || capturedPages.length;
+          const slice = capturedPages.slice(start - 1, end);
+          result.text = this.cleanPDFText(slice.join('\n\n'));
+          result.extractedPages = { start, end, count: slice.length };
+        } else {
+          result.text = this.cleanPDFText(pdfData.text);
+        }
       }
       // Extract metadata
@@ -414,34 +445,52 @@ export class PDFProcessor {
   }
   /**
-   * Extract specific pages from PDF
-   * @param {Object} params - Processing parameters with page range
-   * @returns {Promise<Object>} - Processing result for specified pages
+   * Render a single PDF page to text and record it.
+   * Mirrors pdf-parse's default render (newline on Y-position change) but
+   * accumulates per-page text so callers can slice a true page range.
+   * Note: like pdf-parse, this does not reconstruct multi-column / table
+   * layout — column order follows the PDF's text-item stream.
+   * @param {Object} pageData - pdf.js page proxy from pdf-parse
+   * @param {string[]} sink - array that receives this page's text
+   * @returns {Promise<string>}
+   */
+  async _renderPage(pageData, sink) {
+    // Pull this page's text items with the same fixed options on every call.
+    const { items } = await pageData.getTextContent({
+      normalizeWhitespace: true,
+      disableCombineTextItems: false
+    });
+    // transform[5] of the previous item; stays undefined until the first item.
+    let previousY;
+    let pageText = '';
+    for (const { str, transform } of items) {
+      const currentY = transform[5];
+      // A changed transform[5] value starts a new line; the very first item never does.
+      const sameLine = previousY === undefined || previousY === currentY;
+      pageText += sameLine ? str : `\n${str}`;
+      previousY = currentY;
+    }
+    // Record the page for the caller's range slicing and also hand the text
+    // back to the parser that invoked this renderer.
+    sink.push(pageText);
+    return pageText;
+  }
+  /**
+   * Extract a specific page range from a PDF (1-based, inclusive).
+   * @param {Object} params - Processing parameters
+   * @param {number} [params.startPage=1] - First page to include
+   * @param {number} [params.endPage] - Last page to include (defaults to end)
+   * @returns {Promise<Object>} - Processing result for the requested pages
    */
   async extractPDFPages(params) {
     const { startPage = 1, endPage, ...processingParams } = params;
-    // Override parse options to limit page range
-    const options = {
-      ...processingParams.options,
-      parseOptions: {
-        ...processingParams.options?.parseOptions,
-        max: endPage || processingParams.options?.maxPages || 100
-      }
-    };
-    const result = await this.processPDF({
+    return this.processPDF({
       ...processingParams,
-      options
+      options: {
+        ...processingParams.options,
+        pageRange: { start: startPage, ...(endPage ? { end: endPage } : {}) }
+      }
     });
-    if (result.success && result.text && startPage > 1) {
-      // This is a simplified approach - pdf-parse doesn't provide per-page text
-      // For proper page-by-page extraction, consider using pdf2pic or pdf-poppler
-      console.warn('Page-specific extraction is limited with current PDF parser');
-    }
-    return result;
   }
 }

package/src/tools/advanced/ScrapeWithActionsTool.js CHANGED Viewed

@@ -121,8 +121,16 @@ const ScrapeWithActionsSchema = z.object({
   captureIntermediateStates: z.boolean().default(false),
   captureScreenshots: z.boolean().default(true),
-  // Form auto-fill
-  formAutoFill: z.record(z.string()).optional(),
+  // Form auto-fill — structured shape ({fields:[{selector,value,...}], submitSelector, waitAfterSubmit}).
+  // A flat z.record(string) of selector→value is still accepted for backward compatibility.
+  formAutoFill: z.union([
+    z.object({
+      fields: z.array(FormFieldSchema),
+      submitSelector: z.string().optional(),
+      waitAfterSubmit: z.number().min(0).max(30000).default(2000)
+    }),
+    z.record(z.string())
+  ]).optional(),
   // Browser options
   browserOptions: z.object({
@@ -386,8 +394,9 @@ export class ScrapeWithActionsTool extends EventEmitter {
     const intermediateStates = params.captureIntermediateStates ?
       await this.extractIntermediateStates(actionResults, params) : [];
-    // Get final page content after all actions
-    const finalContent = await this.extractFinalContent(params);
+    // Get final page content after all actions (reads the post-action live page
+    // captured by ActionExecutor, falling back to a fresh fetch only if missing).
+    const finalContent = await this.extractFinalContent(params, chainResult);
     // Generate different formats
     const content = this.generateFormats(finalContent, params.formats, {
@@ -446,21 +455,37 @@ export class ScrapeWithActionsTool extends EventEmitter {
   insertFormAutoFillActions(actions, formAutoFill) {
     const fillActions = [];
-    // Convert object with key-value pairs to fill actions
-    for (const [selector, value] of Object.entries(formAutoFill)) {
-      if (selector === 'submitSelector' || selector === 'waitAfterSubmit') {
-        continue; // Skip special keys
+    if (Array.isArray(formAutoFill.fields)) {
+      // Structured shape: { fields: [{selector, value, type, waitAfter}], submitSelector, waitAfterSubmit }
+      for (const field of formAutoFill.fields) {
+        fillActions.push({
+          type: 'type',
+          selector: field.selector,
+          text: field.value,
+          description: `Auto-fill field: ${field.selector}`,
+          continueOnError: true,
+          retries: 1
+        });
+        if (field.waitAfter) {
+          fillActions.push({ type: 'wait', duration: field.waitAfter });
+        }
+      }
+    } else {
+      // Backward-compatible flat shape: { selector: value, ... }
+      for (const [selector, value] of Object.entries(formAutoFill)) {
+        if (selector === 'submitSelector' || selector === 'waitAfterSubmit' || selector === 'fields') {
+          continue; // Skip special keys
+        }
+        fillActions.push({
+          type: 'type',
+          selector,
+          text: value,
+          description: `Auto-fill field: ${selector}`,
+          continueOnError: true,
+          retries: 1
+        });
       }
-      fillActions.push({
-        type: 'type',
-        selector,
-        text: value,
-        description: `Auto-fill field: ${selector}`,
-        continueOnError: true,
-        retries: 1
-      });
     }
     // Add submit action if specified
@@ -585,16 +610,29 @@ export class ScrapeWithActionsTool extends EventEmitter {
     return states;
   }
-  async extractFinalContent(params) {
+  async extractFinalContent(params, chainResult = null) {
     try {
+      const options = {
+        includeMetadata: params.extractionOptions?.includeMetadata !== false,
+        includeLinks: params.extractionOptions?.includeLinks !== false,
+        includeImages: params.extractionOptions?.includeImages !== false,
+        customSelectors: params.extractionOptions?.selectors
+      };
+      // Prefer the post-action live page HTML captured during action execution.
+      // This ensures the final content reflects clicks/typing/navigation rather
+      // than re-fetching the original (pre-action) URL.
+      if (chainResult?.finalHtml) {
+        return await this.extractContentTool.execute({
+          url: chainResult.finalUrl || params.url,
+          html: chainResult.finalHtml,
+          options
+        });
+      }
       const extractResult = await this.extractContentTool.execute({
         url: params.url,
-        options: {
-          includeMetadata: params.extractionOptions?.includeMetadata !== false,
-          includeLinks: params.extractionOptions?.includeLinks !== false,
-          includeImages: params.extractionOptions?.includeImages !== false,
-          customSelectors: params.extractionOptions?.selectors
-        }
+        options
       });
       return extractResult;

package/src/tools/advanced/batchScrape/index.js CHANGED Viewed

@@ -161,7 +161,9 @@ export class BatchScrapeTool extends EventEmitter {
       this.stats.lastUpdated = Date.now();
       this.activeBatches.delete(batchId);
-      await sendWebhookNotification('batch_completed', batchResult, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
+      // C3: include webhook delivery status in the result
+      const webhookStatus = await sendWebhookNotification('batch_completed', batchResult, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
+      if (webhookStatus) batchResult.webhookDelivery = webhookStatus;
       this.emit('batchCompleted', batchResult);
       return batchResult;
     } catch (error) {

package/src/tools/advanced/batchScrape/reporter.js CHANGED Viewed

@@ -5,14 +5,16 @@
 /**
  * Send a batch event via the webhookDispatcher.
+ * C3: returns a delivery status object so callers can include it in the result.
  * @param {string}  event
  * @param {Object}  data
  * @param {Object}  webhookConfig
  * @param {Object}  webhookDispatcher
  * @param {boolean} enabled
+ * @returns {Promise<{queued: boolean, url?: string, error?: string}|null>}
  */
 export async function sendWebhookNotification(event, data, webhookConfig, webhookDispatcher, enabled) {
-  if (!enabled || !webhookConfig || !webhookDispatcher) return;
+  if (!enabled || !webhookConfig || !webhookDispatcher) return null;
   try {
     await webhookDispatcher.dispatch(event, data, {
@@ -20,7 +22,9 @@ export async function sendWebhookNotification(event, data, webhookConfig, webhoo
       immediate: false,
       metadata: { batchId: data.batchId, timestamp: Date.now() }
     });
+    return { queued: true, url: webhookConfig.url };
   } catch (error) {
     console.warn(`[batchScrape] Webhook notification failed: ${error.message}`);
+    return { queued: false, url: webhookConfig.url, error: error.message };
   }
 }

package/src/tools/advanced/batchScrape/worker.js CHANGED Viewed

@@ -111,7 +111,6 @@ function generateFormats($, html, formats) {
 function buildMarkdown($) {
   let md = '';
   const title = $('title').text().trim();
-  if (title) md += `# ${title}\n\n`;
   const selectors = ['article', 'main', '.content', '#content', '.post-content', '.entry-content'];
   let $body = null;
@@ -121,6 +120,12 @@ function buildMarkdown($) {
   }
   if (!$body || $body.length === 0) $body = $('body');
+  // C3: de-dup title — only emit the <title> heading if the page has no <h1>
+  // or if the first <h1> text differs from the <title> text (case-insensitive).
+  const firstH1 = $body.find('h1').first().text().trim();
+  const titleDuplicated = firstH1 && firstH1.toLowerCase() === title.toLowerCase();
+  if (title && !titleDuplicated) md += `# ${title}\n\n`;
   $body.find('h1').each((_, el) => { md += `# ${$(el).text().trim()}\n\n`; });
   $body.find('h2').each((_, el) => { md += `## ${$(el).text().trim()}\n\n`; });
   $body.find('h3').each((_, el) => { md += `### ${$(el).text().trim()}\n\n`; });