crawlforge-mcp-server 4.2.12 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/package.json +2 -1
  2. package/server.js +138 -20
  3. package/src/constants/config.js +5 -0
  4. package/src/core/ActionExecutor.js +13 -1
  5. package/src/core/ChangeTracker.js +8 -5
  6. package/src/core/LLMsTxtAnalyzer.js +71 -47
  7. package/src/core/LocalizationManager.js +7 -4
  8. package/src/core/ResearchOrchestrator.js +10 -6
  9. package/src/core/StealthBrowserManager.js +52 -13
  10. package/src/core/analysis/ContentAnalyzer.js +2 -2
  11. package/src/core/crawlers/BFSCrawler.js +23 -12
  12. package/src/core/processing/ContentProcessor.js +19 -3
  13. package/src/core/processing/PDFProcessor.js +72 -23
  14. package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
  15. package/src/tools/advanced/batchScrape/index.js +3 -1
  16. package/src/tools/advanced/batchScrape/reporter.js +5 -1
  17. package/src/tools/advanced/batchScrape/worker.js +6 -1
  18. package/src/tools/basic/_fetch.js +78 -5
  19. package/src/tools/basic/extractLinks.js +1 -1
  20. package/src/tools/basic/extractMetadata.js +65 -1
  21. package/src/tools/basic/extractText.js +61 -5
  22. package/src/tools/basic/scrapeStructured.js +48 -10
  23. package/src/tools/crawl/crawlDeep.js +13 -5
  24. package/src/tools/crawl/mapSite.js +24 -51
  25. package/src/tools/extract/analyzeContent.js +11 -6
  26. package/src/tools/extract/extractContent.js +23 -5
  27. package/src/tools/extract/extractStructured.js +65 -16
  28. package/src/tools/extract/extractWithLlm.js +192 -11
  29. package/src/tools/extract/listOllamaModels.js +19 -8
  30. package/src/tools/extract/processDocument.js +10 -4
  31. package/src/tools/extract/summarizeContent.js +58 -1
  32. package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
  33. package/src/tools/research/deepResearch.js +43 -4
  34. package/src/tools/search/providers/searxng.js +2 -2
  35. package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
  36. package/src/tools/search/ranking/ResultRanker.js +13 -4
  37. package/src/tools/search/searchWeb.js +5 -5
  38. package/src/tools/templates/TemplateRegistry.js +3 -2
  39. package/src/tools/tracking/trackChanges/differ.js +33 -1
  40. package/src/utils/htmlToMarkdown.js +5 -1
@@ -47,14 +47,17 @@ const StealthConfigSchema = z.object({
47
47
  spoofMediaDevices: z.boolean().default(true),
48
48
  spoofBatteryAPI: z.boolean().default(true)
49
49
  }).optional(),
50
-
50
+
51
51
  fingerprinting: z.object({
52
52
  canvasNoise: z.boolean().default(true),
53
53
  webglSpoofing: z.boolean().default(true),
54
54
  audioContextSpoofing: z.boolean().default(true),
55
55
  fontSpoofing: z.boolean().default(true),
56
56
  hardwareSpoofing: z.boolean().default(true)
57
- }).optional()
57
+ }).optional(),
58
+
59
+ // C2: browser engine selection — 'chromium' (default) or 'camoufox' (Firefox-based)
60
+ engine: z.enum(['chromium', 'camoufox']).optional().default('chromium')
58
61
  });
59
62
 
60
63
  export class StealthBrowserManager {
@@ -232,16 +235,41 @@ export class StealthBrowserManager {
232
235
  }
233
236
 
234
237
  /**
235
- * Launch stealth browser with anti-detection configurations
238
+ * Launch stealth browser with anti-detection configurations.
239
+ * C2: honours config.engine — 'chromium' (default) or 'camoufox' (Firefox-based).
236
240
  */
237
241
  async launchStealthBrowser(config = {}) {
242
+ const validatedConfig = StealthConfigSchema.parse({ ...this.defaultConfig, ...config });
243
+
244
+ // C2: if the requested engine differs from the running browser, tear it down first.
245
+ if (this.browser && this._launchedEngine && this._launchedEngine !== validatedConfig.engine) {
246
+ await this.browser.close().catch(() => {});
247
+ this.browser = null;
248
+ }
249
+
238
250
  if (this.browser) {
239
251
  return this.browser;
240
252
  }
241
253
 
242
- const validatedConfig = StealthConfigSchema.parse({ ...this.defaultConfig, ...config });
243
-
244
- // Base browser args for stealth
254
+ // C2: delegate to CamoufoxAdapter when engine === 'camoufox'
255
+ if (validatedConfig.engine === 'camoufox') {
256
+ const adapter = new CamoufoxAdapter();
257
+ const available = await adapter.isAvailable();
258
+ if (!available) {
259
+ throw new Error(
260
+ 'camoufox is not installed. Run: npm install camoufox to use the Firefox-based stealth engine.'
261
+ );
262
+ }
263
+ this.browser = await adapter.launch({
264
+ headless: true,
265
+ launchOptions: {}
266
+ });
267
+ this._launchedEngine = 'camoufox';
268
+ return this.browser;
269
+ }
270
+
271
+ this._launchedEngine = 'chromium';
272
+ // Base browser args for stealth (Chromium path)
245
273
  const stealthArgs = [
246
274
  '--no-sandbox',
247
275
  '--disable-dev-shm-usage',
@@ -498,6 +526,9 @@ export class StealthBrowserManager {
498
526
  * Generate advanced HTTP headers with realistic patterns
499
527
  */
500
528
  generateAdvancedHeaders(config, selectedOS) {
529
+ // Resolve the UA first so sec-ch-ua brand version can match.
530
+ const resolvedUA = this.selectRealisticUserAgent(config, selectedOS);
531
+
501
532
  const headers = {
502
533
  'Accept-Language': `${(config.locale || 'en-US').toLowerCase()},en;q=0.9`,
503
534
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -512,8 +543,8 @@ export class StealthBrowserManager {
512
543
  'sec-ch-ua-platform': this.generateSecChUaPlatform(selectedOS)
513
544
  };
514
545
 
515
- // Add sec-ch-ua header
516
- headers['sec-ch-ua'] = this.generateSecChUaHeader();
546
+ // C2: pass UA so sec-ch-ua brand version matches the Chrome major version.
547
+ headers['sec-ch-ua'] = this.generateSecChUaHeader(resolvedUA);
517
548
 
518
549
  // Randomize some headers
519
550
  if (Math.random() < 0.25) {
@@ -533,15 +564,23 @@ export class StealthBrowserManager {
533
564
  }
534
565
 
535
566
  /**
536
- * Generate sec-ch-ua header
567
+ * Generate sec-ch-ua header.
568
+ * C2: brand versions are derived from the UA's Chrome major version so
569
+ * sec-ch-ua and the User-Agent header stay consistent.
570
+ * @param {string} [userAgent] — the selected user agent string
537
571
  */
538
- generateSecChUaHeader() {
572
+ generateSecChUaHeader(userAgent = '') {
573
+ // Extract Chrome major version from the UA (e.g. "Chrome/121.0.0.0" → "121").
574
+ // Fall back to 121 if the UA is not a Chrome UA.
575
+ const match = userAgent.match(/Chrome\/(\d+)/i);
576
+ const version = match ? match[1] : '121';
577
+
539
578
  const brands = [
540
579
  { brand: 'Not_A Brand', version: '8' },
541
- { brand: 'Chromium', version: '120' },
542
- { brand: 'Google Chrome', version: '120' }
580
+ { brand: 'Chromium', version },
581
+ { brand: 'Google Chrome', version }
543
582
  ];
544
-
583
+
545
584
  return brands
546
585
  .map(b => `"${b.brand}";v="${b.version}"`)
547
586
  .join(', ');
@@ -4,7 +4,7 @@
4
4
  */
5
5
 
6
6
  import { SummarizerManager } from 'node-summarizer';
7
- import { franc } from 'franc';
7
+ import { franc, francAll } from 'franc';
8
8
  import nlp from 'compromise';
9
9
  import { z } from 'zod';
10
10
  import { splitSentences } from './sentenceUtils.js';
@@ -316,7 +316,7 @@ export class ContentAnalyzer {
316
316
  const confidence = Math.min(1, 0.5 + (text.length / 500) * 0.5);
317
317
 
318
318
  // Get alternative languages using franc.all
319
- const alternatives = franc.all(text, {
319
+ const alternatives = francAll(text, {
320
320
  minLength: 10,
321
321
  whitelist: Object.keys(LANGUAGE_NAMES)
322
322
  })
@@ -6,6 +6,9 @@ import { RobotsChecker } from '../../utils/robotsChecker.js';
6
6
  import { DomainFilter } from '../../utils/domainFilter.js';
7
7
  import { LinkAnalyzer } from '../analysis/LinkAnalyzer.js';
8
8
  import { normalizeUrl, extractLinks, isValidUrl } from '../../utils/urlNormalizer.js';
9
+ import { Logger } from '../../utils/Logger.js';
10
+
11
+ const logger = new Logger('BFSCrawler');
9
12
 
10
13
  export class BFSCrawler {
11
14
  constructor(options = {}) {
@@ -43,7 +46,10 @@ export class BFSCrawler {
43
46
 
44
47
  this.queue = new QueueManager({ concurrency, timeout });
45
48
  this.cache = new CacheManager({ ttl: 3600000 }); // 1 hour cache
49
+ // C1: per-domain rate-limiter map — reuse existing limiter when
50
+ // effectiveRateLimit hasn't changed, rather than recreating it on every URL.
46
51
  this.rateLimiter = new RateLimiter({ requestsPerSecond: 10 });
52
+ this._domainRateLimiters = new Map();
47
53
  this.robotsChecker = respectRobots ? new RobotsChecker(userAgent) : null;
48
54
 
49
55
  // Initialize domain filter (create new if not provided)
@@ -142,13 +148,13 @@ export class BFSCrawler {
142
148
  });
143
149
 
144
150
  if (!filterDecision.allowed) {
145
- console.error(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
151
+ logger.debug(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
146
152
  return;
147
153
  }
148
-
154
+
149
155
  // Backward compatibility: also check legacy patterns
150
156
  if (!this.shouldCrawlUrl(normalizedUrl)) {
151
- console.error(`Legacy pattern blocks: ${normalizedUrl}`);
157
+ logger.debug(`Legacy pattern blocks: ${normalizedUrl}`);
152
158
  return;
153
159
  }
154
160
 
@@ -156,7 +162,7 @@ export class BFSCrawler {
156
162
  if (this.respectRobots && this.robotsChecker) {
157
163
  const canFetch = await this.robotsChecker.canFetch(normalizedUrl);
158
164
  if (!canFetch) {
159
- console.error(`Robots.txt blocks: ${normalizedUrl}`);
165
+ logger.debug(`Robots.txt blocks: ${normalizedUrl}`);
160
166
  return;
161
167
  }
162
168
  }
@@ -171,17 +177,22 @@ export class BFSCrawler {
171
177
 
172
178
  if (!pageData) {
173
179
  // Apply domain-specific rate limiting
180
+ // C1: reuse per-domain limiter from the map to avoid recreating on each URL.
174
181
  const urlObj = new URL(normalizedUrl);
175
- const domainRules = this.domainFilter.getDomainRules(urlObj.hostname);
176
-
177
- // Use domain-specific rate limit if available
182
+ const domain = urlObj.hostname;
183
+ const domainRules = this.domainFilter.getDomainRules(domain);
178
184
  const effectiveRateLimit = domainRules.rateLimit || 10;
179
- if (this.rateLimiter.requestsPerSecond !== effectiveRateLimit) {
180
- // Update rate limiter for this domain
181
- this.rateLimiter = new RateLimiter({ requestsPerSecond: effectiveRateLimit });
185
+
186
+ if (!this._domainRateLimiters.has(domain)) {
187
+ this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
188
+ } else {
189
+ const existing = this._domainRateLimiters.get(domain);
190
+ if (existing.requestsPerSecond !== effectiveRateLimit) {
191
+ this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
192
+ }
182
193
  }
183
-
184
- await this.rateLimiter.checkLimit(normalizedUrl);
194
+
195
+ await this._domainRateLimiters.get(domain).checkLimit(normalizedUrl);
185
196
 
186
197
  // Fetch the page
187
198
  pageData = await this.fetchPage(normalizedUrl);
@@ -401,9 +401,10 @@ export class ContentProcessor {
401
401
 
402
402
  const avgWordsPerSentence = words.length / sentences.length;
403
403
  const avgCharsPerWord = charactersNoSpaces / words.length;
404
-
405
- // Simple readability score (lower is better)
406
- const readabilityScore = (avgWordsPerSentence * 1.015) + (avgCharsPerWord * 84.6) - 206.835;
404
+ const avgSyllablesPerWord = words.reduce((sum, w) => sum + this._countSyllables(w), 0) / words.length;
405
+
406
+ // Flesch Reading-Ease: higher score = easier to read
407
+ const readabilityScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);
407
408
 
408
409
  return {
409
410
  sentences: sentences.length,
@@ -412,6 +413,7 @@ export class ContentProcessor {
412
413
  charactersNoSpaces,
413
414
  avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
414
415
  avgCharsPerWord: Math.round(avgCharsPerWord * 100) / 100,
416
+ avgSyllablesPerWord: Math.round(avgSyllablesPerWord * 100) / 100,
415
417
  readabilityScore: Math.round(readabilityScore * 100) / 100,
416
418
  readabilityLevel: this.getReadabilityLevel(readabilityScore)
417
419
  };
@@ -432,6 +434,20 @@ export class ContentProcessor {
432
434
  return 'Very Difficult';
433
435
  }
434
436
 
437
+ /**
438
+ * Count syllables in a word (heuristic)
439
+ * @param {string} word
440
+ * @returns {number}
441
+ */
442
+ _countSyllables(word) {
443
+ const w = word.toLowerCase().replace(/[^a-z]/g, '');
444
+ if (w.length <= 3) return 1;
445
+ // Remove trailing silent e
446
+ const stripped = w.replace(/e$/, '');
447
+ const matches = stripped.match(/[aeiouy]+/g);
448
+ return Math.max(1, matches ? matches.length : 1);
449
+ }
450
+
435
451
  /**
436
452
  * Extract fallback content when Readability fails
437
453
  * @param {string} html - HTML content
@@ -16,6 +16,12 @@ const PDFProcessorSchema = z.object({
16
16
  extractText: z.boolean().default(true),
17
17
  password: z.string().optional(),
18
18
  maxPages: z.number().min(1).max(1000).default(100),
19
+ // C3: true page-range extraction (1-based, inclusive). When set, only the
20
+ // text from pages [start..end] is returned.
21
+ pageRange: z.object({
22
+ start: z.number().min(1).default(1),
23
+ end: z.number().min(1).optional()
24
+ }).optional(),
19
25
  parseOptions: z.object({
20
26
  normalizeWhitespace: z.boolean().default(true),
21
27
  disableCombineTextItems: z.boolean().default(false)
@@ -95,12 +101,29 @@ export class PDFProcessor {
95
101
  return result;
96
102
  }
97
103
 
104
+ // C3: when a page range is requested, capture per-page text so we can
105
+ // return exactly pages [start..end] (pdf-parse otherwise concatenates the
106
+ // whole document and its `max` option only caps the *upper* page bound).
107
+ const pageRange = processingOptions.pageRange;
108
+ const capturedPages = [];
109
+
98
110
  // Parse PDF with options
99
111
  const parseOptions = {
100
112
  ...processingOptions.parseOptions,
101
113
  max: processingOptions.maxPages
102
114
  };
103
115
 
116
+ // If extracting a range, raise `max` to at least the requested end page
117
+ // and install a pagerender that records each page's text.
118
+ if (pageRange) {
119
+ if (pageRange.end) {
120
+ parseOptions.max = Math.max(parseOptions.max, pageRange.end);
121
+ } else {
122
+ parseOptions.max = processingOptions.maxPages;
123
+ }
124
+ parseOptions.pagerender = (pageData) => this._renderPage(pageData, capturedPages);
125
+ }
126
+
104
127
  if (processingOptions.password) {
105
128
  parseOptions.password = processingOptions.password;
106
129
  }
@@ -118,7 +141,15 @@ export class PDFProcessor {
118
141
 
119
142
  // Extract text content
120
143
  if (processingOptions.extractText) {
121
- result.text = this.cleanPDFText(pdfData.text);
144
+ if (pageRange) {
145
+ const start = pageRange.start || 1;
146
+ const end = pageRange.end || capturedPages.length;
147
+ const slice = capturedPages.slice(start - 1, end);
148
+ result.text = this.cleanPDFText(slice.join('\n\n'));
149
+ result.extractedPages = { start, end, count: slice.length };
150
+ } else {
151
+ result.text = this.cleanPDFText(pdfData.text);
152
+ }
122
153
  }
123
154
 
124
155
  // Extract metadata
@@ -414,34 +445,52 @@ export class PDFProcessor {
414
445
  }
415
446
 
416
447
  /**
417
- * Extract specific pages from PDF
418
- * @param {Object} params - Processing parameters with page range
419
- * @returns {Promise<Object>} - Processing result for specified pages
448
+ * Render a single PDF page to text and record it.
449
+ * Mirrors pdf-parse's default render (newline on Y-position change) but
450
+ * accumulates per-page text so callers can slice a true page range.
451
+ * Note: like pdf-parse, this does not reconstruct multi-column / table
452
+ * layout — column order follows the PDF's text-item stream.
453
+ * @param {Object} pageData - pdf.js page proxy from pdf-parse
454
+ * @param {string[]} sink - array that receives this page's text
455
+ * @returns {Promise<string>}
456
+ */
457
+ async _renderPage(pageData, sink) {
458
+ const textContent = await pageData.getTextContent({
459
+ normalizeWhitespace: true,
460
+ disableCombineTextItems: false
461
+ });
462
+ let lastY;
463
+ let text = '';
464
+ for (const item of textContent.items) {
465
+ if (lastY === item.transform[5] || lastY === undefined) {
466
+ text += item.str;
467
+ } else {
468
+ text += '\n' + item.str;
469
+ }
470
+ lastY = item.transform[5];
471
+ }
472
+ sink.push(text);
473
+ // pdf-parse joins page renders with '\n\n' for pdfData.text
474
+ return text;
475
+ }
476
+
477
+ /**
478
+ * Extract a specific page range from a PDF (1-based, inclusive).
479
+ * @param {Object} params - Processing parameters
480
+ * @param {number} [params.startPage=1] - First page to include
481
+ * @param {number} [params.endPage] - Last page to include (defaults to end)
482
+ * @returns {Promise<Object>} - Processing result for the requested pages
420
483
  */
421
484
  async extractPDFPages(params) {
422
485
  const { startPage = 1, endPage, ...processingParams } = params;
423
-
424
- // Override parse options to limit page range
425
- const options = {
426
- ...processingParams.options,
427
- parseOptions: {
428
- ...processingParams.options?.parseOptions,
429
- max: endPage || processingParams.options?.maxPages || 100
430
- }
431
- };
432
486
 
433
- const result = await this.processPDF({
487
+ return this.processPDF({
434
488
  ...processingParams,
435
- options
489
+ options: {
490
+ ...processingParams.options,
491
+ pageRange: { start: startPage, ...(endPage ? { end: endPage } : {}) }
492
+ }
436
493
  });
437
-
438
- if (result.success && result.text && startPage > 1) {
439
- // This is a simplified approach - pdf-parse doesn't provide per-page text
440
- // For proper page-by-page extraction, consider using pdf2pic or pdf-poppler
441
- console.warn('Page-specific extraction is limited with current PDF parser');
442
- }
443
-
444
- return result;
445
494
  }
446
495
  }
447
496
 
@@ -121,8 +121,16 @@ const ScrapeWithActionsSchema = z.object({
121
121
  captureIntermediateStates: z.boolean().default(false),
122
122
  captureScreenshots: z.boolean().default(true),
123
123
 
124
- // Form auto-fill
125
- formAutoFill: z.record(z.string()).optional(),
124
+ // Form auto-fill — structured shape ({fields:[{selector,value,...}], submitSelector, waitAfterSubmit}).
125
+ // A flat z.record(string) of selector→value is still accepted for backward compatibility.
126
+ formAutoFill: z.union([
127
+ z.object({
128
+ fields: z.array(FormFieldSchema),
129
+ submitSelector: z.string().optional(),
130
+ waitAfterSubmit: z.number().min(0).max(30000).default(2000)
131
+ }),
132
+ z.record(z.string())
133
+ ]).optional(),
126
134
 
127
135
  // Browser options
128
136
  browserOptions: z.object({
@@ -386,8 +394,9 @@ export class ScrapeWithActionsTool extends EventEmitter {
386
394
  const intermediateStates = params.captureIntermediateStates ?
387
395
  await this.extractIntermediateStates(actionResults, params) : [];
388
396
 
389
- // Get final page content after all actions
390
- const finalContent = await this.extractFinalContent(params);
397
+ // Get final page content after all actions (reads the post-action live page
398
+ // captured by ActionExecutor, falling back to a fresh fetch only if missing).
399
+ const finalContent = await this.extractFinalContent(params, chainResult);
391
400
 
392
401
  // Generate different formats
393
402
  const content = this.generateFormats(finalContent, params.formats, {
@@ -446,21 +455,37 @@ export class ScrapeWithActionsTool extends EventEmitter {
446
455
 
447
456
  insertFormAutoFillActions(actions, formAutoFill) {
448
457
  const fillActions = [];
449
-
450
- // Convert object with key-value pairs to fill actions
451
- for (const [selector, value] of Object.entries(formAutoFill)) {
452
- if (selector === 'submitSelector' || selector === 'waitAfterSubmit') {
453
- continue; // Skip special keys
458
+
459
+ if (Array.isArray(formAutoFill.fields)) {
460
+ // Structured shape: { fields: [{selector, value, type, waitAfter}], submitSelector, waitAfterSubmit }
461
+ for (const field of formAutoFill.fields) {
462
+ fillActions.push({
463
+ type: 'type',
464
+ selector: field.selector,
465
+ text: field.value,
466
+ description: `Auto-fill field: ${field.selector}`,
467
+ continueOnError: true,
468
+ retries: 1
469
+ });
470
+ if (field.waitAfter) {
471
+ fillActions.push({ type: 'wait', duration: field.waitAfter });
472
+ }
473
+ }
474
+ } else {
475
+ // Backward-compatible flat shape: { selector: value, ... }
476
+ for (const [selector, value] of Object.entries(formAutoFill)) {
477
+ if (selector === 'submitSelector' || selector === 'waitAfterSubmit' || selector === 'fields') {
478
+ continue; // Skip special keys
479
+ }
480
+ fillActions.push({
481
+ type: 'type',
482
+ selector,
483
+ text: value,
484
+ description: `Auto-fill field: ${selector}`,
485
+ continueOnError: true,
486
+ retries: 1
487
+ });
454
488
  }
455
-
456
- fillActions.push({
457
- type: 'type',
458
- selector,
459
- text: value,
460
- description: `Auto-fill field: ${selector}`,
461
- continueOnError: true,
462
- retries: 1
463
- });
464
489
  }
465
490
 
466
491
  // Add submit action if specified
@@ -585,16 +610,29 @@ export class ScrapeWithActionsTool extends EventEmitter {
585
610
  return states;
586
611
  }
587
612
 
588
- async extractFinalContent(params) {
613
+ async extractFinalContent(params, chainResult = null) {
589
614
  try {
615
+ const options = {
616
+ includeMetadata: params.extractionOptions?.includeMetadata !== false,
617
+ includeLinks: params.extractionOptions?.includeLinks !== false,
618
+ includeImages: params.extractionOptions?.includeImages !== false,
619
+ customSelectors: params.extractionOptions?.selectors
620
+ };
621
+
622
+ // Prefer the post-action live page HTML captured during action execution.
623
+ // This ensures the final content reflects clicks/typing/navigation rather
624
+ // than re-fetching the original (pre-action) URL.
625
+ if (chainResult?.finalHtml) {
626
+ return await this.extractContentTool.execute({
627
+ url: chainResult.finalUrl || params.url,
628
+ html: chainResult.finalHtml,
629
+ options
630
+ });
631
+ }
632
+
590
633
  const extractResult = await this.extractContentTool.execute({
591
634
  url: params.url,
592
- options: {
593
- includeMetadata: params.extractionOptions?.includeMetadata !== false,
594
- includeLinks: params.extractionOptions?.includeLinks !== false,
595
- includeImages: params.extractionOptions?.includeImages !== false,
596
- customSelectors: params.extractionOptions?.selectors
597
- }
635
+ options
598
636
  });
599
637
 
600
638
  return extractResult;
@@ -161,7 +161,9 @@ export class BatchScrapeTool extends EventEmitter {
161
161
  this.stats.lastUpdated = Date.now();
162
162
  this.activeBatches.delete(batchId);
163
163
 
164
- await sendWebhookNotification('batch_completed', batchResult, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
164
+ // C3: include webhook delivery status in the result
165
+ const webhookStatus = await sendWebhookNotification('batch_completed', batchResult, webhookConfig, this.webhookDispatcher, this.enableWebhookNotifications);
166
+ if (webhookStatus) batchResult.webhookDelivery = webhookStatus;
165
167
  this.emit('batchCompleted', batchResult);
166
168
  return batchResult;
167
169
  } catch (error) {
@@ -5,14 +5,16 @@
5
5
 
6
6
  /**
7
7
  * Send a batch event via the webhookDispatcher.
8
+ * C3: returns a delivery status object so callers can include it in the result.
8
9
  * @param {string} event
9
10
  * @param {Object} data
10
11
  * @param {Object} webhookConfig
11
12
  * @param {Object} webhookDispatcher
12
13
  * @param {boolean} enabled
14
+ * @returns {Promise<{queued: boolean, url?: string, error?: string}|null>}
13
15
  */
14
16
  export async function sendWebhookNotification(event, data, webhookConfig, webhookDispatcher, enabled) {
15
- if (!enabled || !webhookConfig || !webhookDispatcher) return;
17
+ if (!enabled || !webhookConfig || !webhookDispatcher) return null;
16
18
 
17
19
  try {
18
20
  await webhookDispatcher.dispatch(event, data, {
@@ -20,7 +22,9 @@ export async function sendWebhookNotification(event, data, webhookConfig, webhoo
20
22
  immediate: false,
21
23
  metadata: { batchId: data.batchId, timestamp: Date.now() }
22
24
  });
25
+ return { queued: true, url: webhookConfig.url };
23
26
  } catch (error) {
24
27
  console.warn(`[batchScrape] Webhook notification failed: ${error.message}`);
28
+ return { queued: false, url: webhookConfig.url, error: error.message };
25
29
  }
26
30
  }
@@ -111,7 +111,6 @@ function generateFormats($, html, formats) {
111
111
  function buildMarkdown($) {
112
112
  let md = '';
113
113
  const title = $('title').text().trim();
114
- if (title) md += `# ${title}\n\n`;
115
114
 
116
115
  const selectors = ['article', 'main', '.content', '#content', '.post-content', '.entry-content'];
117
116
  let $body = null;
@@ -121,6 +120,12 @@ function buildMarkdown($) {
121
120
  }
122
121
  if (!$body || $body.length === 0) $body = $('body');
123
122
 
123
+ // C3: de-dup title — only emit the <title> heading if the page has no <h1>
124
+ // or if the first <h1> text differs from the <title> text (case-insensitive).
125
+ const firstH1 = $body.find('h1').first().text().trim();
126
+ const titleDuplicated = firstH1 && firstH1.toLowerCase() === title.toLowerCase();
127
+ if (title && !titleDuplicated) md += `# ${title}\n\n`;
128
+
124
129
  $body.find('h1').each((_, el) => { md += `# ${$(el).text().trim()}\n\n`; });
125
130
  $body.find('h2').each((_, el) => { md += `## ${$(el).text().trim()}\n\n`; });
126
131
  $body.find('h3').each((_, el) => { md += `### ${$(el).text().trim()}\n\n`; });