crawlforge-mcp-server 4.2.11 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/package.json +2 -1
  2. package/server.js +152 -21
  3. package/src/constants/config.js +5 -0
  4. package/src/core/ActionExecutor.js +13 -1
  5. package/src/core/ChangeTracker.js +8 -5
  6. package/src/core/LLMsTxtAnalyzer.js +71 -47
  7. package/src/core/LocalizationManager.js +7 -4
  8. package/src/core/ResearchOrchestrator.js +10 -6
  9. package/src/core/StealthBrowserManager.js +111 -40
  10. package/src/core/analysis/ContentAnalyzer.js +2 -2
  11. package/src/core/crawlers/BFSCrawler.js +23 -12
  12. package/src/core/processing/ContentProcessor.js +19 -3
  13. package/src/core/processing/PDFProcessor.js +72 -23
  14. package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
  15. package/src/tools/advanced/batchScrape/index.js +3 -1
  16. package/src/tools/advanced/batchScrape/reporter.js +5 -1
  17. package/src/tools/advanced/batchScrape/worker.js +6 -1
  18. package/src/tools/basic/_fetch.js +78 -5
  19. package/src/tools/basic/extractLinks.js +1 -1
  20. package/src/tools/basic/extractMetadata.js +65 -1
  21. package/src/tools/basic/extractText.js +61 -5
  22. package/src/tools/basic/scrapeStructured.js +48 -10
  23. package/src/tools/crawl/crawlDeep.js +13 -5
  24. package/src/tools/crawl/mapSite.js +24 -51
  25. package/src/tools/extract/analyzeContent.js +11 -6
  26. package/src/tools/extract/extractContent.js +23 -5
  27. package/src/tools/extract/extractStructured.js +65 -16
  28. package/src/tools/extract/extractWithLlm.js +192 -11
  29. package/src/tools/extract/listOllamaModels.js +19 -8
  30. package/src/tools/extract/processDocument.js +10 -4
  31. package/src/tools/extract/summarizeContent.js +58 -1
  32. package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
  33. package/src/tools/research/deepResearch.js +43 -4
  34. package/src/tools/search/providers/searxng.js +2 -2
  35. package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
  36. package/src/tools/search/ranking/ResultRanker.js +13 -4
  37. package/src/tools/search/searchWeb.js +5 -5
  38. package/src/tools/templates/TemplateRegistry.js +3 -2
  39. package/src/tools/tracking/trackChanges/differ.js +33 -1
  40. package/src/utils/htmlToMarkdown.js +5 -1
@@ -47,14 +47,17 @@ const StealthConfigSchema = z.object({
47
47
  spoofMediaDevices: z.boolean().default(true),
48
48
  spoofBatteryAPI: z.boolean().default(true)
49
49
  }).optional(),
50
-
50
+
51
51
  fingerprinting: z.object({
52
52
  canvasNoise: z.boolean().default(true),
53
53
  webglSpoofing: z.boolean().default(true),
54
54
  audioContextSpoofing: z.boolean().default(true),
55
55
  fontSpoofing: z.boolean().default(true),
56
56
  hardwareSpoofing: z.boolean().default(true)
57
- }).optional()
57
+ }).optional(),
58
+
59
+ // C2: browser engine selection — 'chromium' (default) or 'camoufox' (Firefox-based)
60
+ engine: z.enum(['chromium', 'camoufox']).optional().default('chromium')
58
61
  });
59
62
 
60
63
  export class StealthBrowserManager {
@@ -232,16 +235,41 @@ export class StealthBrowserManager {
232
235
  }
233
236
 
234
237
  /**
235
- * Launch stealth browser with anti-detection configurations
238
+ * Launch stealth browser with anti-detection configurations.
239
+ * C2: honours config.engine — 'chromium' (default) or 'camoufox' (Firefox-based).
236
240
  */
237
241
  async launchStealthBrowser(config = {}) {
242
+ const validatedConfig = StealthConfigSchema.parse({ ...this.defaultConfig, ...config });
243
+
244
+ // C2: if the requested engine differs from the running browser, tear it down first.
245
+ if (this.browser && this._launchedEngine && this._launchedEngine !== validatedConfig.engine) {
246
+ await this.browser.close().catch(() => {});
247
+ this.browser = null;
248
+ }
249
+
238
250
  if (this.browser) {
239
251
  return this.browser;
240
252
  }
241
253
 
242
- const validatedConfig = StealthConfigSchema.parse({ ...this.defaultConfig, ...config });
243
-
244
- // Base browser args for stealth
254
+ // C2: delegate to CamoufoxAdapter when engine === 'camoufox'
255
+ if (validatedConfig.engine === 'camoufox') {
256
+ const adapter = new CamoufoxAdapter();
257
+ const available = await adapter.isAvailable();
258
+ if (!available) {
259
+ throw new Error(
260
+ 'camoufox is not installed. Run: npm install camoufox to use the Firefox-based stealth engine.'
261
+ );
262
+ }
263
+ this.browser = await adapter.launch({
264
+ headless: true,
265
+ launchOptions: {}
266
+ });
267
+ this._launchedEngine = 'camoufox';
268
+ return this.browser;
269
+ }
270
+
271
+ this._launchedEngine = 'chromium';
272
+ // Base browser args for stealth (Chromium path)
245
273
  const stealthArgs = [
246
274
  '--no-sandbox',
247
275
  '--disable-dev-shm-usage',
@@ -390,8 +418,11 @@ export class StealthBrowserManager {
390
418
  * Generate advanced browser fingerprint with enhanced randomization
391
419
  */
392
420
  generateAdvancedFingerprint(config = {}) {
421
+ // Select the OS once and thread it through UA, headers, and hardware so
422
+ // navigator.platform / sec-ch-ua-platform / userAgent stay consistent.
423
+ const selectedOS = this.selectOS(config);
393
424
  const fingerprint = {
394
- userAgent: this.selectRealisticUserAgent(config),
425
+ userAgent: this.selectRealisticUserAgent(config, selectedOS),
395
426
  viewport: config.customViewport || this.selectWeightedViewport(),
396
427
  timezone: config.timezone || this.selectTimezone(),
397
428
  deviceScaleFactor: this.randomFloat(1, 2, 1),
@@ -400,13 +431,13 @@ export class StealthBrowserManager {
400
431
  colorScheme: Math.random() < 0.3 ? 'dark' : 'light',
401
432
  reducedMotion: Math.random() < 0.1 ? 'reduce' : 'no-preference',
402
433
  forcedColors: Math.random() < 0.05 ? 'active' : 'none',
403
- headers: this.generateAdvancedHeaders(config),
434
+ headers: this.generateAdvancedHeaders(config, selectedOS),
404
435
  webRTC: this.generateWebRTCConfig(config),
405
436
  canvas: this.generateAdvancedCanvasFingerprint(),
406
437
  webGL: this.generateAdvancedWebGLFingerprint(),
407
438
  audioContext: this.generateAudioContextFingerprint(),
408
439
  mediaDevices: this.generateMediaDevicesFingerprint(),
409
- hardware: this.generateHardwareFingerprint(),
440
+ hardware: this.generateHardwareFingerprint(selectedOS),
410
441
  fonts: this.generateAdvancedFontList(),
411
442
  plugins: this.generateAdvancedPluginList(),
412
443
  geolocation: this.generateRealisticGeolocation(),
@@ -417,10 +448,34 @@ export class StealthBrowserManager {
417
448
  return fingerprint;
418
449
  }
419
450
 
451
+ /**
452
+ * Choose a single OS ('windows' | 'macos' | 'linux') for a fingerprint.
453
+ * A custom UA pins the OS to whatever that UA reports; a non-random UA pins
454
+ * to windows (the default pool below); otherwise weighted-random.
455
+ */
456
+ selectOS(config = {}) {
457
+ if (config.customUserAgent) {
458
+ return this.inferOSFromUserAgent(config.customUserAgent);
459
+ }
460
+ if (!config.useRandomUserAgent) {
461
+ return 'windows';
462
+ }
463
+ return this.weightedRandom(this.osDistribution);
464
+ }
465
+
466
+ /**
467
+ * Infer the OS key from a user-agent string.
468
+ */
469
+ inferOSFromUserAgent(ua = '') {
470
+ if (/Macintosh|Mac OS X/i.test(ua)) return 'macos';
471
+ if (/Linux|X11|CrOS/i.test(ua)) return 'linux';
472
+ return 'windows';
473
+ }
474
+
420
475
  /**
421
476
  * Select realistic user agent based on market distribution
422
477
  */
423
- selectRealisticUserAgent(config) {
478
+ selectRealisticUserAgent(config, selectedOS) {
424
479
  if (config.customUserAgent) {
425
480
  return config.customUserAgent;
426
481
  }
@@ -429,9 +484,10 @@ export class StealthBrowserManager {
429
484
  return this.userAgentPools.chrome.windows[0];
430
485
  }
431
486
 
432
- // Select OS based on distribution
433
- const selectedOS = this.weightedRandom(this.osDistribution);
434
-
487
+ // Use the OS chosen once for this fingerprint (falls back to a fresh draw
488
+ // if called without one, preserving the original standalone behavior).
489
+ selectedOS = selectedOS || this.weightedRandom(this.osDistribution);
490
+
435
491
  // Select browser based on distribution and OS compatibility
436
492
  let availableBrowsers = { ...this.browserDistribution };
437
493
  if (selectedOS === 'linux' && availableBrowsers.safari) {
@@ -469,7 +525,10 @@ export class StealthBrowserManager {
469
525
  /**
470
526
  * Generate advanced HTTP headers with realistic patterns
471
527
  */
472
- generateAdvancedHeaders(config) {
528
+ generateAdvancedHeaders(config, selectedOS) {
529
+ // Resolve the UA first so sec-ch-ua brand version can match.
530
+ const resolvedUA = this.selectRealisticUserAgent(config, selectedOS);
531
+
473
532
  const headers = {
474
533
  'Accept-Language': `${(config.locale || 'en-US').toLowerCase()},en;q=0.9`,
475
534
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -481,11 +540,11 @@ export class StealthBrowserManager {
481
540
  'Sec-Fetch-Site': 'none',
482
541
  'Sec-Fetch-User': '?1',
483
542
  'sec-ch-ua-mobile': '?0',
484
- 'sec-ch-ua-platform': this.generateSecChUaPlatform()
543
+ 'sec-ch-ua-platform': this.generateSecChUaPlatform(selectedOS)
485
544
  };
486
545
 
487
- // Add sec-ch-ua header
488
- headers['sec-ch-ua'] = this.generateSecChUaHeader();
546
+ // C2: pass UA so sec-ch-ua brand version matches the Chrome major version.
547
+ headers['sec-ch-ua'] = this.generateSecChUaHeader(resolvedUA);
489
548
 
490
549
  // Randomize some headers
491
550
  if (Math.random() < 0.25) {
@@ -505,15 +564,23 @@ export class StealthBrowserManager {
505
564
  }
506
565
 
507
566
  /**
508
- * Generate sec-ch-ua header
567
+ * Generate sec-ch-ua header.
568
+ * C2: brand versions are derived from the UA's Chrome major version so
569
+ * sec-ch-ua and the User-Agent header stay consistent.
570
+ * @param {string} [userAgent] — the selected user agent string
509
571
  */
510
- generateSecChUaHeader() {
572
+ generateSecChUaHeader(userAgent = '') {
573
+ // Extract Chrome major version from the UA (e.g. "Chrome/121.0.0.0" → "121").
574
+ // Fall back to 121 if the UA is not a Chrome UA.
575
+ const match = userAgent.match(/Chrome\/(\d+)/i);
576
+ const version = match ? match[1] : '121';
577
+
511
578
  const brands = [
512
579
  { brand: 'Not_A Brand', version: '8' },
513
- { brand: 'Chromium', version: '120' },
514
- { brand: 'Google Chrome', version: '120' }
580
+ { brand: 'Chromium', version },
581
+ { brand: 'Google Chrome', version }
515
582
  ];
516
-
583
+
517
584
  return brands
518
585
  .map(b => `"${b.brand}";v="${b.version}"`)
519
586
  .join(', ');
@@ -522,14 +589,14 @@ export class StealthBrowserManager {
522
589
  /**
523
590
  * Generate sec-ch-ua-platform header
524
591
  */
525
- generateSecChUaPlatform() {
592
+ generateSecChUaPlatform(selectedOS) {
526
593
  const platforms = {
527
594
  windows: '"Windows"',
528
595
  macos: '"macOS"',
529
596
  linux: '"Linux"'
530
597
  };
531
-
532
- const selectedOS = this.weightedRandom(this.osDistribution);
598
+
599
+ selectedOS = selectedOS || this.weightedRandom(this.osDistribution);
533
600
  return platforms[selectedOS] || '"Windows"';
534
601
  }
535
602
 
@@ -746,7 +813,9 @@ export class StealthBrowserManager {
746
813
  /**
747
814
  * Generate realistic hardware fingerprint
748
815
  */
749
- generateHardwareFingerprint() {
816
+ generateHardwareFingerprint(selectedOS) {
817
+ selectedOS = selectedOS || this.weightedRandom(this.osDistribution);
818
+
750
819
  const processors = [
751
820
  { cores: 4, threads: 8, name: 'Intel(R) Core(TM) i5-8250U CPU @ 1.60GHz' },
752
821
  { cores: 6, threads: 12, name: 'Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz' },
@@ -755,31 +824,33 @@ export class StealthBrowserManager {
755
824
  { cores: 6, threads: 6, name: 'AMD Ryzen 5 3600 6-Core Processor' },
756
825
  { cores: 8, threads: 16, name: 'AMD Ryzen 7 3700X 8-Core Processor' }
757
826
  ];
758
-
827
+
759
828
  const selectedProcessor = processors[Math.floor(Math.random() * processors.length)];
760
-
829
+
761
830
  return {
762
831
  hardwareConcurrency: selectedProcessor.threads,
763
832
  processor: selectedProcessor.name,
764
- architecture: Math.random() < 0.9 ? 'x86_64' : 'arm64',
833
+ architecture: 'x86_64',
765
834
  memory: Math.floor(Math.random() * 24) + 8, // 8-32 GB
766
835
  deviceMemory: Math.pow(2, Math.floor(Math.random() * 3) + 3), // 8, 16, or 32 GB
767
- platform: this.selectRealisticPlatform()
836
+ platform: this.selectRealisticPlatform(selectedOS)
768
837
  };
769
838
  }
770
839
 
771
840
  /**
772
- * Select realistic platform based on distribution
841
+ * Map the chosen OS to its navigator.platform value so it stays consistent
842
+ * with the user-agent and sec-ch-ua-platform header.
773
843
  */
774
- selectRealisticPlatform() {
775
- const platforms = {
776
- 'Win32': 0.75,
777
- 'MacIntel': 0.15,
778
- 'Linux x86_64': 0.08,
779
- 'Linux armv7l': 0.02
780
- };
781
-
782
- return this.weightedRandom(platforms);
844
+ selectRealisticPlatform(selectedOS) {
845
+ switch (selectedOS) {
846
+ case 'macos':
847
+ return 'MacIntel';
848
+ case 'linux':
849
+ return 'Linux x86_64';
850
+ case 'windows':
851
+ default:
852
+ return 'Win32';
853
+ }
783
854
  }
784
855
 
785
856
  /**
@@ -4,7 +4,7 @@
4
4
  */
5
5
 
6
6
  import { SummarizerManager } from 'node-summarizer';
7
- import { franc } from 'franc';
7
+ import { franc, francAll } from 'franc';
8
8
  import nlp from 'compromise';
9
9
  import { z } from 'zod';
10
10
  import { splitSentences } from './sentenceUtils.js';
@@ -316,7 +316,7 @@ export class ContentAnalyzer {
316
316
  const confidence = Math.min(1, 0.5 + (text.length / 500) * 0.5);
317
317
 
318
318
  // Get alternative languages using franc.all
319
- const alternatives = franc.all(text, {
319
+ const alternatives = francAll(text, {
320
320
  minLength: 10,
321
321
  whitelist: Object.keys(LANGUAGE_NAMES)
322
322
  })
@@ -6,6 +6,9 @@ import { RobotsChecker } from '../../utils/robotsChecker.js';
6
6
  import { DomainFilter } from '../../utils/domainFilter.js';
7
7
  import { LinkAnalyzer } from '../analysis/LinkAnalyzer.js';
8
8
  import { normalizeUrl, extractLinks, isValidUrl } from '../../utils/urlNormalizer.js';
9
+ import { Logger } from '../../utils/Logger.js';
10
+
11
+ const logger = new Logger('BFSCrawler');
9
12
 
10
13
  export class BFSCrawler {
11
14
  constructor(options = {}) {
@@ -43,7 +46,10 @@ export class BFSCrawler {
43
46
 
44
47
  this.queue = new QueueManager({ concurrency, timeout });
45
48
  this.cache = new CacheManager({ ttl: 3600000 }); // 1 hour cache
49
+ // C1: per-domain rate-limiter map — reuse existing limiter when
50
+ // effectiveRateLimit hasn't changed, rather than recreating it on every URL.
46
51
  this.rateLimiter = new RateLimiter({ requestsPerSecond: 10 });
52
+ this._domainRateLimiters = new Map();
47
53
  this.robotsChecker = respectRobots ? new RobotsChecker(userAgent) : null;
48
54
 
49
55
  // Initialize domain filter (create new if not provided)
@@ -142,13 +148,13 @@ export class BFSCrawler {
142
148
  });
143
149
 
144
150
  if (!filterDecision.allowed) {
145
- console.error(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
151
+ logger.debug(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
146
152
  return;
147
153
  }
148
-
154
+
149
155
  // Backward compatibility: also check legacy patterns
150
156
  if (!this.shouldCrawlUrl(normalizedUrl)) {
151
- console.error(`Legacy pattern blocks: ${normalizedUrl}`);
157
+ logger.debug(`Legacy pattern blocks: ${normalizedUrl}`);
152
158
  return;
153
159
  }
154
160
 
@@ -156,7 +162,7 @@ export class BFSCrawler {
156
162
  if (this.respectRobots && this.robotsChecker) {
157
163
  const canFetch = await this.robotsChecker.canFetch(normalizedUrl);
158
164
  if (!canFetch) {
159
- console.error(`Robots.txt blocks: ${normalizedUrl}`);
165
+ logger.debug(`Robots.txt blocks: ${normalizedUrl}`);
160
166
  return;
161
167
  }
162
168
  }
@@ -171,17 +177,22 @@ export class BFSCrawler {
171
177
 
172
178
  if (!pageData) {
173
179
  // Apply domain-specific rate limiting
180
+ // C1: reuse per-domain limiter from the map to avoid recreating on each URL.
174
181
  const urlObj = new URL(normalizedUrl);
175
- const domainRules = this.domainFilter.getDomainRules(urlObj.hostname);
176
-
177
- // Use domain-specific rate limit if available
182
+ const domain = urlObj.hostname;
183
+ const domainRules = this.domainFilter.getDomainRules(domain);
178
184
  const effectiveRateLimit = domainRules.rateLimit || 10;
179
- if (this.rateLimiter.requestsPerSecond !== effectiveRateLimit) {
180
- // Update rate limiter for this domain
181
- this.rateLimiter = new RateLimiter({ requestsPerSecond: effectiveRateLimit });
185
+
186
+ if (!this._domainRateLimiters.has(domain)) {
187
+ this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
188
+ } else {
189
+ const existing = this._domainRateLimiters.get(domain);
190
+ if (existing.requestsPerSecond !== effectiveRateLimit) {
191
+ this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
192
+ }
182
193
  }
183
-
184
- await this.rateLimiter.checkLimit(normalizedUrl);
194
+
195
+ await this._domainRateLimiters.get(domain).checkLimit(normalizedUrl);
185
196
 
186
197
  // Fetch the page
187
198
  pageData = await this.fetchPage(normalizedUrl);
@@ -401,9 +401,10 @@ export class ContentProcessor {
401
401
 
402
402
  const avgWordsPerSentence = words.length / sentences.length;
403
403
  const avgCharsPerWord = charactersNoSpaces / words.length;
404
-
405
- // Simple readability score (lower is better)
406
- const readabilityScore = (avgWordsPerSentence * 1.015) + (avgCharsPerWord * 84.6) - 206.835;
404
+ const avgSyllablesPerWord = words.reduce((sum, w) => sum + this._countSyllables(w), 0) / words.length;
405
+
406
+ // Flesch Reading-Ease: higher score = easier to read
407
+ const readabilityScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);
407
408
 
408
409
  return {
409
410
  sentences: sentences.length,
@@ -412,6 +413,7 @@ export class ContentProcessor {
412
413
  charactersNoSpaces,
413
414
  avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
414
415
  avgCharsPerWord: Math.round(avgCharsPerWord * 100) / 100,
416
+ avgSyllablesPerWord: Math.round(avgSyllablesPerWord * 100) / 100,
415
417
  readabilityScore: Math.round(readabilityScore * 100) / 100,
416
418
  readabilityLevel: this.getReadabilityLevel(readabilityScore)
417
419
  };
@@ -432,6 +434,20 @@ export class ContentProcessor {
432
434
  return 'Very Difficult';
433
435
  }
434
436
 
437
+ /**
438
+ * Count syllables in a word (heuristic)
439
+ * @param {string} word
440
+ * @returns {number}
441
+ */
442
+ _countSyllables(word) {
443
+ const w = word.toLowerCase().replace(/[^a-z]/g, '');
444
+ if (w.length <= 3) return 1;
445
+ // Remove trailing silent e
446
+ const stripped = w.replace(/e$/, '');
447
+ const matches = stripped.match(/[aeiouy]+/g);
448
+ return Math.max(1, matches ? matches.length : 1);
449
+ }
450
+
435
451
  /**
436
452
  * Extract fallback content when Readability fails
437
453
  * @param {string} html - HTML content
@@ -16,6 +16,12 @@ const PDFProcessorSchema = z.object({
16
16
  extractText: z.boolean().default(true),
17
17
  password: z.string().optional(),
18
18
  maxPages: z.number().min(1).max(1000).default(100),
19
+ // C3: true page-range extraction (1-based, inclusive). When set, only the
20
+ // text from pages [start..end] is returned.
21
+ pageRange: z.object({
22
+ start: z.number().min(1).default(1),
23
+ end: z.number().min(1).optional()
24
+ }).optional(),
19
25
  parseOptions: z.object({
20
26
  normalizeWhitespace: z.boolean().default(true),
21
27
  disableCombineTextItems: z.boolean().default(false)
@@ -95,12 +101,29 @@ export class PDFProcessor {
95
101
  return result;
96
102
  }
97
103
 
104
+ // C3: when a page range is requested, capture per-page text so we can
105
+ // return exactly pages [start..end] (pdf-parse otherwise concatenates the
106
+ // whole document and its `max` option only caps the *upper* page bound).
107
+ const pageRange = processingOptions.pageRange;
108
+ const capturedPages = [];
109
+
98
110
  // Parse PDF with options
99
111
  const parseOptions = {
100
112
  ...processingOptions.parseOptions,
101
113
  max: processingOptions.maxPages
102
114
  };
103
115
 
116
+ // If extracting a range, raise `max` to at least the requested end page
117
+ // and install a pagerender that records each page's text.
118
+ if (pageRange) {
119
+ if (pageRange.end) {
120
+ parseOptions.max = Math.max(parseOptions.max, pageRange.end);
121
+ } else {
122
+ parseOptions.max = processingOptions.maxPages;
123
+ }
124
+ parseOptions.pagerender = (pageData) => this._renderPage(pageData, capturedPages);
125
+ }
126
+
104
127
  if (processingOptions.password) {
105
128
  parseOptions.password = processingOptions.password;
106
129
  }
@@ -118,7 +141,15 @@ export class PDFProcessor {
118
141
 
119
142
  // Extract text content
120
143
  if (processingOptions.extractText) {
121
- result.text = this.cleanPDFText(pdfData.text);
144
+ if (pageRange) {
145
+ const start = pageRange.start || 1;
146
+ const end = pageRange.end || capturedPages.length;
147
+ const slice = capturedPages.slice(start - 1, end);
148
+ result.text = this.cleanPDFText(slice.join('\n\n'));
149
+ result.extractedPages = { start, end, count: slice.length };
150
+ } else {
151
+ result.text = this.cleanPDFText(pdfData.text);
152
+ }
122
153
  }
123
154
 
124
155
  // Extract metadata
@@ -414,34 +445,52 @@ export class PDFProcessor {
414
445
  }
415
446
 
416
447
  /**
417
- * Extract specific pages from PDF
418
- * @param {Object} params - Processing parameters with page range
419
- * @returns {Promise<Object>} - Processing result for specified pages
448
+ * Render a single PDF page to text and record it.
449
+ * Mirrors pdf-parse's default render (newline on Y-position change) but
450
+ * accumulates per-page text so callers can slice a true page range.
451
+ * Note: like pdf-parse, this does not reconstruct multi-column / table
452
+ * layout — column order follows the PDF's text-item stream.
453
+ * @param {Object} pageData - pdf.js page proxy from pdf-parse
454
+ * @param {string[]} sink - array that receives this page's text
455
+ * @returns {Promise<string>}
456
+ */
457
+ async _renderPage(pageData, sink) {
458
+ const textContent = await pageData.getTextContent({
459
+ normalizeWhitespace: true,
460
+ disableCombineTextItems: false
461
+ });
462
+ let lastY;
463
+ let text = '';
464
+ for (const item of textContent.items) {
465
+ if (lastY === item.transform[5] || lastY === undefined) {
466
+ text += item.str;
467
+ } else {
468
+ text += '\n' + item.str;
469
+ }
470
+ lastY = item.transform[5];
471
+ }
472
+ sink.push(text);
473
+ // pdf-parse joins page renders with '\n\n' for pdfData.text
474
+ return text;
475
+ }
476
+
477
+ /**
478
+ * Extract a specific page range from a PDF (1-based, inclusive).
479
+ * @param {Object} params - Processing parameters
480
+ * @param {number} [params.startPage=1] - First page to include
481
+ * @param {number} [params.endPage] - Last page to include (defaults to end)
482
+ * @returns {Promise<Object>} - Processing result for the requested pages
420
483
  */
421
484
  async extractPDFPages(params) {
422
485
  const { startPage = 1, endPage, ...processingParams } = params;
423
-
424
- // Override parse options to limit page range
425
- const options = {
426
- ...processingParams.options,
427
- parseOptions: {
428
- ...processingParams.options?.parseOptions,
429
- max: endPage || processingParams.options?.maxPages || 100
430
- }
431
- };
432
486
 
433
- const result = await this.processPDF({
487
+ return this.processPDF({
434
488
  ...processingParams,
435
- options
489
+ options: {
490
+ ...processingParams.options,
491
+ pageRange: { start: startPage, ...(endPage ? { end: endPage } : {}) }
492
+ }
436
493
  });
437
-
438
- if (result.success && result.text && startPage > 1) {
439
- // This is a simplified approach - pdf-parse doesn't provide per-page text
440
- // For proper page-by-page extraction, consider using pdf2pic or pdf-poppler
441
- console.warn('Page-specific extraction is limited with current PDF parser');
442
- }
443
-
444
- return result;
445
494
  }
446
495
  }
447
496
 
@@ -121,8 +121,16 @@ const ScrapeWithActionsSchema = z.object({
121
121
  captureIntermediateStates: z.boolean().default(false),
122
122
  captureScreenshots: z.boolean().default(true),
123
123
 
124
- // Form auto-fill
125
- formAutoFill: z.record(z.string()).optional(),
124
+ // Form auto-fill — structured shape ({fields:[{selector,value,...}], submitSelector, waitAfterSubmit}).
125
+ // A flat z.record(string) of selector→value is still accepted for backward compatibility.
126
+ formAutoFill: z.union([
127
+ z.object({
128
+ fields: z.array(FormFieldSchema),
129
+ submitSelector: z.string().optional(),
130
+ waitAfterSubmit: z.number().min(0).max(30000).default(2000)
131
+ }),
132
+ z.record(z.string())
133
+ ]).optional(),
126
134
 
127
135
  // Browser options
128
136
  browserOptions: z.object({
@@ -386,8 +394,9 @@ export class ScrapeWithActionsTool extends EventEmitter {
386
394
  const intermediateStates = params.captureIntermediateStates ?
387
395
  await this.extractIntermediateStates(actionResults, params) : [];
388
396
 
389
- // Get final page content after all actions
390
- const finalContent = await this.extractFinalContent(params);
397
+ // Get final page content after all actions (reads the post-action live page
398
+ // captured by ActionExecutor, falling back to a fresh fetch only if missing).
399
+ const finalContent = await this.extractFinalContent(params, chainResult);
391
400
 
392
401
  // Generate different formats
393
402
  const content = this.generateFormats(finalContent, params.formats, {
@@ -446,21 +455,37 @@ export class ScrapeWithActionsTool extends EventEmitter {
446
455
 
447
456
  insertFormAutoFillActions(actions, formAutoFill) {
448
457
  const fillActions = [];
449
-
450
- // Convert object with key-value pairs to fill actions
451
- for (const [selector, value] of Object.entries(formAutoFill)) {
452
- if (selector === 'submitSelector' || selector === 'waitAfterSubmit') {
453
- continue; // Skip special keys
458
+
459
+ if (Array.isArray(formAutoFill.fields)) {
460
+ // Structured shape: { fields: [{selector, value, type, waitAfter}], submitSelector, waitAfterSubmit }
461
+ for (const field of formAutoFill.fields) {
462
+ fillActions.push({
463
+ type: 'type',
464
+ selector: field.selector,
465
+ text: field.value,
466
+ description: `Auto-fill field: ${field.selector}`,
467
+ continueOnError: true,
468
+ retries: 1
469
+ });
470
+ if (field.waitAfter) {
471
+ fillActions.push({ type: 'wait', duration: field.waitAfter });
472
+ }
473
+ }
474
+ } else {
475
+ // Backward-compatible flat shape: { selector: value, ... }
476
+ for (const [selector, value] of Object.entries(formAutoFill)) {
477
+ if (selector === 'submitSelector' || selector === 'waitAfterSubmit' || selector === 'fields') {
478
+ continue; // Skip special keys
479
+ }
480
+ fillActions.push({
481
+ type: 'type',
482
+ selector,
483
+ text: value,
484
+ description: `Auto-fill field: ${selector}`,
485
+ continueOnError: true,
486
+ retries: 1
487
+ });
454
488
  }
455
-
456
- fillActions.push({
457
- type: 'type',
458
- selector,
459
- text: value,
460
- description: `Auto-fill field: ${selector}`,
461
- continueOnError: true,
462
- retries: 1
463
- });
464
489
  }
465
490
 
466
491
  // Add submit action if specified
@@ -585,16 +610,29 @@ export class ScrapeWithActionsTool extends EventEmitter {
585
610
  return states;
586
611
  }
587
612
 
588
- async extractFinalContent(params) {
613
+ async extractFinalContent(params, chainResult = null) {
589
614
  try {
615
+ const options = {
616
+ includeMetadata: params.extractionOptions?.includeMetadata !== false,
617
+ includeLinks: params.extractionOptions?.includeLinks !== false,
618
+ includeImages: params.extractionOptions?.includeImages !== false,
619
+ customSelectors: params.extractionOptions?.selectors
620
+ };
621
+
622
+ // Prefer the post-action live page HTML captured during action execution.
623
+ // This ensures the final content reflects clicks/typing/navigation rather
624
+ // than re-fetching the original (pre-action) URL.
625
+ if (chainResult?.finalHtml) {
626
+ return await this.extractContentTool.execute({
627
+ url: chainResult.finalUrl || params.url,
628
+ html: chainResult.finalHtml,
629
+ options
630
+ });
631
+ }
632
+
590
633
  const extractResult = await this.extractContentTool.execute({
591
634
  url: params.url,
592
- options: {
593
- includeMetadata: params.extractionOptions?.includeMetadata !== false,
594
- includeLinks: params.extractionOptions?.includeLinks !== false,
595
- includeImages: params.extractionOptions?.includeImages !== false,
596
- customSelectors: params.extractionOptions?.selectors
597
- }
635
+ options
598
636
  });
599
637
 
600
638
  return extractResult;