crawlforge-mcp-server 4.2.11 → 4.5.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/server.js +152 -21
- package/src/constants/config.js +5 -0
- package/src/core/ActionExecutor.js +13 -1
- package/src/core/ChangeTracker.js +8 -5
- package/src/core/LLMsTxtAnalyzer.js +71 -47
- package/src/core/LocalizationManager.js +7 -4
- package/src/core/ResearchOrchestrator.js +10 -6
- package/src/core/StealthBrowserManager.js +111 -40
- package/src/core/analysis/ContentAnalyzer.js +2 -2
- package/src/core/crawlers/BFSCrawler.js +23 -12
- package/src/core/processing/ContentProcessor.js +19 -3
- package/src/core/processing/PDFProcessor.js +72 -23
- package/src/tools/advanced/ScrapeWithActionsTool.js +63 -25
- package/src/tools/advanced/batchScrape/index.js +3 -1
- package/src/tools/advanced/batchScrape/reporter.js +5 -1
- package/src/tools/advanced/batchScrape/worker.js +6 -1
- package/src/tools/basic/_fetch.js +78 -5
- package/src/tools/basic/extractLinks.js +1 -1
- package/src/tools/basic/extractMetadata.js +65 -1
- package/src/tools/basic/extractText.js +61 -5
- package/src/tools/basic/scrapeStructured.js +48 -10
- package/src/tools/crawl/crawlDeep.js +13 -5
- package/src/tools/crawl/mapSite.js +24 -51
- package/src/tools/extract/analyzeContent.js +11 -6
- package/src/tools/extract/extractContent.js +23 -5
- package/src/tools/extract/extractStructured.js +65 -16
- package/src/tools/extract/extractWithLlm.js +192 -11
- package/src/tools/extract/listOllamaModels.js +19 -8
- package/src/tools/extract/processDocument.js +10 -4
- package/src/tools/extract/summarizeContent.js +58 -1
- package/src/tools/llmstxt/generateLLMsTxt.js +124 -3
- package/src/tools/research/deepResearch.js +43 -4
- package/src/tools/search/providers/searxng.js +2 -2
- package/src/tools/search/ranking/ResultDeduplicator.js +32 -9
- package/src/tools/search/ranking/ResultRanker.js +13 -4
- package/src/tools/search/searchWeb.js +5 -5
- package/src/tools/templates/TemplateRegistry.js +3 -2
- package/src/tools/tracking/trackChanges/differ.js +33 -1
- package/src/utils/htmlToMarkdown.js +5 -1
|
@@ -47,14 +47,17 @@ const StealthConfigSchema = z.object({
|
|
|
47
47
|
spoofMediaDevices: z.boolean().default(true),
|
|
48
48
|
spoofBatteryAPI: z.boolean().default(true)
|
|
49
49
|
}).optional(),
|
|
50
|
-
|
|
50
|
+
|
|
51
51
|
fingerprinting: z.object({
|
|
52
52
|
canvasNoise: z.boolean().default(true),
|
|
53
53
|
webglSpoofing: z.boolean().default(true),
|
|
54
54
|
audioContextSpoofing: z.boolean().default(true),
|
|
55
55
|
fontSpoofing: z.boolean().default(true),
|
|
56
56
|
hardwareSpoofing: z.boolean().default(true)
|
|
57
|
-
}).optional()
|
|
57
|
+
}).optional(),
|
|
58
|
+
|
|
59
|
+
// C2: browser engine selection — 'chromium' (default) or 'camoufox' (Firefox-based)
|
|
60
|
+
engine: z.enum(['chromium', 'camoufox']).optional().default('chromium')
|
|
58
61
|
});
|
|
59
62
|
|
|
60
63
|
export class StealthBrowserManager {
|
|
@@ -232,16 +235,41 @@ export class StealthBrowserManager {
|
|
|
232
235
|
}
|
|
233
236
|
|
|
234
237
|
/**
|
|
235
|
-
* Launch stealth browser with anti-detection configurations
|
|
238
|
+
* Launch stealth browser with anti-detection configurations.
|
|
239
|
+
* C2: honours config.engine — 'chromium' (default) or 'camoufox' (Firefox-based).
|
|
236
240
|
*/
|
|
237
241
|
async launchStealthBrowser(config = {}) {
|
|
242
|
+
const validatedConfig = StealthConfigSchema.parse({ ...this.defaultConfig, ...config });
|
|
243
|
+
|
|
244
|
+
// C2: if the requested engine differs from the running browser, tear it down first.
|
|
245
|
+
if (this.browser && this._launchedEngine && this._launchedEngine !== validatedConfig.engine) {
|
|
246
|
+
await this.browser.close().catch(() => {});
|
|
247
|
+
this.browser = null;
|
|
248
|
+
}
|
|
249
|
+
|
|
238
250
|
if (this.browser) {
|
|
239
251
|
return this.browser;
|
|
240
252
|
}
|
|
241
253
|
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
254
|
+
// C2: delegate to CamoufoxAdapter when engine === 'camoufox'
|
|
255
|
+
if (validatedConfig.engine === 'camoufox') {
|
|
256
|
+
const adapter = new CamoufoxAdapter();
|
|
257
|
+
const available = await adapter.isAvailable();
|
|
258
|
+
if (!available) {
|
|
259
|
+
throw new Error(
|
|
260
|
+
'camoufox is not installed. Run: npm install camoufox to use the Firefox-based stealth engine.'
|
|
261
|
+
);
|
|
262
|
+
}
|
|
263
|
+
this.browser = await adapter.launch({
|
|
264
|
+
headless: true,
|
|
265
|
+
launchOptions: {}
|
|
266
|
+
});
|
|
267
|
+
this._launchedEngine = 'camoufox';
|
|
268
|
+
return this.browser;
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
this._launchedEngine = 'chromium';
|
|
272
|
+
// Base browser args for stealth (Chromium path)
|
|
245
273
|
const stealthArgs = [
|
|
246
274
|
'--no-sandbox',
|
|
247
275
|
'--disable-dev-shm-usage',
|
|
@@ -390,8 +418,11 @@ export class StealthBrowserManager {
|
|
|
390
418
|
* Generate advanced browser fingerprint with enhanced randomization
|
|
391
419
|
*/
|
|
392
420
|
generateAdvancedFingerprint(config = {}) {
|
|
421
|
+
// Select the OS once and thread it through UA, headers, and hardware so
|
|
422
|
+
// navigator.platform / sec-ch-ua-platform / userAgent stay consistent.
|
|
423
|
+
const selectedOS = this.selectOS(config);
|
|
393
424
|
const fingerprint = {
|
|
394
|
-
userAgent: this.selectRealisticUserAgent(config),
|
|
425
|
+
userAgent: this.selectRealisticUserAgent(config, selectedOS),
|
|
395
426
|
viewport: config.customViewport || this.selectWeightedViewport(),
|
|
396
427
|
timezone: config.timezone || this.selectTimezone(),
|
|
397
428
|
deviceScaleFactor: this.randomFloat(1, 2, 1),
|
|
@@ -400,13 +431,13 @@ export class StealthBrowserManager {
|
|
|
400
431
|
colorScheme: Math.random() < 0.3 ? 'dark' : 'light',
|
|
401
432
|
reducedMotion: Math.random() < 0.1 ? 'reduce' : 'no-preference',
|
|
402
433
|
forcedColors: Math.random() < 0.05 ? 'active' : 'none',
|
|
403
|
-
headers: this.generateAdvancedHeaders(config),
|
|
434
|
+
headers: this.generateAdvancedHeaders(config, selectedOS),
|
|
404
435
|
webRTC: this.generateWebRTCConfig(config),
|
|
405
436
|
canvas: this.generateAdvancedCanvasFingerprint(),
|
|
406
437
|
webGL: this.generateAdvancedWebGLFingerprint(),
|
|
407
438
|
audioContext: this.generateAudioContextFingerprint(),
|
|
408
439
|
mediaDevices: this.generateMediaDevicesFingerprint(),
|
|
409
|
-
hardware: this.generateHardwareFingerprint(),
|
|
440
|
+
hardware: this.generateHardwareFingerprint(selectedOS),
|
|
410
441
|
fonts: this.generateAdvancedFontList(),
|
|
411
442
|
plugins: this.generateAdvancedPluginList(),
|
|
412
443
|
geolocation: this.generateRealisticGeolocation(),
|
|
@@ -417,10 +448,34 @@ export class StealthBrowserManager {
|
|
|
417
448
|
return fingerprint;
|
|
418
449
|
}
|
|
419
450
|
|
|
451
|
+
/**
|
|
452
|
+
* Choose a single OS ('windows' | 'macos' | 'linux') for a fingerprint.
|
|
453
|
+
* A custom UA pins the OS to whatever that UA reports; a non-random UA pins
|
|
454
|
+
* to windows (the default pool below); otherwise weighted-random.
|
|
455
|
+
*/
|
|
456
|
+
selectOS(config = {}) {
|
|
457
|
+
if (config.customUserAgent) {
|
|
458
|
+
return this.inferOSFromUserAgent(config.customUserAgent);
|
|
459
|
+
}
|
|
460
|
+
if (!config.useRandomUserAgent) {
|
|
461
|
+
return 'windows';
|
|
462
|
+
}
|
|
463
|
+
return this.weightedRandom(this.osDistribution);
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
/**
|
|
467
|
+
* Infer the OS key from a user-agent string.
|
|
468
|
+
*/
|
|
469
|
+
inferOSFromUserAgent(ua = '') {
|
|
470
|
+
if (/Macintosh|Mac OS X/i.test(ua)) return 'macos';
|
|
471
|
+
if (/Linux|X11|CrOS/i.test(ua)) return 'linux';
|
|
472
|
+
return 'windows';
|
|
473
|
+
}
|
|
474
|
+
|
|
420
475
|
/**
|
|
421
476
|
* Select realistic user agent based on market distribution
|
|
422
477
|
*/
|
|
423
|
-
selectRealisticUserAgent(config) {
|
|
478
|
+
selectRealisticUserAgent(config, selectedOS) {
|
|
424
479
|
if (config.customUserAgent) {
|
|
425
480
|
return config.customUserAgent;
|
|
426
481
|
}
|
|
@@ -429,9 +484,10 @@ export class StealthBrowserManager {
|
|
|
429
484
|
return this.userAgentPools.chrome.windows[0];
|
|
430
485
|
}
|
|
431
486
|
|
|
432
|
-
//
|
|
433
|
-
|
|
434
|
-
|
|
487
|
+
// Use the OS chosen once for this fingerprint (falls back to a fresh draw
|
|
488
|
+
// if called without one, preserving the original standalone behavior).
|
|
489
|
+
selectedOS = selectedOS || this.weightedRandom(this.osDistribution);
|
|
490
|
+
|
|
435
491
|
// Select browser based on distribution and OS compatibility
|
|
436
492
|
let availableBrowsers = { ...this.browserDistribution };
|
|
437
493
|
if (selectedOS === 'linux' && availableBrowsers.safari) {
|
|
@@ -469,7 +525,10 @@ export class StealthBrowserManager {
|
|
|
469
525
|
/**
|
|
470
526
|
* Generate advanced HTTP headers with realistic patterns
|
|
471
527
|
*/
|
|
472
|
-
generateAdvancedHeaders(config) {
|
|
528
|
+
generateAdvancedHeaders(config, selectedOS) {
|
|
529
|
+
// Resolve the UA first so sec-ch-ua brand version can match.
|
|
530
|
+
const resolvedUA = this.selectRealisticUserAgent(config, selectedOS);
|
|
531
|
+
|
|
473
532
|
const headers = {
|
|
474
533
|
'Accept-Language': `${(config.locale || 'en-US').toLowerCase()},en;q=0.9`,
|
|
475
534
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
|
@@ -481,11 +540,11 @@ export class StealthBrowserManager {
|
|
|
481
540
|
'Sec-Fetch-Site': 'none',
|
|
482
541
|
'Sec-Fetch-User': '?1',
|
|
483
542
|
'sec-ch-ua-mobile': '?0',
|
|
484
|
-
'sec-ch-ua-platform': this.generateSecChUaPlatform()
|
|
543
|
+
'sec-ch-ua-platform': this.generateSecChUaPlatform(selectedOS)
|
|
485
544
|
};
|
|
486
545
|
|
|
487
|
-
//
|
|
488
|
-
headers['sec-ch-ua'] = this.generateSecChUaHeader();
|
|
546
|
+
// C2: pass UA so sec-ch-ua brand version matches the Chrome major version.
|
|
547
|
+
headers['sec-ch-ua'] = this.generateSecChUaHeader(resolvedUA);
|
|
489
548
|
|
|
490
549
|
// Randomize some headers
|
|
491
550
|
if (Math.random() < 0.25) {
|
|
@@ -505,15 +564,23 @@ export class StealthBrowserManager {
|
|
|
505
564
|
}
|
|
506
565
|
|
|
507
566
|
/**
|
|
508
|
-
* Generate sec-ch-ua header
|
|
567
|
+
* Generate sec-ch-ua header.
|
|
568
|
+
* C2: brand versions are derived from the UA's Chrome major version so
|
|
569
|
+
* sec-ch-ua and the User-Agent header stay consistent.
|
|
570
|
+
* @param {string} [userAgent] — the selected user agent string
|
|
509
571
|
*/
|
|
510
|
-
generateSecChUaHeader() {
|
|
572
|
+
generateSecChUaHeader(userAgent = '') {
|
|
573
|
+
// Extract Chrome major version from the UA (e.g. "Chrome/121.0.0.0" → "121").
|
|
574
|
+
// Fall back to 121 if the UA is not a Chrome UA.
|
|
575
|
+
const match = userAgent.match(/Chrome\/(\d+)/i);
|
|
576
|
+
const version = match ? match[1] : '121';
|
|
577
|
+
|
|
511
578
|
const brands = [
|
|
512
579
|
{ brand: 'Not_A Brand', version: '8' },
|
|
513
|
-
{ brand: 'Chromium', version
|
|
514
|
-
{ brand: 'Google Chrome', version
|
|
580
|
+
{ brand: 'Chromium', version },
|
|
581
|
+
{ brand: 'Google Chrome', version }
|
|
515
582
|
];
|
|
516
|
-
|
|
583
|
+
|
|
517
584
|
return brands
|
|
518
585
|
.map(b => `"${b.brand}";v="${b.version}"`)
|
|
519
586
|
.join(', ');
|
|
@@ -522,14 +589,14 @@ export class StealthBrowserManager {
|
|
|
522
589
|
/**
|
|
523
590
|
* Generate sec-ch-ua-platform header
|
|
524
591
|
*/
|
|
525
|
-
generateSecChUaPlatform() {
|
|
592
|
+
generateSecChUaPlatform(selectedOS) {
|
|
526
593
|
const platforms = {
|
|
527
594
|
windows: '"Windows"',
|
|
528
595
|
macos: '"macOS"',
|
|
529
596
|
linux: '"Linux"'
|
|
530
597
|
};
|
|
531
|
-
|
|
532
|
-
|
|
598
|
+
|
|
599
|
+
selectedOS = selectedOS || this.weightedRandom(this.osDistribution);
|
|
533
600
|
return platforms[selectedOS] || '"Windows"';
|
|
534
601
|
}
|
|
535
602
|
|
|
@@ -746,7 +813,9 @@ export class StealthBrowserManager {
|
|
|
746
813
|
/**
|
|
747
814
|
* Generate realistic hardware fingerprint
|
|
748
815
|
*/
|
|
749
|
-
generateHardwareFingerprint() {
|
|
816
|
+
generateHardwareFingerprint(selectedOS) {
|
|
817
|
+
selectedOS = selectedOS || this.weightedRandom(this.osDistribution);
|
|
818
|
+
|
|
750
819
|
const processors = [
|
|
751
820
|
{ cores: 4, threads: 8, name: 'Intel(R) Core(TM) i5-8250U CPU @ 1.60GHz' },
|
|
752
821
|
{ cores: 6, threads: 12, name: 'Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz' },
|
|
@@ -755,31 +824,33 @@ export class StealthBrowserManager {
|
|
|
755
824
|
{ cores: 6, threads: 6, name: 'AMD Ryzen 5 3600 6-Core Processor' },
|
|
756
825
|
{ cores: 8, threads: 16, name: 'AMD Ryzen 7 3700X 8-Core Processor' }
|
|
757
826
|
];
|
|
758
|
-
|
|
827
|
+
|
|
759
828
|
const selectedProcessor = processors[Math.floor(Math.random() * processors.length)];
|
|
760
|
-
|
|
829
|
+
|
|
761
830
|
return {
|
|
762
831
|
hardwareConcurrency: selectedProcessor.threads,
|
|
763
832
|
processor: selectedProcessor.name,
|
|
764
|
-
architecture:
|
|
833
|
+
architecture: 'x86_64',
|
|
765
834
|
memory: Math.floor(Math.random() * 24) + 8, // 8-32 GB
|
|
766
835
|
deviceMemory: Math.pow(2, Math.floor(Math.random() * 3) + 3), // 8, 16, or 32 GB
|
|
767
|
-
platform: this.selectRealisticPlatform()
|
|
836
|
+
platform: this.selectRealisticPlatform(selectedOS)
|
|
768
837
|
};
|
|
769
838
|
}
|
|
770
839
|
|
|
771
840
|
/**
|
|
772
|
-
*
|
|
841
|
+
* Map the chosen OS to its navigator.platform value so it stays consistent
|
|
842
|
+
* with the user-agent and sec-ch-ua-platform header.
|
|
773
843
|
*/
|
|
774
|
-
selectRealisticPlatform() {
|
|
775
|
-
|
|
776
|
-
'
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
844
|
+
selectRealisticPlatform(selectedOS) {
|
|
845
|
+
switch (selectedOS) {
|
|
846
|
+
case 'macos':
|
|
847
|
+
return 'MacIntel';
|
|
848
|
+
case 'linux':
|
|
849
|
+
return 'Linux x86_64';
|
|
850
|
+
case 'windows':
|
|
851
|
+
default:
|
|
852
|
+
return 'Win32';
|
|
853
|
+
}
|
|
783
854
|
}
|
|
784
855
|
|
|
785
856
|
/**
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
6
|
import { SummarizerManager } from 'node-summarizer';
|
|
7
|
-
import { franc } from 'franc';
|
|
7
|
+
import { franc, francAll } from 'franc';
|
|
8
8
|
import nlp from 'compromise';
|
|
9
9
|
import { z } from 'zod';
|
|
10
10
|
import { splitSentences } from './sentenceUtils.js';
|
|
@@ -316,7 +316,7 @@ export class ContentAnalyzer {
|
|
|
316
316
|
const confidence = Math.min(1, 0.5 + (text.length / 500) * 0.5);
|
|
317
317
|
|
|
318
318
|
// Get alternative languages using francAll (franc's ranked-candidates export)
|
|
319
|
-
const alternatives =
|
|
319
|
+
const alternatives = francAll(text, {
|
|
320
320
|
minLength: 10,
|
|
321
321
|
whitelist: Object.keys(LANGUAGE_NAMES)
|
|
322
322
|
})
|
|
@@ -6,6 +6,9 @@ import { RobotsChecker } from '../../utils/robotsChecker.js';
|
|
|
6
6
|
import { DomainFilter } from '../../utils/domainFilter.js';
|
|
7
7
|
import { LinkAnalyzer } from '../analysis/LinkAnalyzer.js';
|
|
8
8
|
import { normalizeUrl, extractLinks, isValidUrl } from '../../utils/urlNormalizer.js';
|
|
9
|
+
import { Logger } from '../../utils/Logger.js';
|
|
10
|
+
|
|
11
|
+
const logger = new Logger('BFSCrawler');
|
|
9
12
|
|
|
10
13
|
export class BFSCrawler {
|
|
11
14
|
constructor(options = {}) {
|
|
@@ -43,7 +46,10 @@ export class BFSCrawler {
|
|
|
43
46
|
|
|
44
47
|
this.queue = new QueueManager({ concurrency, timeout });
|
|
45
48
|
this.cache = new CacheManager({ ttl: 3600000 }); // 1 hour cache
|
|
49
|
+
// C1: per-domain rate-limiter map — reuse existing limiter when
|
|
50
|
+
// effectiveRateLimit hasn't changed, rather than recreating it on every URL.
|
|
46
51
|
this.rateLimiter = new RateLimiter({ requestsPerSecond: 10 });
|
|
52
|
+
this._domainRateLimiters = new Map();
|
|
47
53
|
this.robotsChecker = respectRobots ? new RobotsChecker(userAgent) : null;
|
|
48
54
|
|
|
49
55
|
// Initialize domain filter (create new if not provided)
|
|
@@ -142,13 +148,13 @@ export class BFSCrawler {
|
|
|
142
148
|
});
|
|
143
149
|
|
|
144
150
|
if (!filterDecision.allowed) {
|
|
145
|
-
|
|
151
|
+
logger.debug(`Domain filter blocks: ${normalizedUrl} - ${filterDecision.reason}`);
|
|
146
152
|
return;
|
|
147
153
|
}
|
|
148
|
-
|
|
154
|
+
|
|
149
155
|
// Backward compatibility: also check legacy patterns
|
|
150
156
|
if (!this.shouldCrawlUrl(normalizedUrl)) {
|
|
151
|
-
|
|
157
|
+
logger.debug(`Legacy pattern blocks: ${normalizedUrl}`);
|
|
152
158
|
return;
|
|
153
159
|
}
|
|
154
160
|
|
|
@@ -156,7 +162,7 @@ export class BFSCrawler {
|
|
|
156
162
|
if (this.respectRobots && this.robotsChecker) {
|
|
157
163
|
const canFetch = await this.robotsChecker.canFetch(normalizedUrl);
|
|
158
164
|
if (!canFetch) {
|
|
159
|
-
|
|
165
|
+
logger.debug(`Robots.txt blocks: ${normalizedUrl}`);
|
|
160
166
|
return;
|
|
161
167
|
}
|
|
162
168
|
}
|
|
@@ -171,17 +177,22 @@ export class BFSCrawler {
|
|
|
171
177
|
|
|
172
178
|
if (!pageData) {
|
|
173
179
|
// Apply domain-specific rate limiting
|
|
180
|
+
// C1: reuse per-domain limiter from the map to avoid recreating on each URL.
|
|
174
181
|
const urlObj = new URL(normalizedUrl);
|
|
175
|
-
const
|
|
176
|
-
|
|
177
|
-
// Use domain-specific rate limit if available
|
|
182
|
+
const domain = urlObj.hostname;
|
|
183
|
+
const domainRules = this.domainFilter.getDomainRules(domain);
|
|
178
184
|
const effectiveRateLimit = domainRules.rateLimit || 10;
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
this.
|
|
185
|
+
|
|
186
|
+
if (!this._domainRateLimiters.has(domain)) {
|
|
187
|
+
this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
|
|
188
|
+
} else {
|
|
189
|
+
const existing = this._domainRateLimiters.get(domain);
|
|
190
|
+
if (existing.requestsPerSecond !== effectiveRateLimit) {
|
|
191
|
+
this._domainRateLimiters.set(domain, new RateLimiter({ requestsPerSecond: effectiveRateLimit }));
|
|
192
|
+
}
|
|
182
193
|
}
|
|
183
|
-
|
|
184
|
-
await this.
|
|
194
|
+
|
|
195
|
+
await this._domainRateLimiters.get(domain).checkLimit(normalizedUrl);
|
|
185
196
|
|
|
186
197
|
// Fetch the page
|
|
187
198
|
pageData = await this.fetchPage(normalizedUrl);
|
|
@@ -401,9 +401,10 @@ export class ContentProcessor {
|
|
|
401
401
|
|
|
402
402
|
const avgWordsPerSentence = words.length / sentences.length;
|
|
403
403
|
const avgCharsPerWord = charactersNoSpaces / words.length;
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
404
|
+
const avgSyllablesPerWord = words.reduce((sum, w) => sum + this._countSyllables(w), 0) / words.length;
|
|
405
|
+
|
|
406
|
+
// Flesch Reading-Ease: higher score = easier to read
|
|
407
|
+
const readabilityScore = 206.835 - (1.015 * avgWordsPerSentence) - (84.6 * avgSyllablesPerWord);
|
|
407
408
|
|
|
408
409
|
return {
|
|
409
410
|
sentences: sentences.length,
|
|
@@ -412,6 +413,7 @@ export class ContentProcessor {
|
|
|
412
413
|
charactersNoSpaces,
|
|
413
414
|
avgWordsPerSentence: Math.round(avgWordsPerSentence * 100) / 100,
|
|
414
415
|
avgCharsPerWord: Math.round(avgCharsPerWord * 100) / 100,
|
|
416
|
+
avgSyllablesPerWord: Math.round(avgSyllablesPerWord * 100) / 100,
|
|
415
417
|
readabilityScore: Math.round(readabilityScore * 100) / 100,
|
|
416
418
|
readabilityLevel: this.getReadabilityLevel(readabilityScore)
|
|
417
419
|
};
|
|
@@ -432,6 +434,20 @@ export class ContentProcessor {
|
|
|
432
434
|
return 'Very Difficult';
|
|
433
435
|
}
|
|
434
436
|
|
|
437
|
+
/**
|
|
438
|
+
* Count syllables in a word (heuristic)
|
|
439
|
+
* @param {string} word
|
|
440
|
+
* @returns {number}
|
|
441
|
+
*/
|
|
442
|
+
_countSyllables(word) {
|
|
443
|
+
const w = word.toLowerCase().replace(/[^a-z]/g, '');
|
|
444
|
+
if (w.length <= 3) return 1;
|
|
445
|
+
// Remove trailing silent e
|
|
446
|
+
const stripped = w.replace(/e$/, '');
|
|
447
|
+
const matches = stripped.match(/[aeiouy]+/g);
|
|
448
|
+
return Math.max(1, matches ? matches.length : 1);
|
|
449
|
+
}
|
|
450
|
+
|
|
435
451
|
/**
|
|
436
452
|
* Extract fallback content when Readability fails
|
|
437
453
|
* @param {string} html - HTML content
|
|
@@ -16,6 +16,12 @@ const PDFProcessorSchema = z.object({
|
|
|
16
16
|
extractText: z.boolean().default(true),
|
|
17
17
|
password: z.string().optional(),
|
|
18
18
|
maxPages: z.number().min(1).max(1000).default(100),
|
|
19
|
+
// C3: true page-range extraction (1-based, inclusive). When set, only the
|
|
20
|
+
// text from pages [start..end] is returned.
|
|
21
|
+
pageRange: z.object({
|
|
22
|
+
start: z.number().min(1).default(1),
|
|
23
|
+
end: z.number().min(1).optional()
|
|
24
|
+
}).optional(),
|
|
19
25
|
parseOptions: z.object({
|
|
20
26
|
normalizeWhitespace: z.boolean().default(true),
|
|
21
27
|
disableCombineTextItems: z.boolean().default(false)
|
|
@@ -95,12 +101,29 @@ export class PDFProcessor {
|
|
|
95
101
|
return result;
|
|
96
102
|
}
|
|
97
103
|
|
|
104
|
+
// C3: when a page range is requested, capture per-page text so we can
|
|
105
|
+
// return exactly pages [start..end] (pdf-parse otherwise concatenates the
|
|
106
|
+
// whole document and its `max` option only caps the *upper* page bound).
|
|
107
|
+
const pageRange = processingOptions.pageRange;
|
|
108
|
+
const capturedPages = [];
|
|
109
|
+
|
|
98
110
|
// Parse PDF with options
|
|
99
111
|
const parseOptions = {
|
|
100
112
|
...processingOptions.parseOptions,
|
|
101
113
|
max: processingOptions.maxPages
|
|
102
114
|
};
|
|
103
115
|
|
|
116
|
+
// If extracting a range, raise `max` to at least the requested end page
|
|
117
|
+
// and install a pagerender that records each page's text.
|
|
118
|
+
if (pageRange) {
|
|
119
|
+
if (pageRange.end) {
|
|
120
|
+
parseOptions.max = Math.max(parseOptions.max, pageRange.end);
|
|
121
|
+
} else {
|
|
122
|
+
parseOptions.max = processingOptions.maxPages;
|
|
123
|
+
}
|
|
124
|
+
parseOptions.pagerender = (pageData) => this._renderPage(pageData, capturedPages);
|
|
125
|
+
}
|
|
126
|
+
|
|
104
127
|
if (processingOptions.password) {
|
|
105
128
|
parseOptions.password = processingOptions.password;
|
|
106
129
|
}
|
|
@@ -118,7 +141,15 @@ export class PDFProcessor {
|
|
|
118
141
|
|
|
119
142
|
// Extract text content
|
|
120
143
|
if (processingOptions.extractText) {
|
|
121
|
-
|
|
144
|
+
if (pageRange) {
|
|
145
|
+
const start = pageRange.start || 1;
|
|
146
|
+
const end = pageRange.end || capturedPages.length;
|
|
147
|
+
const slice = capturedPages.slice(start - 1, end);
|
|
148
|
+
result.text = this.cleanPDFText(slice.join('\n\n'));
|
|
149
|
+
result.extractedPages = { start, end, count: slice.length };
|
|
150
|
+
} else {
|
|
151
|
+
result.text = this.cleanPDFText(pdfData.text);
|
|
152
|
+
}
|
|
122
153
|
}
|
|
123
154
|
|
|
124
155
|
// Extract metadata
|
|
@@ -414,34 +445,52 @@ export class PDFProcessor {
|
|
|
414
445
|
}
|
|
415
446
|
|
|
416
447
|
/**
|
|
417
|
-
*
|
|
418
|
-
*
|
|
419
|
-
*
|
|
448
|
+
* Render a single PDF page to text and record it.
|
|
449
|
+
* Mirrors pdf-parse's default render (newline on Y-position change) but
|
|
450
|
+
* accumulates per-page text so callers can slice a true page range.
|
|
451
|
+
* Note: like pdf-parse, this does not reconstruct multi-column / table
|
|
452
|
+
* layout — column order follows the PDF's text-item stream.
|
|
453
|
+
* @param {Object} pageData - pdf.js page proxy from pdf-parse
|
|
454
|
+
* @param {string[]} sink - array that receives this page's text
|
|
455
|
+
* @returns {Promise<string>}
|
|
456
|
+
*/
|
|
457
|
+
async _renderPage(pageData, sink) {
|
|
458
|
+
const textContent = await pageData.getTextContent({
|
|
459
|
+
normalizeWhitespace: true,
|
|
460
|
+
disableCombineTextItems: false
|
|
461
|
+
});
|
|
462
|
+
let lastY;
|
|
463
|
+
let text = '';
|
|
464
|
+
for (const item of textContent.items) {
|
|
465
|
+
if (lastY === item.transform[5] || lastY === undefined) {
|
|
466
|
+
text += item.str;
|
|
467
|
+
} else {
|
|
468
|
+
text += '\n' + item.str;
|
|
469
|
+
}
|
|
470
|
+
lastY = item.transform[5];
|
|
471
|
+
}
|
|
472
|
+
sink.push(text);
|
|
473
|
+
// pdf-parse joins page renders with '\n\n' for pdfData.text
|
|
474
|
+
return text;
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
/**
|
|
478
|
+
* Extract a specific page range from a PDF (1-based, inclusive).
|
|
479
|
+
* @param {Object} params - Processing parameters
|
|
480
|
+
* @param {number} [params.startPage=1] - First page to include
|
|
481
|
+
* @param {number} [params.endPage] - Last page to include (defaults to end)
|
|
482
|
+
* @returns {Promise<Object>} - Processing result for the requested pages
|
|
420
483
|
*/
|
|
421
484
|
async extractPDFPages(params) {
|
|
422
485
|
const { startPage = 1, endPage, ...processingParams } = params;
|
|
423
|
-
|
|
424
|
-
// Override parse options to limit page range
|
|
425
|
-
const options = {
|
|
426
|
-
...processingParams.options,
|
|
427
|
-
parseOptions: {
|
|
428
|
-
...processingParams.options?.parseOptions,
|
|
429
|
-
max: endPage || processingParams.options?.maxPages || 100
|
|
430
|
-
}
|
|
431
|
-
};
|
|
432
486
|
|
|
433
|
-
|
|
487
|
+
return this.processPDF({
|
|
434
488
|
...processingParams,
|
|
435
|
-
options
|
|
489
|
+
options: {
|
|
490
|
+
...processingParams.options,
|
|
491
|
+
pageRange: { start: startPage, ...(endPage ? { end: endPage } : {}) }
|
|
492
|
+
}
|
|
436
493
|
});
|
|
437
|
-
|
|
438
|
-
if (result.success && result.text && startPage > 1) {
|
|
439
|
-
// This is a simplified approach - pdf-parse doesn't provide per-page text
|
|
440
|
-
// For proper page-by-page extraction, consider using pdf2pic or pdf-poppler
|
|
441
|
-
console.warn('Page-specific extraction is limited with current PDF parser');
|
|
442
|
-
}
|
|
443
|
-
|
|
444
|
-
return result;
|
|
445
494
|
}
|
|
446
495
|
}
|
|
447
496
|
|
|
@@ -121,8 +121,16 @@ const ScrapeWithActionsSchema = z.object({
|
|
|
121
121
|
captureIntermediateStates: z.boolean().default(false),
|
|
122
122
|
captureScreenshots: z.boolean().default(true),
|
|
123
123
|
|
|
124
|
-
// Form auto-fill
|
|
125
|
-
|
|
124
|
+
// Form auto-fill — structured shape ({fields:[{selector,value,...}], submitSelector, waitAfterSubmit}).
|
|
125
|
+
// A flat z.record(string) of selector→value is still accepted for backward compatibility.
|
|
126
|
+
formAutoFill: z.union([
|
|
127
|
+
z.object({
|
|
128
|
+
fields: z.array(FormFieldSchema),
|
|
129
|
+
submitSelector: z.string().optional(),
|
|
130
|
+
waitAfterSubmit: z.number().min(0).max(30000).default(2000)
|
|
131
|
+
}),
|
|
132
|
+
z.record(z.string())
|
|
133
|
+
]).optional(),
|
|
126
134
|
|
|
127
135
|
// Browser options
|
|
128
136
|
browserOptions: z.object({
|
|
@@ -386,8 +394,9 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
386
394
|
const intermediateStates = params.captureIntermediateStates ?
|
|
387
395
|
await this.extractIntermediateStates(actionResults, params) : [];
|
|
388
396
|
|
|
389
|
-
// Get final page content after all actions
|
|
390
|
-
|
|
397
|
+
// Get final page content after all actions (reads the post-action live page
|
|
398
|
+
// captured by ActionExecutor, falling back to a fresh fetch only if missing).
|
|
399
|
+
const finalContent = await this.extractFinalContent(params, chainResult);
|
|
391
400
|
|
|
392
401
|
// Generate different formats
|
|
393
402
|
const content = this.generateFormats(finalContent, params.formats, {
|
|
@@ -446,21 +455,37 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
446
455
|
|
|
447
456
|
insertFormAutoFillActions(actions, formAutoFill) {
|
|
448
457
|
const fillActions = [];
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
458
|
+
|
|
459
|
+
if (Array.isArray(formAutoFill.fields)) {
|
|
460
|
+
// Structured shape: { fields: [{selector, value, type, waitAfter}], submitSelector, waitAfterSubmit }
|
|
461
|
+
for (const field of formAutoFill.fields) {
|
|
462
|
+
fillActions.push({
|
|
463
|
+
type: 'type',
|
|
464
|
+
selector: field.selector,
|
|
465
|
+
text: field.value,
|
|
466
|
+
description: `Auto-fill field: ${field.selector}`,
|
|
467
|
+
continueOnError: true,
|
|
468
|
+
retries: 1
|
|
469
|
+
});
|
|
470
|
+
if (field.waitAfter) {
|
|
471
|
+
fillActions.push({ type: 'wait', duration: field.waitAfter });
|
|
472
|
+
}
|
|
473
|
+
}
|
|
474
|
+
} else {
|
|
475
|
+
// Backward-compatible flat shape: { selector: value, ... }
|
|
476
|
+
for (const [selector, value] of Object.entries(formAutoFill)) {
|
|
477
|
+
if (selector === 'submitSelector' || selector === 'waitAfterSubmit' || selector === 'fields') {
|
|
478
|
+
continue; // Skip special keys
|
|
479
|
+
}
|
|
480
|
+
fillActions.push({
|
|
481
|
+
type: 'type',
|
|
482
|
+
selector,
|
|
483
|
+
text: value,
|
|
484
|
+
description: `Auto-fill field: ${selector}`,
|
|
485
|
+
continueOnError: true,
|
|
486
|
+
retries: 1
|
|
487
|
+
});
|
|
454
488
|
}
|
|
455
|
-
|
|
456
|
-
fillActions.push({
|
|
457
|
-
type: 'type',
|
|
458
|
-
selector,
|
|
459
|
-
text: value,
|
|
460
|
-
description: `Auto-fill field: ${selector}`,
|
|
461
|
-
continueOnError: true,
|
|
462
|
-
retries: 1
|
|
463
|
-
});
|
|
464
489
|
}
|
|
465
490
|
|
|
466
491
|
// Add submit action if specified
|
|
@@ -585,16 +610,29 @@ export class ScrapeWithActionsTool extends EventEmitter {
|
|
|
585
610
|
return states;
|
|
586
611
|
}
|
|
587
612
|
|
|
588
|
-
async extractFinalContent(params) {
|
|
613
|
+
async extractFinalContent(params, chainResult = null) {
|
|
589
614
|
try {
|
|
615
|
+
const options = {
|
|
616
|
+
includeMetadata: params.extractionOptions?.includeMetadata !== false,
|
|
617
|
+
includeLinks: params.extractionOptions?.includeLinks !== false,
|
|
618
|
+
includeImages: params.extractionOptions?.includeImages !== false,
|
|
619
|
+
customSelectors: params.extractionOptions?.selectors
|
|
620
|
+
};
|
|
621
|
+
|
|
622
|
+
// Prefer the post-action live page HTML captured during action execution.
|
|
623
|
+
// This ensures the final content reflects clicks/typing/navigation rather
|
|
624
|
+
// than re-fetching the original (pre-action) URL.
|
|
625
|
+
if (chainResult?.finalHtml) {
|
|
626
|
+
return await this.extractContentTool.execute({
|
|
627
|
+
url: chainResult.finalUrl || params.url,
|
|
628
|
+
html: chainResult.finalHtml,
|
|
629
|
+
options
|
|
630
|
+
});
|
|
631
|
+
}
|
|
632
|
+
|
|
590
633
|
const extractResult = await this.extractContentTool.execute({
|
|
591
634
|
url: params.url,
|
|
592
|
-
options
|
|
593
|
-
includeMetadata: params.extractionOptions?.includeMetadata !== false,
|
|
594
|
-
includeLinks: params.extractionOptions?.includeLinks !== false,
|
|
595
|
-
includeImages: params.extractionOptions?.includeImages !== false,
|
|
596
|
-
customSelectors: params.extractionOptions?.selectors
|
|
597
|
-
}
|
|
635
|
+
options
|
|
598
636
|
});
|
|
599
637
|
|
|
600
638
|
return extractResult;
|