crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,986 @@
1
+ /**
2
+ * BrowserProcessor - JavaScript-rendered content handling using Playwright
3
+ * Handles dynamic content, SPAs, and JavaScript-heavy websites
4
+ * Enhanced with stealth mode capabilities for anti-detection
5
+ */
6
+
7
+ import { chromium } from 'playwright';
8
+ import { z } from 'zod';
9
+ import StealthBrowserManager from '../StealthBrowserManager.js';
10
+ import HumanBehaviorSimulator from '../../utils/HumanBehaviorSimulator.js';
11
+ import LocalizationManager from '../LocalizationManager.js';
12
+
13
// A single cookie to install into the browser context before navigation.
const CookieSchema = z.object({
  name: z.string(),
  value: z.string(),
  domain: z.string().optional(),
  path: z.string().default('/'),
  expires: z.number().optional(),
  httpOnly: z.boolean().default(false),
  secure: z.boolean().default(false),
  sameSite: z.enum(['Strict', 'Lax', 'None']).default('Lax')
});

// Stealth mode options
const StealthModeSchema = z.object({
  enabled: z.boolean().default(false),
  level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
  randomizeFingerprint: z.boolean().default(true),
  simulateHumanBehavior: z.boolean().default(true),
  customUserAgent: z.string().optional(),
  hideWebDriver: z.boolean().default(true),
  blockWebRTC: z.boolean().default(true)
});

// Human behavior simulation options
const HumanBehaviorSchema = z.object({
  enabled: z.boolean().default(false),
  mouseMovements: z.boolean().default(true),
  typingVariation: z.boolean().default(true),
  scrollBehavior: z.boolean().default(true),
  idlePeriods: z.boolean().default(true),
  readingTime: z.boolean().default(true)
});

// Explicit geolocation override used by the localization options.
const CustomLocationSchema = z.object({
  latitude: z.number().min(-90).max(90),
  longitude: z.number().min(-180).max(180),
  accuracy: z.number().min(1).max(100).optional()
});

// Localization options
const LocalizationSchema = z.object({
  enabled: z.boolean().default(false),
  countryCode: z.string().length(2).optional(),
  language: z.string().optional(),
  timezone: z.string().optional(),
  customLocation: CustomLocationSchema.optional(),
  enableTimezoneSpoof: z.boolean().default(true),
  enableGeoLocationSpoof: z.boolean().default(true)
});

// Every per-request browser option accepted by BrowserProcessor.processURL.
const BrowserOptionsSchema = z.object({
  waitForSelector: z.string().optional(),
  waitForFunction: z.string().optional(),
  waitForTimeout: z.number().min(0).max(60000).default(5000),
  viewportWidth: z.number().min(320).max(1920).default(1280),
  viewportHeight: z.number().min(240).max(1080).default(720),
  userAgent: z.string().optional(),
  enableJavaScript: z.boolean().default(true),
  enableImages: z.boolean().default(false),
  blockResources: z.array(z.string()).default(['font', 'stylesheet']),
  extraHeaders: z.record(z.string()).optional(),
  cookies: z.array(CookieSchema).optional(),
  scrollToBottom: z.boolean().default(false),
  executeScript: z.string().optional(),
  captureScreenshot: z.boolean().default(false),
  mobileEmulation: z.boolean().default(false),
  stealthMode: StealthModeSchema.optional(),
  humanBehavior: HumanBehaviorSchema.optional(),
  localization: LocalizationSchema.optional()
});

// Input contract for processURL: a valid URL plus an optional options object.
const BrowserProcessorSchema = z.object({
  url: z.string().url(),
  options: BrowserOptionsSchema.optional().default({})
});
78
+
79
// Summary of the client-side rendering characteristics found on a page.
const DynamicContentSchema = z.object({
  detectedFrameworks: z.array(z.string()),
  hasLazyLoading: z.boolean(),
  hasDynamicContent: z.boolean(),
  scriptCount: z.number(),
  ajaxRequests: z.array(z.string())
});

// Page timing figures, in milliseconds.
const PageMetricsSchema = z.object({
  domContentLoaded: z.number(),
  loadComplete: z.number(),
  firstContentfulPaint: z.number().optional(),
  largestContentfulPaint: z.number().optional()
});

// Shape of the object returned by BrowserProcessor.processURL.
const BrowserResult = z.object({
  url: z.string(),
  html: z.string(),
  text: z.string(),
  title: z.string(),
  screenshot: z.string().optional(),
  loadTime: z.number(),
  dynamicContent: DynamicContentSchema,
  metrics: PageMetricsSchema,
  processedAt: z.string(),
  success: z.boolean(),
  error: z.string().optional()
});
103
+
104
/**
 * Renders pages in headless Chromium (Playwright) and extracts their content.
 *
 * A processor owns at most one regular browser plus, when stealth mode is
 * requested, a StealthBrowserManager. Every processURL call gets its own
 * browser context, which is released as soon as the call finishes.
 */
export class BrowserProcessor {
  constructor() {
    this.browser = null;
    // In-flight chromium.launch() promise, shared by concurrent initBrowser callers.
    this.browserLaunchPromise = null;
    this.stealthManager = null;
    this.humanBehaviorSimulator = null;
    // contextId -> { context, page } for stealth contexts that are still open.
    this.activeContexts = new Map();

    this.defaultOptions = {
      waitForTimeout: 5000,
      viewportWidth: 1280,
      viewportHeight: 720,
      enableJavaScript: true,
      enableImages: false,
      blockResources: ['font', 'stylesheet'],
      scrollToBottom: false,
      captureScreenshot: false,
      mobileEmulation: false,
      stealthMode: {
        enabled: false,
        level: 'medium',
        randomizeFingerprint: true,
        simulateHumanBehavior: true,
        hideWebDriver: true,
        blockWebRTC: true
      },
      humanBehavior: {
        enabled: false,
        mouseMovements: true,
        typingVariation: true,
        scrollBehavior: true,
        idlePeriods: true,
        readingTime: true
      },
      localization: {
        enabled: false,
        enableTimezoneSpoof: true,
        enableGeoLocationSpoof: true
      }
    };

    // Initialize localization manager
    this.localizationManager = new LocalizationManager();
  }

  /**
   * Settle a route.abort()/route.continue() promise without leaving it floating.
   * @param {Promise<void>} pending - Promise returned by a Playwright route call
   * @returns {Promise<void>}
   */
  static settleRoute(pending) {
    return pending.catch(() => {
      // Route calls reject when the page or context closes while the request is
      // still in flight. Nothing is left to handle at that point, and an
      // unhandled rejection would otherwise terminate the Node.js process.
    });
  }

  /**
   * Process URL with browser automation
   * @param {Object} params - Processing parameters
   * @param {string} params.url - URL to process
   * @param {Object} params.options - Browser options
   * @returns {Promise<Object>} - Processing result with rendered content. Never
   *   rejects: failures are reported as `{ success: false, error }`.
   */
  async processURL(params) {
    const startTime = Date.now();

    try {
      const { url, options } = BrowserProcessorSchema.parse(params);
      const processingOptions = { ...this.defaultOptions, ...options };

      const result = {
        url,
        processedAt: new Date().toISOString(),
        success: false,
        loadTime: 0
      };

      // Initialize browser and page (with stealth if enabled)
      const page = await this.initializePage(processingOptions);

      try {
        // Navigate and wait for content
        await this.navigateAndWait(page, url, processingOptions);

        // Analysis, metrics and the screenshot must run BEFORE extractContent:
        // text extraction strips <script>/<style> nodes from the live DOM, which
        // would otherwise zero the script count and unstyle the screenshot.
        const dynamicAnalysis = await this.analyzeDynamicContent(page);
        const metrics = await this.getPerformanceMetrics(page);

        let screenshot = null;
        if (processingOptions.captureScreenshot) {
          screenshot = await this.captureScreenshot(page);
        }

        // Extract content and metadata
        const contentResult = await this.extractContent(page, processingOptions);

        // Combine results
        Object.assign(result, {
          ...contentResult,
          screenshot,
          dynamicContent: dynamicAnalysis,
          metrics,
          loadTime: Date.now() - startTime,
          success: true
        });
      } finally {
        // Always release the page together with its context
        await this.releasePage(page);
      }

      return result;
    } catch (error) {
      return {
        // params itself may be null/undefined when validation is what failed
        url: params?.url || 'unknown',
        processedAt: new Date().toISOString(),
        success: false,
        error: `Browser processing failed: ${error.message}`,
        loadTime: Date.now() - startTime,
        html: '',
        text: '',
        title: '',
        dynamicContent: {
          detectedFrameworks: [],
          hasLazyLoading: false,
          hasDynamicContent: false,
          scriptCount: 0,
          ajaxRequests: []
        },
        metrics: {
          domContentLoaded: 0,
          loadComplete: Date.now() - startTime
        }
      };
    }
  }

  /**
   * Close a page and the browser context that was created for it.
   * Best effort: failures are logged, never thrown, so that a close error
   * cannot discard an otherwise successful processing result.
   * @param {Page} page - Playwright page returned by initializePage
   * @returns {Promise<void>}
   */
  async releasePage(page) {
    let trackedContextId = null;
    for (const [contextId, entry] of this.activeContexts.entries()) {
      if (entry.page === page) {
        trackedContextId = contextId;
        break;
      }
    }

    try {
      await page.close();
      // Each page lives in a context of its own; closing only the page would
      // leak one context per processed URL.
      await page.context().close();
    } catch (error) {
      console.warn(`Failed to release page resources: ${error.message}`);
    } finally {
      if (trackedContextId !== null) {
        this.activeContexts.delete(trackedContextId);
      }
    }
  }

  /**
   * Initialize browser instance
   * Concurrent callers share a single launch instead of each starting a browser.
   * @returns {Promise<void>}
   */
  async initBrowser() {
    if (this.browser) {
      return;
    }

    if (!this.browserLaunchPromise) {
      this.browserLaunchPromise = chromium
        .launch({
          headless: true,
          // NOTE(review): '--no-sandbox' and '--disable-web-security' weaken
          // browser isolation while rendering untrusted pages. Kept as-is for
          // compatibility; confirm they are really required by the deployment.
          args: [
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
            '--disable-web-security',
            '--disable-background-timer-throttling',
            '--disable-backgrounding-occluded-windows',
            '--disable-renderer-backgrounding'
          ]
        })
        .then((browser) => {
          this.browser = browser;
        })
        .finally(() => {
          this.browserLaunchPromise = null;
        });
    }

    await this.browserLaunchPromise;
  }

  /**
   * Initialize page with stealth capabilities if enabled
   * @param {Object} options - Processing options
   * @returns {Promise<Page>} - Playwright page
   */
  async initializePage(options) {
    // Apply localization if enabled
    let processedOptions = options;
    if (options.localization?.enabled) {
      processedOptions = await this.applyLocalization(options);
    }

    // Check if stealth mode is enabled
    if (processedOptions.stealthMode?.enabled) {
      return await this.createStealthPage(processedOptions);
    }

    // Standard browser initialization
    await this.initBrowser();
    return await this.createPage(processedOptions);
  }

  /**
   * Create stealth page with anti-detection measures
   * @param {Object} options - Processing options
   * @returns {Promise<Page>} - Stealth-enabled page
   */
  async createStealthPage(options) {
    // Initialize stealth manager if needed
    if (!this.stealthManager) {
      this.stealthManager = new StealthBrowserManager();
    }

    // Initialize human behavior simulator if needed
    if (!this.humanBehaviorSimulator && options.humanBehavior?.enabled) {
      this.humanBehaviorSimulator = new HumanBehaviorSimulator({
        mouseMovements: {
          enabled: options.humanBehavior.mouseMovements,
          speed: 'normal',
          accuracy: 0.8,
          naturalCurves: true
        },
        typing: {
          enabled: options.humanBehavior.typingVariation,
          speed: 'normal',
          variability: 0.3,
          mistakes: {
            enabled: true,
            frequency: 0.02
          }
        },
        scrolling: {
          enabled: options.humanBehavior.scrollBehavior,
          naturalAcceleration: true,
          randomPauses: true
        },
        interactions: {
          hoverBeforeClick: true,
          focusBlurSimulation: true,
          idlePeriods: {
            enabled: options.humanBehavior.idlePeriods,
            frequency: 0.1
          }
        }
      });
    }

    // Launch stealth browser
    await this.stealthManager.launchStealthBrowser({
      level: options.stealthMode.level,
      randomizeFingerprint: options.stealthMode.randomizeFingerprint,
      hideWebDriver: options.stealthMode.hideWebDriver,
      blockWebRTC: options.stealthMode.blockWebRTC,
      customUserAgent: options.stealthMode.customUserAgent || options.userAgent
    });

    // Create stealth context
    const { context, contextId } = await this.stealthManager.createStealthContext({
      level: options.stealthMode.level,
      customViewport: {
        width: options.viewportWidth,
        height: options.viewportHeight
      }
    });

    try {
      // Create stealth page
      const page = await this.stealthManager.createStealthPage(contextId);

      // Store context for cleanup
      this.activeContexts.set(contextId, { context, page });

      // Apply additional stealth configurations
      await this.applyStealthMiddleware(page, options);

      return page;
    } catch (error) {
      // Do not leave a half-configured context behind when setup fails.
      this.activeContexts.delete(contextId);
      await context.close().catch(() => {});
      throw error;
    }
  }

  /**
   * Apply additional stealth middleware to page
   * @param {Page} page - Playwright page
   * @param {Object} options - Processing options
   * @returns {Promise<void>}
   */
  async applyStealthMiddleware(page, options) {
    // Set cookies if provided
    if (options.cookies && options.cookies.length > 0) {
      await page.context().addCookies(options.cookies);
    }

    // Block unnecessary resources with stealth considerations
    if (options.blockResources && options.blockResources.length > 0) {
      await page.route('**/*', (route) => {
        const request = route.request();
        const resourceType = request.resourceType();
        const url = request.url();

        // Abort sub-resources that look like automation-detection scripts.
        // Navigations are exempt: otherwise any site whose own URL contains one
        // of these words (e.g. selenium.dev) could never be loaded.
        const looksLikeDetection =
          url.includes('webdriver') || url.includes('selenium') || url.includes('puppeteer');
        if (looksLikeDetection && !request.isNavigationRequest()) {
          return BrowserProcessor.settleRoute(route.abort());
        }

        if (options.blockResources.includes(resourceType)) {
          return BrowserProcessor.settleRoute(route.abort());
        }
        return BrowserProcessor.settleRoute(route.continue());
      });
    }

    // Disable images if requested (with stealth considerations)
    if (!options.enableImages) {
      await page.route('**/*.{jpg,jpeg,png,gif,webp,svg}', (route) => {
        // Let favicons and tracking/analytics pixels through; a browser that
        // never requests them is easy to spot.
        const url = route.request().url();
        const allowed =
          url.includes('favicon') || url.includes('tracking') || url.includes('analytics');
        return BrowserProcessor.settleRoute(allowed ? route.continue() : route.abort());
      });
    }

    // Add extra stealth protections
    await page.addInitScript(() => {
      // Additional webdriver detection removal
      delete window.navigator.__proto__.webdriver;

      // Override chrome runtime
      window.chrome = {
        runtime: {
          onConnect: undefined,
          onMessage: undefined
        }
      };

      // Mock notification permission (Notification is absent in some contexts)
      if (typeof Notification !== 'undefined') {
        Object.defineProperty(Notification, 'permission', {
          get: () => 'granted'
        });
      }

      // Hide headless indicators
      Object.defineProperty(navigator, 'hardwareConcurrency', {
        get: () => 4
      });
    });
  }

  /**
   * Apply localization settings to browser options
   * Falls back to the unmodified options when localization cannot be resolved.
   * @param {Object} options - Original options
   * @returns {Promise<Object>} - Localized options
   */
  async applyLocalization(options) {
    const { localization } = options;

    try {
      // Get localization configuration
      const localizationConfig = await this.localizationManager.localizeBrowserContext(
        options,
        localization.countryCode
      );

      // Merge localized settings
      const localizedOptions = {
        ...options,
        ...localizationConfig,

        // Override specific browser settings
        locale: localizationConfig.locale,
        timezoneId: localizationConfig.timezoneId,
        geolocation: localization.customLocation || localizationConfig.geolocation,
        extraHeaders: {
          ...options.extraHeaders,
          ...localizationConfig.extraHTTPHeaders
        },
        userAgent: localizationConfig.userAgent || options.userAgent
      };

      // Add timezone spoofing script if enabled
      if (localization.enableTimezoneSpoof) {
        localizedOptions.timezoneSpoof = await this.localizationManager.generateTimezoneSpoof(
          localization.countryCode
        );
      }

      return localizedOptions;
    } catch (error) {
      console.warn('Failed to apply localization, using default options:', error.message);
      return options;
    }
  }

  /**
   * Create new page with specified options
   * Requires initBrowser() to have completed.
   * @param {Object} options - Page options
   * @returns {Promise<Page>} - Playwright page
   */
  async createPage(options) {
    const contextOptions = {
      viewport: {
        width: options.viewportWidth,
        height: options.viewportHeight
      },
      userAgent: options.userAgent,
      extraHTTPHeaders: options.extraHeaders,
      deviceScaleFactor: options.mobileEmulation ? 2 : 1,
      isMobile: options.mobileEmulation,
      hasTouch: options.mobileEmulation,
      // Scripts are switched off through the context option. A
      // Content-Security-Policy *request* header has no effect on the page.
      javaScriptEnabled: options.enableJavaScript !== false
    };

    // Add localization-specific context options
    if (options.locale) {
      contextOptions.locale = options.locale;
    }
    if (options.timezoneId) {
      contextOptions.timezoneId = options.timezoneId;
    }
    if (options.geolocation) {
      contextOptions.geolocation = options.geolocation;
    }
    if (options.proxy) {
      contextOptions.proxy = options.proxy;
    }

    const context = await this.browser.newContext(contextOptions);

    try {
      const page = await context.newPage();

      // Inject timezone spoofing script if provided
      if (options.timezoneSpoof) {
        await page.addInitScript(options.timezoneSpoof);
      }

      // Set cookies if provided
      if (options.cookies && options.cookies.length > 0) {
        await context.addCookies(options.cookies);
      }

      // Block unnecessary resources
      if (options.blockResources && options.blockResources.length > 0) {
        await page.route('**/*', (route) => {
          const blocked = options.blockResources.includes(route.request().resourceType());
          return BrowserProcessor.settleRoute(blocked ? route.abort() : route.continue());
        });
      }

      // Disable images if requested
      if (!options.enableImages) {
        await page.route('**/*.{jpg,jpeg,png,gif,webp,svg}', (route) =>
          BrowserProcessor.settleRoute(route.abort())
        );
      }

      return page;
    } catch (error) {
      // Do not leak the context when page setup fails part-way.
      await context.close().catch(() => {});
      throw error;
    }
  }

  /**
   * Navigate to URL and wait for content to load
   * @param {Page} page - Playwright page
   * @param {string} url - URL to navigate to
   * @param {Object} options - Navigation options
   * @returns {Promise<Object>} - Navigation result (`{ navigationTime }` in ms)
   */
  async navigateAndWait(page, url, options) {
    const startTime = Date.now();

    // Navigate to URL
    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: 30000
    });

    // Wait for specific selector if provided
    if (options.waitForSelector) {
      try {
        await page.waitForSelector(options.waitForSelector, {
          timeout: options.waitForTimeout
        });
      } catch {
        console.warn(`Selector "${options.waitForSelector}" not found within timeout`);
      }
    }

    // Wait for custom function if provided
    if (options.waitForFunction) {
      try {
        // Signature is (pageFunction, arg, options). The options object must be
        // the third argument or the timeout is silently ignored.
        await page.waitForFunction(options.waitForFunction, undefined, {
          timeout: options.waitForTimeout
        });
      } catch (error) {
        console.warn(`Wait function failed: ${error.message}`);
      }
    }

    // General timeout wait
    await page.waitForTimeout(Math.min(options.waitForTimeout, 10000));

    // Scroll to bottom if requested (for lazy loading)
    if (options.scrollToBottom) {
      await this.scrollToBottom(page, options);
    }

    // Execute custom script if provided. The script is caller-supplied code and
    // runs inside the page, not in this Node.js process.
    if (options.executeScript) {
      try {
        await page.evaluate(options.executeScript);
      } catch (error) {
        console.warn(`Custom script execution failed: ${error.message}`);
      }
    }

    return {
      navigationTime: Date.now() - startTime
    };
  }

  /**
   * Extract content from page
   *
   * NOTE: text extraction removes every <script>, <style> and <noscript>
   * element from the live document, so run any DOM analysis or screenshot
   * before calling this.
   * @param {Page} page - Playwright page
   * @param {Object} options - Extraction options (currently unused)
   * @returns {Promise<Object>} - Extracted content (`{ html, text, title }`)
   */
  async extractContent(page, options) {
    // Get HTML content (captured before the DOM is stripped below)
    const html = await page.content();

    // Get text content
    const text = await page.evaluate(() => {
      // Remove script and style elements
      document.querySelectorAll('script, style, noscript').forEach((el) => el.remove());

      return document.body ? document.body.innerText : '';
    });

    // Get page title
    const title = await page.title();

    return {
      html,
      text: text.trim(),
      title
    };
  }

  /**
   * Analyze dynamic content characteristics
   * @param {Page} page - Playwright page
   * @returns {Promise<Object>} - Dynamic content analysis. `ajaxRequests` is
   *   always empty; requests are not tracked here.
   */
  async analyzeDynamicContent(page) {
    return await page.evaluate(() => {
      const analysis = {
        detectedFrameworks: [],
        hasLazyLoading: false,
        hasDynamicContent: false,
        scriptCount: 0,
        ajaxRequests: []
      };

      // Count scripts
      analysis.scriptCount = document.querySelectorAll('script').length;

      // Detect frameworks
      if (window.React || document.querySelector('[data-reactroot]')) {
        analysis.detectedFrameworks.push('React');
      }
      // Vue 3 marks its mount container with data-v-app. A bare `[data-v-]`
      // selector never matches because scoped attributes carry a hash suffix.
      if (window.Vue || window.__VUE__ || document.querySelector('[data-v-app]')) {
        analysis.detectedFrameworks.push('Vue.js');
      }
      if (
        window.angular ||
        document.querySelector('[ng-app], [data-ng-app], [ng-version]')
      ) {
        analysis.detectedFrameworks.push('Angular');
      }
      if (window.jQuery || window.$) {
        analysis.detectedFrameworks.push('jQuery');
      }

      // Check for lazy loading
      const lazyImages = document.querySelectorAll('[loading="lazy"], [data-src], .lazy');
      analysis.hasLazyLoading = lazyImages.length > 0;

      // Check for dynamic content indicators. `[*ngFor]` / `[*ngIf]` are not
      // valid CSS selectors (querySelectorAll throws on them) and never appear
      // in rendered markup, so they are deliberately absent from this list.
      const dynamicIndicators = document.querySelectorAll(
        '[data-bind], [v-if], [v-for], [ng-if], [ng-repeat]'
      );
      analysis.hasDynamicContent =
        dynamicIndicators.length > 0 || analysis.detectedFrameworks.length > 0;

      return analysis;
    });
  }

  /**
   * Get performance metrics
   * @param {Page} page - Playwright page
   * @returns {Promise<Object>} - Performance metrics in milliseconds. An event
   *   that has not fired yet is reported as 0.
   */
  async getPerformanceMetrics(page) {
    return await page.evaluate(async () => {
      const metrics = {
        domContentLoaded: 0,
        loadComplete: 0
      };

      const perf = window.performance;
      if (!perf) {
        return metrics;
      }

      const canQueryEntries = typeof perf.getEntriesByType === 'function';
      const navigationEntry = canQueryEntries ? perf.getEntriesByType('navigation')[0] : undefined;

      if (navigationEntry) {
        // Navigation Timing Level 2: values are already relative to the start
        // of the navigation and are 0 until the event has fired.
        metrics.domContentLoaded = Math.max(0, Math.round(navigationEntry.domContentLoadedEventEnd));
        metrics.loadComplete = Math.max(0, Math.round(navigationEntry.loadEventEnd));
      } else if (perf.timing) {
        // Legacy API: absolute timestamps, 0 meaning "not yet". Subtracting
        // navigationStart from 0 would produce a huge negative number.
        const { navigationStart, domContentLoadedEventEnd, loadEventEnd } = perf.timing;
        if (domContentLoadedEventEnd > 0) {
          metrics.domContentLoaded = domContentLoadedEventEnd - navigationStart;
        }
        if (loadEventEnd > 0) {
          metrics.loadComplete = loadEventEnd - navigationStart;
        }
      }

      // Try to get Paint Timing metrics
      if (canQueryEntries) {
        for (const entry of perf.getEntriesByType('paint')) {
          if (entry.name === 'first-contentful-paint') {
            metrics.firstContentfulPaint = entry.startTime;
          }
        }
      }

      // Largest Contentful Paint is only delivered to a PerformanceObserver;
      // getEntriesByType() never returns it.
      const supportsLcp =
        typeof PerformanceObserver !== 'undefined' &&
        Array.isArray(PerformanceObserver.supportedEntryTypes) &&
        PerformanceObserver.supportedEntryTypes.includes('largest-contentful-paint');

      if (supportsLcp) {
        const lcp = await new Promise((resolve) => {
          let latest;
          const observer = new PerformanceObserver((list) => {
            const entries = list.getEntries();
            if (entries.length > 0) {
              latest = entries[entries.length - 1].startTime;
            }
          });
          observer.observe({ type: 'largest-contentful-paint', buffered: true });
          // Buffered entries arrive asynchronously; give them a moment.
          setTimeout(() => {
            observer.disconnect();
            resolve(latest);
          }, 100);
        });

        if (lcp !== undefined) {
          metrics.largestContentfulPaint = lcp;
        }
      }

      return metrics;
    });
  }

  /**
   * Capture screenshot
   * @param {Page} page - Playwright page
   * @returns {Promise<string|null>} - Base64 encoded PNG of the viewport, or
   *   null when the capture fails
   */
  async captureScreenshot(page) {
    try {
      // Playwright always resolves with a Buffer (it has no `encoding` option),
      // so the base64 conversion has to happen here.
      const buffer = await page.screenshot({
        type: 'png',
        fullPage: false
      });
      return buffer.toString('base64');
    } catch (error) {
      console.warn(`Screenshot capture failed: ${error.message}`);
      return null;
    }
  }

  /**
   * Scroll to bottom of page to trigger lazy loading
   * @param {Page} page - Playwright page
   * @param {Object} [options] - Processing options (reads `humanBehavior.enabled`)
   * @returns {Promise<void>}
   */
  async scrollToBottom(page, options = {}) {
    // Use human behavior simulation if available
    if (this.humanBehaviorSimulator && options.humanBehavior?.enabled) {
      const scrollHeight = await page.evaluate(() => document.body.scrollHeight);
      const viewportHeight = await page.evaluate(() => window.innerHeight);
      const totalDistance = scrollHeight - viewportHeight;

      if (totalDistance > 0) {
        await this.humanBehaviorSimulator.simulateScroll(page, {
          direction: 'down',
          distance: totalDistance,
          duration: 2000 + Math.random() * 3000 // 2-5 seconds
        });
      }
    } else {
      // Standard scroll behavior. maxSteps bounds the loop so that pages which
      // keep growing as they scroll (infinite feeds) cannot hang the call.
      await page.evaluate(
        async ({ stepPx, intervalMs, maxSteps }) => {
          await new Promise((resolve) => {
            let scrolledPx = 0;
            let steps = 0;
            const timer = setInterval(() => {
              const scrollHeight = document.body ? document.body.scrollHeight : 0;
              window.scrollBy(0, stepPx);
              scrolledPx += stepPx;
              steps += 1;

              if (scrolledPx >= scrollHeight || steps >= maxSteps) {
                clearInterval(timer);
                resolve();
              }
            }, intervalMs);
          });
        },
        { stepPx: 100, intervalMs: 100, maxSteps: 600 }
      );
    }

    // Wait a bit for any lazy content to load
    await page.waitForTimeout(2000);
  }

  /**
   * Process multiple URLs concurrently
   * @param {Array} urls - Array of URLs to process (strings, or
   *   `{ url, options }` objects whose options override the shared ones)
   * @param {Object} options - Processing options; `options.concurrency` sets
   *   the batch size (default 3)
   * @returns {Promise<Array>} - Array of processing results, in input order
   */
  async processMultipleURLs(urls, options = {}) {
    const { concurrency: requestedConcurrency, ...sharedOptions } = options;
    // A zero, negative or non-numeric batch size would stall the loop below.
    const numericConcurrency = Number(requestedConcurrency);
    const concurrency =
      Number.isFinite(numericConcurrency) && numericConcurrency >= 1
        ? Math.floor(numericConcurrency)
        : 3;
    const results = [];

    // Initialize browser once for all requests
    await this.initBrowser();

    try {
      // Process in batches
      for (let i = 0; i < urls.length; i += concurrency) {
        const batch = urls.slice(i, i + concurrency);
        const batchPromises = batch.map((url) => {
          const params =
            typeof url === 'string'
              ? { url, options: sharedOptions }
              : { ...url, options: { ...sharedOptions, ...url.options } };

          return this.processURL(params).catch((error) => ({
            url: params.url,
            success: false,
            error: error.message,
            processedAt: new Date().toISOString(),
            loadTime: 0,
            html: '',
            text: '',
            title: '',
            dynamicContent: {
              detectedFrameworks: [],
              hasLazyLoading: false,
              hasDynamicContent: false,
              scriptCount: 0,
              ajaxRequests: []
            },
            metrics: {
              domContentLoaded: 0,
              loadComplete: 0
            }
          }));
        });

        const batchResults = await Promise.all(batchPromises);
        results.push(...batchResults);
      }
    } finally {
      // Clean up browser
      await this.cleanup();
    }

    return results;
  }

  /**
   * Clean up browser resources
   * Every resource is attempted even when an earlier step fails.
   * @returns {Promise<void>}
   */
  async cleanup() {
    // Clean up stealth contexts first
    for (const [contextId, contextData] of this.activeContexts.entries()) {
      try {
        await contextData.page.close();
        await contextData.context.close();
      } catch (error) {
        console.warn(`Failed to close stealth context ${contextId}:`, error.message);
      }
    }
    this.activeContexts.clear();

    try {
      // Clean up stealth manager
      if (this.stealthManager) {
        await this.stealthManager.cleanup();
      }
    } finally {
      this.stealthManager = null;

      try {
        // Clean up regular browser
        if (this.browser) {
          await this.browser.close();
        }
      } finally {
        this.browser = null;

        // Reset human behavior simulator
        if (this.humanBehaviorSimulator) {
          this.humanBehaviorSimulator.resetStats();
          this.humanBehaviorSimulator = null;
        }
      }
    }
  }

  /**
   * Check if URL likely requires JavaScript rendering
   * @param {string} url - URL to check
   * @param {string} html - Optional HTML content for analysis
   * @returns {Promise<Object>} - Analysis result
   *   (`{ likely, confidence, indicators }`, confidence capped at 1)
   */
  async requiresJavaScript(url, html = null) {
    const analysis = {
      likely: false,
      confidence: 0,
      indicators: []
    };

    // URL-based indicators
    const urlIndicators = [
      { pattern: /\.(js|jsx|ts|tsx)$/, weight: 0.9, name: 'JavaScript file extension' },
      { pattern: /#/, weight: 0.3, name: 'Hash-based routing' },
      { pattern: /\/(app|spa|dashboard|admin)/, weight: 0.4, name: 'SPA-like path' }
    ];

    for (const indicator of urlIndicators) {
      if (indicator.pattern.test(url)) {
        analysis.confidence += indicator.weight;
        analysis.indicators.push(indicator.name);
      }
    }

    // HTML-based indicators (if provided)
    if (html) {
      const htmlIndicators = [
        { pattern: /data-reactroot|ReactDOM\.render/i, weight: 0.8, name: 'React framework' },
        { pattern: /ng-app|angular\.module/i, weight: 0.8, name: 'Angular framework' },
        { pattern: /v-if|v-for|new Vue/i, weight: 0.8, name: 'Vue.js framework' },
        { pattern: /<script[^>]*src.*\.js/gi, weight: 0.1, name: 'External JavaScript' },
        { pattern: /data-bind|knockout/i, weight: 0.6, name: 'Knockout.js' },
        { pattern: /ember-application|Ember\.Application/i, weight: 0.7, name: 'Ember.js' }
      ];

      for (const indicator of htmlIndicators) {
        const matches = html.match(indicator.pattern);
        if (matches) {
          // Repeated hits add weight, but at most three times.
          analysis.confidence += indicator.weight * Math.min(matches.length, 3);
          analysis.indicators.push(`${indicator.name} (${matches.length} matches)`);
        }
      }
    }

    // The threshold is applied to the raw score, before it is capped at 1.
    analysis.likely = analysis.confidence > 0.5;
    analysis.confidence = Math.min(1, analysis.confidence);

    return analysis;
  }

  /**
   * Get stealth mode statistics
   * @returns {Object} Stealth statistics
   */
  getStealthStats() {
    const stats = {
      stealthManagerActive: !!this.stealthManager,
      humanBehaviorActive: !!this.humanBehaviorSimulator,
      activeContexts: this.activeContexts.size,
      stealthStats: null,
      behaviorStats: null
    };

    if (this.stealthManager) {
      stats.stealthStats = this.stealthManager.getStats();
    }

    if (this.humanBehaviorSimulator) {
      stats.behaviorStats = this.humanBehaviorSimulator.getStats();
    }

    return stats;
  }

  /**
   * Update stealth configuration
   * @param {Object} stealthConfig - New stealth configuration. An optional
   *   `humanBehavior` member is forwarded to the human behavior simulator and
   *   is not stored among the stealth-mode defaults.
   * @returns {void}
   */
  updateStealthConfig(stealthConfig = {}) {
    const { humanBehavior, ...stealthSettings } = stealthConfig;

    // Update default options
    this.defaultOptions.stealthMode = {
      ...this.defaultOptions.stealthMode,
      ...stealthSettings
    };

    // If human behavior simulator exists, update its config
    if (this.humanBehaviorSimulator && humanBehavior) {
      this.humanBehaviorSimulator.updateConfig(humanBehavior);
    }
  }

  /**
   * Enable stealth mode with specified level
   * @param {string} level - Stealth level ('basic', 'medium', 'advanced')
   * @returns {void}
   */
  enableStealthMode(level = 'medium') {
    this.defaultOptions.stealthMode.enabled = true;
    this.defaultOptions.stealthMode.level = level;
    this.defaultOptions.humanBehavior.enabled = true;
  }

  /**
   * Disable stealth mode
   * @returns {void}
   */
  disableStealthMode() {
    this.defaultOptions.stealthMode.enabled = false;
    this.defaultOptions.humanBehavior.enabled = false;
  }
}
985
+
986
+ export default BrowserProcessor;