@monostate/node-scraper 1.0.3 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 BNCA Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -22,13 +22,20 @@ pnpm add @monostate/node-scraper
22
22
  ### Basic Usage
23
23
 
24
24
  ```javascript
25
- import { smartScrape, BNCASmartScraper } from '@monostate/node-scraper';
25
+ import { smartScrape, smartScreenshot, quickShot } from '@monostate/node-scraper';
26
26
 
27
27
  // Simple one-line scraping
28
28
  const result = await smartScrape('https://example.com');
29
29
  console.log(result.content); // Extracted content
30
30
  console.log(result.method); // Method used: direct-fetch, lightpanda, or puppeteer
31
- console.log(result.performance.totalTime); // Time taken in ms
31
+
32
+ // Take a screenshot
33
+ const screenshot = await smartScreenshot('https://example.com');
34
+ console.log(screenshot.screenshot); // Base64-encoded PNG as a data URL (data:image/png;base64,...)
35
+
36
+ // Quick screenshot (optimized for speed)
37
+ const quick = await quickShot('https://example.com');
38
+ console.log(quick.screenshot); // Fast screenshot capture
32
39
  ```
33
40
 
34
41
  ### Advanced Usage
@@ -67,6 +74,10 @@ BNCA uses a sophisticated 3-tier fallback system:
67
74
  - **Performance**: Complete JavaScript execution
68
75
  - **Fallback triggers**: Complex interactions needed
69
76
 
77
+ ### 📸 Screenshot Methods
78
+ - **Chrome CLI**: Direct Chrome screenshot capture
79
+ - **Quickshot**: Optimized with retry logic and smart timeouts
80
+
70
81
  ## 📊 Performance Benchmark
71
82
 
72
83
  | Site Type | BNCA | Firecrawl | Speed Advantage |
@@ -79,12 +90,19 @@ BNCA uses a sophisticated 3-tier fallback system:
79
90
 
80
91
  ## 🎛️ API Reference
81
92
 
82
- ### `smartScrape(url, options?)`
93
+ ### Convenience Functions
94
+
95
+ #### `smartScrape(url, options?)`
96
+ Quick scraping with intelligent fallback.
83
97
 
84
- Convenience function for quick scraping.
98
+ #### `smartScreenshot(url, options?)`
99
+ Take a screenshot of any webpage.
100
+
101
+ #### `quickShot(url, options?)`
102
+ Optimized screenshot capture for maximum speed.
85
103
 
86
104
  **Parameters:**
87
- - `url` (string): URL to scrape
105
+ - `url` (string): URL to scrape/capture
88
106
  - `options` (object, optional): Configuration options
89
107
 
90
108
  **Returns:** Promise<ScrapingResult>
@@ -115,6 +133,24 @@ Scrape a URL with intelligent fallback.
115
133
  const result = await scraper.scrape('https://example.com');
116
134
  ```
117
135
 
136
+ ##### `scraper.screenshot(url, options?)`
137
+
138
+ Take a screenshot of a webpage.
139
+
140
+ ```javascript
141
+ const result = await scraper.screenshot('https://example.com');
142
+ const img = result.screenshot; // data:image/png;base64,...
143
+ ```
144
+
145
+ ##### `scraper.quickshot(url, options?)`
146
+
147
+ Quick screenshot capture - optimized for speed with retry logic.
148
+
149
+ ```javascript
150
+ const result = await scraper.quickshot('https://example.com');
151
+ // 2-3x faster than regular screenshot
152
+ ```
153
+
118
154
  ##### `scraper.getStats()`
119
155
 
120
156
  Get performance statistics.
package/index.d.ts CHANGED
@@ -25,13 +25,15 @@ export interface ScrapingResult {
25
25
  /** Size of the content in bytes */
26
26
  size?: number;
27
27
  /** Method used for scraping */
28
- method: 'direct-fetch' | 'lightpanda' | 'puppeteer' | 'failed' | 'error';
28
+ method: 'direct-fetch' | 'lightpanda' | 'puppeteer' | 'chrome-screenshot' | 'quickshot' | 'failed' | 'error';
29
29
  /** Whether browser rendering was needed */
30
30
  needsBrowser?: boolean;
31
31
  /** Content type from response headers */
32
32
  contentType?: string;
33
33
  /** Error message if scraping failed */
34
34
  error?: string;
35
+ /** Base64 encoded screenshot (if captured) */
36
+ screenshot?: string;
35
37
  /** Performance metrics */
36
38
  performance: {
37
39
  /** Total time taken in milliseconds */
@@ -129,6 +131,22 @@ export class BNCASmartScraper {
129
131
  */
130
132
  scrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
131
133
 
134
+ /**
135
+ * Take a screenshot of a webpage
+ *
+ * The index.js implementation shipped in this release reports
+ * method 'chrome-screenshot' and stores the image in the result's
+ * screenshot field as a data:image/png;base64,... string.
+ * Capture failures are reported through the resolved result
+ * (success set to false) rather than through a rejected promise.
136
+ * @param url The URL to capture
137
+ * @param options Optional configuration overrides
138
+ * @returns Promise resolving to screenshot result
139
+ */
140
+ screenshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
141
+
142
+ /**
143
+ * Quick screenshot capture - optimized for speed
+ *
+ * The index.js implementation shipped in this release reports
+ * method 'quickshot', retries a failed capture once, and stores the
+ * image in the result's screenshot field as a
+ * data:image/png;base64,... string. Capture failures are reported
+ * through the resolved result (success set to false).
144
+ * @param url The URL to capture
145
+ * @param options Optional configuration overrides
146
+ * @returns Promise resolving to screenshot result
147
+ */
148
+ quickshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
149
+
132
150
  /**
133
151
  * Get performance statistics for all methods
134
152
  * @returns Current statistics
@@ -214,6 +232,22 @@ export class BNCASmartScraper {
214
232
  */
215
233
  export function smartScrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
216
234
 
235
+ /**
236
+ * Convenience function for taking screenshots
+ *
+ * In index.js this constructs a new BNCASmartScraper from the given
+ * options on every call and returns the result of its screenshot()
+ * method, passing the same options again as per-call overrides.
237
+ * @param url The URL to capture
238
+ * @param options Optional configuration
239
+ * @returns Promise resolving to screenshot result
240
+ */
241
+ export function smartScreenshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
242
+
243
+ /**
244
+ * Convenience function for quick screenshot capture
+ *
+ * In index.js this constructs a new BNCASmartScraper from the given
+ * options on every call and returns the result of its quickshot()
+ * method, passing the same options again as per-call overrides.
245
+ * @param url The URL to capture
246
+ * @param options Optional configuration
247
+ * @returns Promise resolving to screenshot result
248
+ */
249
+ export function quickShot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
250
+
217
251
  /**
218
252
  * Default export - same as BNCASmartScraper class
219
253
  */
package/index.js CHANGED
@@ -3,6 +3,7 @@ import { spawn } from 'child_process';
3
3
  import fs from 'fs/promises';
4
4
  import path from 'path';
5
5
  import { fileURLToPath } from 'url';
6
+ import { promises as fsPromises } from 'fs';
6
7
 
7
8
  let puppeteer = null;
8
9
  try {
@@ -397,13 +398,17 @@ export class BNCASmartScraper {
397
398
  /__webpack_require__/i
398
399
  ];
399
400
 
400
- // Check for protection systems
401
+ // Check for protection systems (more specific patterns)
401
402
  const protectionIndicators = [
402
- /cloudflare/i,
403
+ /cloudflare.*challenge/i,
404
+ /cloudflare.*protection/i,
405
+ /ray id.*cloudflare/i,
403
406
  /please enable javascript/i,
404
407
  /you need to enable javascript/i,
405
408
  /this site requires javascript/i,
406
- /jscript.*required/i
409
+ /jscript.*required/i,
410
+ /security check.*cloudflare/i,
411
+ /attention required.*cloudflare/i
407
412
  ];
408
413
 
409
414
  // Check for minimal content (likely SPA)
@@ -447,7 +452,10 @@ export class BNCASmartScraper {
447
452
  if (/window\.__NEXT_DATA__/i.test(html)) {
448
453
  indicators.push('Next.js data detected');
449
454
  }
450
- if (/cloudflare/i.test(html)) {
455
+ if (/cloudflare.*challenge/i.test(html)) {
456
+ indicators.push('Cloudflare challenge detected');
457
+ }
458
+ if (/cloudflare.*protection/i.test(html)) {
451
459
  indicators.push('Cloudflare protection detected');
452
460
  }
453
461
  if (/please enable javascript/i.test(html)) {
@@ -590,6 +598,271 @@ export class BNCASmartScraper {
590
598
  }
591
599
  }
592
600
 
601
+ /**
602
+ * Take a screenshot of a webpage
+ *
+ * Merges this.options with the per-call options (per-call keys win),
+ * delegates the capture to takeScreenshotWithChrome(), and wraps the
+ * outcome in a result object together with the elapsed time.
+ *
+ * @param url - Page address; passed to Chrome as the last CLI argument.
+ * @param options - Per-call overrides layered on top of this.options.
+ * @returns An object with success, screenshot, method ('chrome-screenshot')
+ *   and performance.totalTime (milliseconds since the call started).
+ *   screenshot is a data:image/png;base64,... string, or null when the
+ *   capture produced no file, in which case success is false.
+ *
+ * NOTE(review): takeScreenshotWithChrome() catches its own errors and
+ * yields null (including the 'Chrome/Chromium not found' case), so the
+ * catch branch below is not reached for those failures and the caller
+ * gets success: false without an error field.
603
+ */
604
+ async screenshot(url, options = {}) {
605
+ const startTime = Date.now();
606
+ const config = { ...this.options, ...options };
607
+
608
+ this.log(`📸 Taking screenshot for: ${url}`);
609
+
610
+ try {
611
+ const screenshot = await this.takeScreenshotWithChrome(url, config);
612
+
613
+ return {
614
+ success: !!screenshot, // a null capture result becomes false
615
+ screenshot,
616
+ method: 'chrome-screenshot',
617
+ performance: {
618
+ totalTime: Date.now() - startTime
619
+ }
620
+ };
621
+ } catch (error) {
622
+ return {
623
+ success: false,
624
+ error: error.message,
625
+ method: 'chrome-screenshot',
626
+ performance: {
627
+ totalTime: Date.now() - startTime
628
+ }
629
+ };
630
+ }
631
+ }
632
+
633
+ /**
634
+ * Quick screenshot capture - optimized for speed
+ *
+ * Builds a config from this.options and the per-call options, delegates
+ * the capture to takeScreenshotOptimized() (which retries once), and
+ * wraps the outcome in a result object together with the elapsed time.
+ *
+ * @param url - Page address; passed to Chrome as the last CLI argument.
+ * @param options - Per-call overrides layered on top of this.options.
+ * @returns An object with success, screenshot, method ('quickshot') and
+ *   performance.totalTime (milliseconds since the call started).
+ *   screenshot is a data:image/png;base64,... string, or null when no
+ *   capture succeeded, in which case success is false.
+ *
+ * NOTE(review): the timeout key is written after both spreads, so a
+ * timeout configured only on this.options is replaced by 15000 whenever
+ * the caller passes no timeout of their own. In addition,
+ * takeScreenshotOptimized() as shown here never reads config.timeout;
+ * it uses its own fixed 8000/12000 ms process timeouts - confirm intent.
635
+ */
636
+ async quickshot(url, options = {}) {
637
+ const startTime = Date.now();
638
+ const config = {
639
+ ...this.options,
640
+ ...options,
641
+ timeout: options.timeout || 15000 // Longer timeout for screenshots
642
+ };
643
+
644
+ this.log(`⚡ Taking quick screenshot for: ${url}`);
645
+
646
+ try {
647
+ const screenshot = await this.takeScreenshotOptimized(url, config);
648
+
649
+ return {
650
+ success: !!screenshot, // a null capture result becomes false
651
+ screenshot,
652
+ method: 'quickshot',
653
+ performance: {
654
+ totalTime: Date.now() - startTime
655
+ }
656
+ };
657
+ } catch (error) {
658
+ return {
659
+ success: false,
660
+ error: error.message,
661
+ method: 'quickshot',
662
+ performance: {
663
+ totalTime: Date.now() - startTime
664
+ }
665
+ };
666
+ }
667
+ }
668
+
669
+ /**
670
+ * Take screenshot using Chrome CLI
+ *
+ * Spawns the Chrome binary located by findChromePath() in headless mode
+ * with --screenshot pointing at a uniquely named PNG, waits for the
+ * process to exit, reads the file back, deletes it, and yields the image
+ * as a data URL.
+ *
+ * @param url - Page address; appended as the last Chrome argument.
+ * @param config - Reads config.userAgent (for --user-agent) and
+ *   config.timeout (milliseconds before the process is killed; 15000
+ *   when falsy).
+ * @returns A data:image/png;base64,... string, or null on any failure
+ *   (no Chrome binary, spawn error, or the PNG could not be read).
+ *   This method does not reject: every error path yields null.
+ *
+ * NOTE(review): the temp directory is hard-coded to '/tmp' - confirm
+ * whether non-POSIX platforms need to be supported.
+ * NOTE(review): config.userAgent is concatenated without a check, so an
+ * undefined value produces the literal flag '--user-agent=undefined'.
+ * NOTE(review): the temp file is only unlinked after a successful read.
671
+ */
672
+ async takeScreenshotWithChrome(url, config) {
673
+ const tempFile = path.join('/tmp', `screenshot_${Date.now()}_${Math.random().toString(36).substring(7)}.png`); // timestamp + random suffix keeps concurrent captures apart
674
+
675
+ try {
676
+ const args = [
677
+ '--headless=new',
678
+ '--disable-gpu',
679
+ '--no-sandbox',
680
+ '--disable-setuid-sandbox',
681
+ '--disable-dev-shm-usage',
682
+ '--disable-blink-features=AutomationControlled',
683
+ '--user-agent=' + config.userAgent,
684
+ '--screenshot=' + tempFile,
685
+ '--window-size=1280,800',
686
+ '--hide-scrollbars',
687
+ '--virtual-time-budget=10000',
688
+ url
689
+ ];
690
+
691
+ const chromePath = await this.findChromePath();
692
+ if (!chromePath) {
693
+ throw new Error('Chrome/Chromium not found'); // caught by the outer catch below, so callers only ever see null
694
+ }
695
+
696
+ return new Promise((resolve) => {
697
+ const chrome = spawn(chromePath, args, {
698
+ stdio: ['ignore', 'pipe', 'pipe'], // stdout is piped but no reader is attached in this block
699
+ detached: false
700
+ });
701
+
702
+ let processExited = false;
703
+ let stderr = '';
704
+
705
+ chrome.stderr.on('data', (data) => {
706
+ stderr += data.toString(); // NOTE(review): accumulated but never read or logged in this block
707
+ });
708
+
709
+ const killTimeout = setTimeout(() => {
710
+ if (!processExited) {
711
+ this.log('Chrome timeout, sending SIGTERM...');
712
+ chrome.kill('SIGTERM');
713
+
714
+ setTimeout(() => {
715
+ if (!processExited) {
716
+ chrome.kill('SIGKILL'); // escalate if SIGTERM did not end the process within 1000 ms
717
+ }
718
+ }, 1000);
719
+ }
720
+ }, config.timeout || 15000);
721
+
722
+ chrome.on('exit', async (code, signal) => {
723
+ processExited = true;
724
+ clearTimeout(killTimeout);
725
+
726
+ try {
727
+ await new Promise(r => setTimeout(r, 500)); // fixed 500 ms pause before reading; presumably lets the PNG finish flushing - confirm
728
+ const screenshotBuffer = await fsPromises.readFile(tempFile);
729
+ const base64 = screenshotBuffer.toString('base64');
730
+ await fsPromises.unlink(tempFile).catch(() => {}); // cleanup is best-effort; unlink errors are ignored
731
+ resolve(`data:image/png;base64,${base64}`);
732
+ } catch (error) {
733
+ resolve(null); // exit code and signal are not inspected; a missing or unreadable file is the only failure signal
734
+ }
735
+ });
736
+
737
+ chrome.on('error', (error) => {
738
+ clearTimeout(killTimeout);
739
+ resolve(null);
740
+ });
741
+ });
742
+ } catch (error) {
743
+ return null;
744
+ }
745
+ }
746
+
747
+ /**
748
+ * Optimized screenshot for speed
+ *
+ * Same capture flow as takeScreenshotWithChrome() but with extra Chrome
+ * flags, a shorter virtual-time budget, and a single retry: the first
+ * attempt uses a 5000 ms budget and an 8000 ms process timeout, the
+ * retry uses 8000 ms and 12000 ms. Each attempt writes to its own
+ * freshly named temp file.
+ *
+ * @param url - Page address; appended as the last Chrome argument.
+ * @param config - Only config.userAgent is read in this method.
+ * @param retryCount - 0 for the first attempt, 1 for the retry; callers
+ *   outside this method are expected to omit it.
+ * @returns A data:image/png;base64,... string, or null when both
+ *   attempts fail. This method does not reject.
+ *
+ * NOTE(review): config.timeout is not consulted here, so the timeout
+ * prepared by quickshot() has no effect on the process timeout.
+ * NOTE(review): the temp directory is hard-coded to '/tmp', and an
+ * undefined config.userAgent yields '--user-agent=undefined'.
+ * NOTE(review): a spawn 'error' event resolves null without retrying,
+ * whereas a failed file read or a thrown error does retry once.
749
+ */
750
+ async takeScreenshotOptimized(url, config, retryCount = 0) {
751
+ const tempFile = path.join('/tmp', `screenshot_${Date.now()}_${Math.random().toString(36).substring(7)}.png`);
752
+
753
+ try {
754
+ const virtualTimeBudget = retryCount === 0 ? 5000 : 8000; // the retry gets a larger budget
755
+ const processTimeout = retryCount === 0 ? 8000 : 12000; // kill deadline for the Chrome process, in ms
756
+
757
+ const args = [
758
+ '--headless=new',
759
+ '--disable-gpu',
760
+ '--no-sandbox',
761
+ '--disable-setuid-sandbox',
762
+ '--disable-dev-shm-usage',
763
+ '--disable-blink-features=AutomationControlled',
764
+ '--disable-features=TranslateUI',
765
+ '--disable-extensions',
766
+ '--disable-default-apps',
767
+ '--disable-sync',
768
+ '--metrics-recording-only',
769
+ '--mute-audio',
770
+ '--no-first-run',
771
+ '--disable-background-timer-throttling',
772
+ '--disable-backgrounding-occluded-windows',
773
+ '--disable-renderer-backgrounding',
774
+ '--user-agent=' + config.userAgent,
775
+ '--screenshot=' + tempFile,
776
+ '--window-size=1280,800',
777
+ '--hide-scrollbars',
778
+ '--run-all-compositor-stages-before-draw',
779
+ `--virtual-time-budget=${virtualTimeBudget}`,
780
+ url
781
+ ];
782
+
783
+ const chromePath = await this.findChromePath();
784
+ if (!chromePath) {
785
+ throw new Error('Chrome/Chromium not found'); // caught below; on the first attempt this triggers a retry that repeats the lookup
786
+ }
787
+
788
+ return new Promise((resolve) => {
789
+ const chrome = spawn(chromePath, args, {
790
+ stdio: ['ignore', 'pipe', 'pipe'], // both pipes are created but no reader is attached in this block
791
+ detached: false
792
+ });
793
+
794
+ let processExited = false;
795
+
796
+ const killTimeout = setTimeout(() => {
797
+ if (!processExited) {
798
+ chrome.kill('SIGTERM');
799
+ setTimeout(() => {
800
+ if (!processExited) {
801
+ chrome.kill('SIGKILL'); // escalate if SIGTERM did not end the process within 1000 ms
802
+ }
803
+ }, 1000);
804
+ }
805
+ }, processTimeout);
806
+
807
+ chrome.on('exit', async (code, signal) => {
808
+ processExited = true;
809
+ clearTimeout(killTimeout);
810
+
811
+ try {
812
+ await new Promise(r => setTimeout(r, 500)); // fixed 500 ms pause before reading; presumably lets the PNG finish flushing - confirm
813
+ const screenshotBuffer = await fsPromises.readFile(tempFile);
814
+ const base64 = screenshotBuffer.toString('base64');
815
+ await fsPromises.unlink(tempFile).catch(() => {}); // cleanup is best-effort; unlink errors are ignored
816
+ resolve(`data:image/png;base64,${base64}`);
817
+ } catch (error) {
818
+ if (retryCount === 0) {
819
+ const retryResult = await this.takeScreenshotOptimized(url, config, 1); // one retry only; the nested call runs with retryCount 1
820
+ resolve(retryResult);
821
+ } else {
822
+ resolve(null);
823
+ }
824
+ }
825
+ });
826
+
827
+ chrome.on('error', (error) => {
828
+ clearTimeout(killTimeout);
829
+ resolve(null);
830
+ });
831
+ });
832
+ } catch (error) {
833
+ if (retryCount === 0) {
834
+ return this.takeScreenshotOptimized(url, config, 1);
835
+ }
836
+ return null;
837
+ }
838
+ }
839
+
840
+ /**
841
+ * Find Chrome/Chromium binary path
+ *
+ * Probes a fixed list of install locations in order and yields the first
+ * one that fsPromises.access() accepts. macOS ('darwin') gets the
+ * /Applications bundle paths; every other platform gets the /usr/bin
+ * list.
+ *
+ * @returns The first accessible candidate path, or null when none is.
+ *
+ * NOTE(review): access() is called without a mode, which checks that the
+ * path is visible to the process, not that it is executable.
+ * NOTE(review): no Windows locations and no environment-variable override
+ * are considered here - confirm whether that is intended.
842
+ */
843
+ async findChromePath() {
844
+ const chromePaths = process.platform === 'darwin' ? [
845
+ '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
846
+ '/Applications/Chromium.app/Contents/MacOS/Chromium',
847
+ ] : [
848
+ '/usr/bin/chromium-browser',
849
+ '/usr/bin/chromium',
850
+ '/usr/bin/google-chrome-stable',
851
+ '/usr/bin/google-chrome',
852
+ ];
853
+
854
+ for (const path of chromePaths) { // NOTE(review): this loop variable shadows the imported 'path' module inside the loop
855
+ try {
856
+ await fsPromises.access(path);
857
+ return path;
858
+ } catch (e) {
859
+ continue; // candidate not accessible; try the next one
860
+ }
861
+ }
862
+
863
+ return null;
864
+ }
865
+
593
866
  /**
594
867
  * Health check for all scraping methods
595
868
  */
@@ -630,7 +903,7 @@ export class BNCASmartScraper {
630
903
  }
631
904
  }
632
905
 
633
- // Export convenience function
906
+ // Export convenience functions
634
907
  export async function smartScrape(url, options = {}) {
635
908
  const scraper = new BNCASmartScraper(options);
636
909
  try {
@@ -643,4 +916,24 @@ export async function smartScrape(url, options = {}) {
643
916
  }
644
917
  }
645
918
 
919
+ export async function smartScreenshot(url, options = {}) { // One-shot helper: builds a fresh BNCASmartScraper per call and returns its screenshot() result
920
+ const scraper = new BNCASmartScraper(options);
921
+ try {
922
+ const result = await scraper.screenshot(url, options); // the same options object goes to both the constructor and the call
923
+ return result;
924
+ } catch (error) {
925
+ throw error; // NOTE(review): rethrows unchanged, so this try/catch adds no handling
926
+ }
927
+ }
928
+
929
+ export async function quickShot(url, options = {}) { // One-shot helper: builds a fresh BNCASmartScraper per call and returns its quickshot() result
930
+ const scraper = new BNCASmartScraper(options);
931
+ try {
932
+ const result = await scraper.quickshot(url, options); // the same options object goes to both the constructor and the call
933
+ return result;
934
+ } catch (error) {
935
+ throw error; // NOTE(review): rethrows unchanged, so this try/catch adds no handling
936
+ }
937
+ }
938
+
646
939
  export default BNCASmartScraper;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@monostate/node-scraper",
3
- "version": "1.0.3",
3
+ "version": "1.1.1",
4
4
  "description": "Intelligent web scraping with multi-level fallback system - 11.35x faster than Firecrawl",
5
5
  "type": "module",
6
6
  "main": "index.js",