@monostate/node-scraper 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/README.md +41 -5
  2. package/index.d.ts +35 -1
  3. package/index.js +287 -1
  4. package/package.json +1 -1
package/README.md CHANGED
@@ -22,13 +22,20 @@ pnpm add @monostate/node-scraper
22
22
  ### Basic Usage
23
23
 
24
24
  ```javascript
25
- import { smartScrape, BNCASmartScraper } from '@monostate/node-scraper';
25
+ import { smartScrape, smartScreenshot, quickShot } from '@monostate/node-scraper';
26
26
 
27
27
  // Simple one-line scraping
28
28
  const result = await smartScrape('https://example.com');
29
29
  console.log(result.content); // Extracted content
30
30
  console.log(result.method); // Method used: direct-fetch, lightpanda, or puppeteer
31
- console.log(result.performance.totalTime); // Time taken in ms
31
+
32
+ // Take a screenshot
33
+ const screenshot = await smartScreenshot('https://example.com');
34
+ console.log(screenshot.screenshot); // PNG data URL (data:image/png;base64,...)
35
+
36
+ // Quick screenshot (optimized for speed)
37
+ const quick = await quickShot('https://example.com');
38
+ console.log(quick.screenshot); // Fast screenshot capture
32
39
  ```
33
40
 
34
41
  ### Advanced Usage
@@ -67,6 +74,10 @@ BNCA uses a sophisticated 3-tier fallback system:
67
74
  - **Performance**: Complete JavaScript execution
68
75
  - **Fallback triggers**: Complex interactions needed
69
76
 
77
+ ### 📸 Screenshot Methods
78
+ - **Chrome CLI**: Direct Chrome screenshot capture
79
+ - **Quickshot**: Optimized with retry logic and smart timeouts
80
+
70
81
  ## 📊 Performance Benchmark
71
82
 
72
83
  | Site Type | BNCA | Firecrawl | Speed Advantage |
@@ -79,12 +90,19 @@ BNCA uses a sophisticated 3-tier fallback system:
79
90
 
80
91
  ## 🎛️ API Reference
81
92
 
82
- ### `smartScrape(url, options?)`
93
+ ### Convenience Functions
94
+
95
+ #### `smartScrape(url, options?)`
96
+ Quick scraping with intelligent fallback.
83
97
 
84
- Convenience function for quick scraping.
98
+ #### `smartScreenshot(url, options?)`
99
+ Take a screenshot of any webpage.
100
+
101
+ #### `quickShot(url, options?)`
102
+ Optimized screenshot capture for maximum speed.
85
103
 
86
104
  **Parameters:**
87
- - `url` (string): URL to scrape
105
+ - `url` (string): URL to scrape/capture
88
106
  - `options` (object, optional): Configuration options
89
107
 
90
108
  **Returns:** Promise<ScrapingResult>
@@ -115,6 +133,24 @@ Scrape a URL with intelligent fallback.
115
133
  const result = await scraper.scrape('https://example.com');
116
134
  ```
117
135
 
136
+ ##### `scraper.screenshot(url, options?)`
137
+
138
+ Take a screenshot of a webpage.
139
+
140
+ ```javascript
141
+ const result = await scraper.screenshot('https://example.com');
142
+ const img = result.screenshot; // data:image/png;base64,...
143
+ ```
144
+
145
+ ##### `scraper.quickshot(url, options?)`
146
+
147
+ Quick screenshot capture - optimized for speed with retry logic.
148
+
149
+ ```javascript
150
+ const result = await scraper.quickshot('https://example.com');
151
+ // 2-3x faster than regular screenshot
152
+ ```
153
+
118
154
  ##### `scraper.getStats()`
119
155
 
120
156
  Get performance statistics.
package/index.d.ts CHANGED
@@ -25,13 +25,15 @@ export interface ScrapingResult {
25
25
  /** Size of the content in bytes */
26
26
  size?: number;
27
27
  /** Method used for scraping */
28
- method: 'direct-fetch' | 'lightpanda' | 'puppeteer' | 'failed' | 'error';
28
+ method: 'direct-fetch' | 'lightpanda' | 'puppeteer' | 'chrome-screenshot' | 'quickshot' | 'failed' | 'error';
29
29
  /** Whether browser rendering was needed */
30
30
  needsBrowser?: boolean;
31
31
  /** Content type from response headers */
32
32
  contentType?: string;
33
33
  /** Error message if scraping failed */
34
34
  error?: string;
35
+ /** Screenshot as a PNG data URL (`data:image/png;base64,...`), if captured */
36
+ screenshot?: string;
35
37
  /** Performance metrics */
36
38
  performance: {
37
39
  /** Total time taken in milliseconds */
@@ -129,6 +131,22 @@ export class BNCASmartScraper {
129
131
  */
130
132
  scrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
131
133
 
134
+ /**
135
+ * Capture a screenshot of a webpage via the Chrome CLI
136
+ * @param url The URL to capture
137
+ * @param options Optional configuration overrides, merged over the instance options for this call
138
+ * @returns Promise resolving to a result whose `screenshot` is a PNG data URL; failures resolve with `success: false` instead of rejecting
139
+ */
140
+ screenshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
141
+
142
+ /**
143
+ * Quick screenshot capture - shorter Chrome time budget, with one automatic retry if no image is produced
144
+ * @param url The URL to capture
145
+ * @param options Optional configuration overrides, merged over the instance options for this call
146
+ * @returns Promise resolving to a result whose `screenshot` is a PNG data URL; failures resolve with `success: false` instead of rejecting
147
+ */
148
+ quickshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
149
+
132
150
  /**
133
151
  * Get performance statistics for all methods
134
152
  * @returns Current statistics
@@ -214,6 +232,22 @@ export class BNCASmartScraper {
214
232
  */
215
233
  export function smartScrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
216
234
 
235
+ /**
236
+ * Convenience function for taking screenshots: builds a one-off BNCASmartScraper and calls its `screenshot` method
237
+ * @param url The URL to capture
238
+ * @param options Optional configuration, passed to both the scraper constructor and the `screenshot` call
239
+ * @returns Promise resolving to the screenshot result (`screenshot` holds a PNG data URL on success)
240
+ */
241
+ export function smartScreenshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
242
+
243
+ /**
244
+ * Convenience function for quick screenshot capture: builds a one-off BNCASmartScraper and calls its `quickshot` method
245
+ * @param url The URL to capture
246
+ * @param options Optional configuration, passed to both the scraper constructor and the `quickshot` call
247
+ * @returns Promise resolving to the screenshot result (`screenshot` holds a PNG data URL on success)
248
+ */
249
+ export function quickShot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
250
+
217
251
  /**
218
252
  * Default export - same as BNCASmartScraper class
219
253
  */
package/index.js CHANGED
@@ -3,6 +3,7 @@ import { spawn } from 'child_process';
3
3
  import fs from 'fs/promises';
4
4
  import path from 'path';
5
5
  import { fileURLToPath } from 'url';
6
+ import { promises as fsPromises } from 'fs';
6
7
 
7
8
  let puppeteer = null;
8
9
  try {
@@ -590,6 +591,271 @@ export class BNCASmartScraper {
590
591
  }
591
592
  }
592
593
 
594
+ /**
595
+ * Take a screenshot of a webpage
596
+ */
597
+ async screenshot(url, options = {}) {
598
+ const startTime = Date.now();
599
+ const config = { ...this.options, ...options };
600
+
601
+ this.log(`📸 Taking screenshot for: ${url}`);
602
+
603
+ try {
604
+ const screenshot = await this.takeScreenshotWithChrome(url, config);
605
+
606
+ return {
607
+ success: !!screenshot,
608
+ screenshot,
609
+ method: 'chrome-screenshot',
610
+ performance: {
611
+ totalTime: Date.now() - startTime
612
+ }
613
+ };
614
+ } catch (error) {
615
+ return {
616
+ success: false,
617
+ error: error.message,
618
+ method: 'chrome-screenshot',
619
+ performance: {
620
+ totalTime: Date.now() - startTime
621
+ }
622
+ };
623
+ }
624
+ }
625
+
626
+ /**
627
+ * Quick screenshot capture - optimized for speed
628
+ */
629
+ async quickshot(url, options = {}) {
630
+ const startTime = Date.now();
631
+ const config = {
632
+ ...this.options,
633
+ ...options,
634
+ timeout: options.timeout || 15000 // Longer timeout for screenshots
635
+ };
636
+
637
+ this.log(`⚡ Taking quick screenshot for: ${url}`);
638
+
639
+ try {
640
+ const screenshot = await this.takeScreenshotOptimized(url, config);
641
+
642
+ return {
643
+ success: !!screenshot,
644
+ screenshot,
645
+ method: 'quickshot',
646
+ performance: {
647
+ totalTime: Date.now() - startTime
648
+ }
649
+ };
650
+ } catch (error) {
651
+ return {
652
+ success: false,
653
+ error: error.message,
654
+ method: 'quickshot',
655
+ performance: {
656
+ totalTime: Date.now() - startTime
657
+ }
658
+ };
659
+ }
660
+ }
661
+
662
+ /**
663
+ * Take screenshot using Chrome CLI
664
+ */
665
+ async takeScreenshotWithChrome(url, config) {
666
+ const tempFile = path.join('/tmp', `screenshot_${Date.now()}_${Math.random().toString(36).substring(7)}.png`);
667
+
668
+ try {
669
+ const args = [
670
+ '--headless=new',
671
+ '--disable-gpu',
672
+ '--no-sandbox',
673
+ '--disable-setuid-sandbox',
674
+ '--disable-dev-shm-usage',
675
+ '--disable-blink-features=AutomationControlled',
676
+ '--user-agent=' + config.userAgent,
677
+ '--screenshot=' + tempFile,
678
+ '--window-size=1280,800',
679
+ '--hide-scrollbars',
680
+ '--virtual-time-budget=10000',
681
+ url
682
+ ];
683
+
684
+ const chromePath = await this.findChromePath();
685
+ if (!chromePath) {
686
+ throw new Error('Chrome/Chromium not found');
687
+ }
688
+
689
+ return new Promise((resolve) => {
690
+ const chrome = spawn(chromePath, args, {
691
+ stdio: ['ignore', 'pipe', 'pipe'],
692
+ detached: false
693
+ });
694
+
695
+ let processExited = false;
696
+ let stderr = '';
697
+
698
+ chrome.stderr.on('data', (data) => {
699
+ stderr += data.toString();
700
+ });
701
+
702
+ const killTimeout = setTimeout(() => {
703
+ if (!processExited) {
704
+ this.log('Chrome timeout, sending SIGTERM...');
705
+ chrome.kill('SIGTERM');
706
+
707
+ setTimeout(() => {
708
+ if (!processExited) {
709
+ chrome.kill('SIGKILL');
710
+ }
711
+ }, 1000);
712
+ }
713
+ }, config.timeout || 15000);
714
+
715
+ chrome.on('exit', async (code, signal) => {
716
+ processExited = true;
717
+ clearTimeout(killTimeout);
718
+
719
+ try {
720
+ await new Promise(r => setTimeout(r, 500));
721
+ const screenshotBuffer = await fsPromises.readFile(tempFile);
722
+ const base64 = screenshotBuffer.toString('base64');
723
+ await fsPromises.unlink(tempFile).catch(() => {});
724
+ resolve(`data:image/png;base64,${base64}`);
725
+ } catch (error) {
726
+ resolve(null);
727
+ }
728
+ });
729
+
730
+ chrome.on('error', (error) => {
731
+ clearTimeout(killTimeout);
732
+ resolve(null);
733
+ });
734
+ });
735
+ } catch (error) {
736
+ return null;
737
+ }
738
+ }
739
+
740
+ /**
741
+ * Optimized screenshot for speed
742
+ */
743
+ async takeScreenshotOptimized(url, config, retryCount = 0) {
744
+ const tempFile = path.join('/tmp', `screenshot_${Date.now()}_${Math.random().toString(36).substring(7)}.png`);
745
+
746
+ try {
747
+ const virtualTimeBudget = retryCount === 0 ? 5000 : 8000;
748
+ const processTimeout = retryCount === 0 ? 8000 : 12000;
749
+
750
+ const args = [
751
+ '--headless=new',
752
+ '--disable-gpu',
753
+ '--no-sandbox',
754
+ '--disable-setuid-sandbox',
755
+ '--disable-dev-shm-usage',
756
+ '--disable-blink-features=AutomationControlled',
757
+ '--disable-features=TranslateUI',
758
+ '--disable-extensions',
759
+ '--disable-default-apps',
760
+ '--disable-sync',
761
+ '--metrics-recording-only',
762
+ '--mute-audio',
763
+ '--no-first-run',
764
+ '--disable-background-timer-throttling',
765
+ '--disable-backgrounding-occluded-windows',
766
+ '--disable-renderer-backgrounding',
767
+ '--user-agent=' + config.userAgent,
768
+ '--screenshot=' + tempFile,
769
+ '--window-size=1280,800',
770
+ '--hide-scrollbars',
771
+ '--run-all-compositor-stages-before-draw',
772
+ `--virtual-time-budget=${virtualTimeBudget}`,
773
+ url
774
+ ];
775
+
776
+ const chromePath = await this.findChromePath();
777
+ if (!chromePath) {
778
+ throw new Error('Chrome/Chromium not found');
779
+ }
780
+
781
+ return new Promise((resolve) => {
782
+ const chrome = spawn(chromePath, args, {
783
+ stdio: ['ignore', 'pipe', 'pipe'],
784
+ detached: false
785
+ });
786
+
787
+ let processExited = false;
788
+
789
+ const killTimeout = setTimeout(() => {
790
+ if (!processExited) {
791
+ chrome.kill('SIGTERM');
792
+ setTimeout(() => {
793
+ if (!processExited) {
794
+ chrome.kill('SIGKILL');
795
+ }
796
+ }, 1000);
797
+ }
798
+ }, processTimeout);
799
+
800
+ chrome.on('exit', async (code, signal) => {
801
+ processExited = true;
802
+ clearTimeout(killTimeout);
803
+
804
+ try {
805
+ await new Promise(r => setTimeout(r, 500));
806
+ const screenshotBuffer = await fsPromises.readFile(tempFile);
807
+ const base64 = screenshotBuffer.toString('base64');
808
+ await fsPromises.unlink(tempFile).catch(() => {});
809
+ resolve(`data:image/png;base64,${base64}`);
810
+ } catch (error) {
811
+ if (retryCount === 0) {
812
+ const retryResult = await this.takeScreenshotOptimized(url, config, 1);
813
+ resolve(retryResult);
814
+ } else {
815
+ resolve(null);
816
+ }
817
+ }
818
+ });
819
+
820
+ chrome.on('error', (error) => {
821
+ clearTimeout(killTimeout);
822
+ resolve(null);
823
+ });
824
+ });
825
+ } catch (error) {
826
+ if (retryCount === 0) {
827
+ return this.takeScreenshotOptimized(url, config, 1);
828
+ }
829
+ return null;
830
+ }
831
+ }
832
+
833
+ /**
834
+ * Find Chrome/Chromium binary path
835
+ */
836
+ async findChromePath() {
837
+ const chromePaths = process.platform === 'darwin' ? [
838
+ '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
839
+ '/Applications/Chromium.app/Contents/MacOS/Chromium',
840
+ ] : [
841
+ '/usr/bin/chromium-browser',
842
+ '/usr/bin/chromium',
843
+ '/usr/bin/google-chrome-stable',
844
+ '/usr/bin/google-chrome',
845
+ ];
846
+
847
+ for (const path of chromePaths) {
848
+ try {
849
+ await fsPromises.access(path);
850
+ return path;
851
+ } catch (e) {
852
+ continue;
853
+ }
854
+ }
855
+
856
+ return null;
857
+ }
858
+
593
859
  /**
594
860
  * Health check for all scraping methods
595
861
  */
@@ -630,7 +896,7 @@ export class BNCASmartScraper {
630
896
  }
631
897
  }
632
898
 
633
- // Export convenience function
899
+ // Export convenience functions
634
900
  export async function smartScrape(url, options = {}) {
635
901
  const scraper = new BNCASmartScraper(options);
636
902
  try {
@@ -643,4 +909,24 @@ export async function smartScrape(url, options = {}) {
643
909
  }
644
910
  }
645
911
 
912
/**
 * One-shot screenshot helper: builds a throwaway scraper from `options` and
 * delegates to its `screenshot` method with the same `options`.
 *
 * @param {string} url - Page to capture.
 * @param {object} [options] - Used both as constructor options and per-call overrides.
 * @returns {Promise<object>} Whatever `BNCASmartScraper#screenshot` resolves to.
 */
export async function smartScreenshot(url, options = {}) {
  const instance = new BNCASmartScraper(options);
  // Any rejection propagates to the caller unchanged.
  return instance.screenshot(url, options);
}
921
+
922
/**
 * One-shot quick-screenshot helper: builds a throwaway scraper from `options`
 * and delegates to its `quickshot` method with the same `options`.
 *
 * @param {string} url - Page to capture.
 * @param {object} [options] - Used both as constructor options and per-call overrides.
 * @returns {Promise<object>} Whatever `BNCASmartScraper#quickshot` resolves to.
 */
export async function quickShot(url, options = {}) {
  const instance = new BNCASmartScraper(options);
  // Any rejection propagates to the caller unchanged.
  return instance.quickshot(url, options);
}
931
+
646
932
  export default BNCASmartScraper;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@monostate/node-scraper",
3
- "version": "1.0.2",
3
+ "version": "1.1.0",
4
4
  "description": "Intelligent web scraping with multi-level fallback system - 11.35x faster than Firecrawl",
5
5
  "type": "module",
6
6
  "main": "index.js",