@monostate/node-scraper 1.0.3 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -5
- package/index.d.ts +35 -1
- package/index.js +287 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -22,13 +22,20 @@ pnpm add @monostate/node-scraper
|
|
|
22
22
|
### Basic Usage
|
|
23
23
|
|
|
24
24
|
```javascript
|
|
25
|
-
import { smartScrape,
|
|
25
|
+
import { smartScrape, smartScreenshot, quickShot } from '@monostate/node-scraper';
|
|
26
26
|
|
|
27
27
|
// Simple one-line scraping
|
|
28
28
|
const result = await smartScrape('https://example.com');
|
|
29
29
|
console.log(result.content); // Extracted content
|
|
30
30
|
console.log(result.method); // Method used: direct-fetch, lightpanda, or puppeteer
|
|
31
|
-
|
|
31
|
+
|
|
32
|
+
// Take a screenshot
|
|
33
|
+
const screenshot = await smartScreenshot('https://example.com');
|
|
34
|
+
console.log(screenshot.screenshot); // Base64 encoded image
|
|
35
|
+
|
|
36
|
+
// Quick screenshot (optimized for speed)
|
|
37
|
+
const quick = await quickShot('https://example.com');
|
|
38
|
+
console.log(quick.screenshot); // Fast screenshot capture
|
|
32
39
|
```
|
|
33
40
|
|
|
34
41
|
### Advanced Usage
|
|
@@ -67,6 +74,10 @@ BNCA uses a sophisticated 3-tier fallback system:
|
|
|
67
74
|
- **Performance**: Complete JavaScript execution
|
|
68
75
|
- **Fallback triggers**: Complex interactions needed
|
|
69
76
|
|
|
77
|
+
### 📸 Screenshot Methods
|
|
78
|
+
- **Chrome CLI**: Direct Chrome screenshot capture
|
|
79
|
+
- **Quickshot**: Optimized with retry logic and smart timeouts
|
|
80
|
+
|
|
70
81
|
## 📊 Performance Benchmark
|
|
71
82
|
|
|
72
83
|
| Site Type | BNCA | Firecrawl | Speed Advantage |
|
|
@@ -79,12 +90,19 @@ BNCA uses a sophisticated 3-tier fallback system:
|
|
|
79
90
|
|
|
80
91
|
## 🎛️ API Reference
|
|
81
92
|
|
|
82
|
-
###
|
|
93
|
+
### Convenience Functions
|
|
94
|
+
|
|
95
|
+
#### `smartScrape(url, options?)`
|
|
96
|
+
Quick scraping with intelligent fallback.
|
|
83
97
|
|
|
84
|
-
|
|
98
|
+
#### `smartScreenshot(url, options?)`
|
|
99
|
+
Take a screenshot of any webpage.
|
|
100
|
+
|
|
101
|
+
#### `quickShot(url, options?)`
|
|
102
|
+
Optimized screenshot capture for maximum speed.
|
|
85
103
|
|
|
86
104
|
**Parameters:**
|
|
87
|
-
- `url` (string): URL to scrape
|
|
105
|
+
- `url` (string): URL to scrape/capture
|
|
88
106
|
- `options` (object, optional): Configuration options
|
|
89
107
|
|
|
90
108
|
**Returns:** Promise<ScrapingResult>
|
|
@@ -115,6 +133,24 @@ Scrape a URL with intelligent fallback.
|
|
|
115
133
|
const result = await scraper.scrape('https://example.com');
|
|
116
134
|
```
|
|
117
135
|
|
|
136
|
+
##### `scraper.screenshot(url, options?)`
|
|
137
|
+
|
|
138
|
+
Take a screenshot of a webpage.
|
|
139
|
+
|
|
140
|
+
```javascript
|
|
141
|
+
const result = await scraper.screenshot('https://example.com');
|
|
142
|
+
const img = result.screenshot; // data:image/png;base64,...
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
##### `scraper.quickshot(url, options?)`
|
|
146
|
+
|
|
147
|
+
Quick screenshot capture - optimized for speed with retry logic.
|
|
148
|
+
|
|
149
|
+
```javascript
|
|
150
|
+
const result = await scraper.quickshot('https://example.com');
|
|
151
|
+
// 2-3x faster than regular screenshot
|
|
152
|
+
```
|
|
153
|
+
|
|
118
154
|
##### `scraper.getStats()`
|
|
119
155
|
|
|
120
156
|
Get performance statistics.
|
package/index.d.ts
CHANGED
|
@@ -25,13 +25,15 @@ export interface ScrapingResult {
|
|
|
25
25
|
/** Size of the content in bytes */
|
|
26
26
|
size?: number;
|
|
27
27
|
/** Method used for scraping */
|
|
28
|
-
method: 'direct-fetch' | 'lightpanda' | 'puppeteer' | 'failed' | 'error';
|
|
28
|
+
method: 'direct-fetch' | 'lightpanda' | 'puppeteer' | 'chrome-screenshot' | 'quickshot' | 'failed' | 'error';
|
|
29
29
|
/** Whether browser rendering was needed */
|
|
30
30
|
needsBrowser?: boolean;
|
|
31
31
|
/** Content type from response headers */
|
|
32
32
|
contentType?: string;
|
|
33
33
|
/** Error message if scraping failed */
|
|
34
34
|
error?: string;
|
|
35
|
+
/** Base64 encoded screenshot (if captured) */
|
|
36
|
+
screenshot?: string;
|
|
35
37
|
/** Performance metrics */
|
|
36
38
|
performance: {
|
|
37
39
|
/** Total time taken in milliseconds */
|
|
@@ -129,6 +131,22 @@ export class BNCASmartScraper {
|
|
|
129
131
|
*/
|
|
130
132
|
scrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
|
|
131
133
|
|
|
134
|
+
/**
|
|
135
|
+
* Take a screenshot of a webpage
|
|
136
|
+
* @param url The URL to capture
|
|
137
|
+
* @param options Optional configuration overrides
|
|
138
|
+
* @returns Promise resolving to screenshot result
|
|
139
|
+
*/
|
|
140
|
+
screenshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Quick screenshot capture - optimized for speed
|
|
144
|
+
* @param url The URL to capture
|
|
145
|
+
* @param options Optional configuration overrides
|
|
146
|
+
* @returns Promise resolving to screenshot result
|
|
147
|
+
*/
|
|
148
|
+
quickshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
|
|
149
|
+
|
|
132
150
|
/**
|
|
133
151
|
* Get performance statistics for all methods
|
|
134
152
|
* @returns Current statistics
|
|
@@ -214,6 +232,22 @@ export class BNCASmartScraper {
|
|
|
214
232
|
*/
|
|
215
233
|
export function smartScrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
|
|
216
234
|
|
|
235
|
+
/**
|
|
236
|
+
* Convenience function for taking screenshots
|
|
237
|
+
* @param url The URL to capture
|
|
238
|
+
* @param options Optional configuration
|
|
239
|
+
* @returns Promise resolving to screenshot result
|
|
240
|
+
*/
|
|
241
|
+
export function smartScreenshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
|
|
242
|
+
|
|
243
|
+
/**
|
|
244
|
+
* Convenience function for quick screenshot capture
|
|
245
|
+
* @param url The URL to capture
|
|
246
|
+
* @param options Optional configuration
|
|
247
|
+
* @returns Promise resolving to screenshot result
|
|
248
|
+
*/
|
|
249
|
+
export function quickShot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
|
|
250
|
+
|
|
217
251
|
/**
|
|
218
252
|
* Default export - same as BNCASmartScraper class
|
|
219
253
|
*/
|
package/index.js
CHANGED
|
@@ -3,6 +3,7 @@ import { spawn } from 'child_process';
|
|
|
3
3
|
import fs from 'fs/promises';
|
|
4
4
|
import path from 'path';
|
|
5
5
|
import { fileURLToPath } from 'url';
|
|
6
|
+
import { promises as fsPromises } from 'fs';
|
|
6
7
|
|
|
7
8
|
let puppeteer = null;
|
|
8
9
|
try {
|
|
@@ -590,6 +591,271 @@ export class BNCASmartScraper {
|
|
|
590
591
|
}
|
|
591
592
|
}
|
|
592
593
|
|
|
594
|
+
/**
|
|
595
|
+
* Take a screenshot of a webpage
|
|
596
|
+
*/
|
|
597
|
+
async screenshot(url, options = {}) {
|
|
598
|
+
const startTime = Date.now();
|
|
599
|
+
const config = { ...this.options, ...options };
|
|
600
|
+
|
|
601
|
+
this.log(`📸 Taking screenshot for: ${url}`);
|
|
602
|
+
|
|
603
|
+
try {
|
|
604
|
+
const screenshot = await this.takeScreenshotWithChrome(url, config);
|
|
605
|
+
|
|
606
|
+
return {
|
|
607
|
+
success: !!screenshot,
|
|
608
|
+
screenshot,
|
|
609
|
+
method: 'chrome-screenshot',
|
|
610
|
+
performance: {
|
|
611
|
+
totalTime: Date.now() - startTime
|
|
612
|
+
}
|
|
613
|
+
};
|
|
614
|
+
} catch (error) {
|
|
615
|
+
return {
|
|
616
|
+
success: false,
|
|
617
|
+
error: error.message,
|
|
618
|
+
method: 'chrome-screenshot',
|
|
619
|
+
performance: {
|
|
620
|
+
totalTime: Date.now() - startTime
|
|
621
|
+
}
|
|
622
|
+
};
|
|
623
|
+
}
|
|
624
|
+
}
|
|
625
|
+
|
|
626
|
+
/**
|
|
627
|
+
* Quick screenshot capture - optimized for speed
|
|
628
|
+
*/
|
|
629
|
+
async quickshot(url, options = {}) {
|
|
630
|
+
const startTime = Date.now();
|
|
631
|
+
const config = {
|
|
632
|
+
...this.options,
|
|
633
|
+
...options,
|
|
634
|
+
timeout: options.timeout || 15000 // Longer timeout for screenshots
|
|
635
|
+
};
|
|
636
|
+
|
|
637
|
+
this.log(`⚡ Taking quick screenshot for: ${url}`);
|
|
638
|
+
|
|
639
|
+
try {
|
|
640
|
+
const screenshot = await this.takeScreenshotOptimized(url, config);
|
|
641
|
+
|
|
642
|
+
return {
|
|
643
|
+
success: !!screenshot,
|
|
644
|
+
screenshot,
|
|
645
|
+
method: 'quickshot',
|
|
646
|
+
performance: {
|
|
647
|
+
totalTime: Date.now() - startTime
|
|
648
|
+
}
|
|
649
|
+
};
|
|
650
|
+
} catch (error) {
|
|
651
|
+
return {
|
|
652
|
+
success: false,
|
|
653
|
+
error: error.message,
|
|
654
|
+
method: 'quickshot',
|
|
655
|
+
performance: {
|
|
656
|
+
totalTime: Date.now() - startTime
|
|
657
|
+
}
|
|
658
|
+
};
|
|
659
|
+
}
|
|
660
|
+
}
|
|
661
|
+
|
|
662
|
+
/**
|
|
663
|
+
* Take screenshot using Chrome CLI
|
|
664
|
+
*/
|
|
665
|
+
async takeScreenshotWithChrome(url, config) {
|
|
666
|
+
const tempFile = path.join('/tmp', `screenshot_${Date.now()}_${Math.random().toString(36).substring(7)}.png`);
|
|
667
|
+
|
|
668
|
+
try {
|
|
669
|
+
const args = [
|
|
670
|
+
'--headless=new',
|
|
671
|
+
'--disable-gpu',
|
|
672
|
+
'--no-sandbox',
|
|
673
|
+
'--disable-setuid-sandbox',
|
|
674
|
+
'--disable-dev-shm-usage',
|
|
675
|
+
'--disable-blink-features=AutomationControlled',
|
|
676
|
+
'--user-agent=' + config.userAgent,
|
|
677
|
+
'--screenshot=' + tempFile,
|
|
678
|
+
'--window-size=1280,800',
|
|
679
|
+
'--hide-scrollbars',
|
|
680
|
+
'--virtual-time-budget=10000',
|
|
681
|
+
url
|
|
682
|
+
];
|
|
683
|
+
|
|
684
|
+
const chromePath = await this.findChromePath();
|
|
685
|
+
if (!chromePath) {
|
|
686
|
+
throw new Error('Chrome/Chromium not found');
|
|
687
|
+
}
|
|
688
|
+
|
|
689
|
+
return new Promise((resolve) => {
|
|
690
|
+
const chrome = spawn(chromePath, args, {
|
|
691
|
+
stdio: ['ignore', 'pipe', 'pipe'],
|
|
692
|
+
detached: false
|
|
693
|
+
});
|
|
694
|
+
|
|
695
|
+
let processExited = false;
|
|
696
|
+
let stderr = '';
|
|
697
|
+
|
|
698
|
+
chrome.stderr.on('data', (data) => {
|
|
699
|
+
stderr += data.toString();
|
|
700
|
+
});
|
|
701
|
+
|
|
702
|
+
const killTimeout = setTimeout(() => {
|
|
703
|
+
if (!processExited) {
|
|
704
|
+
this.log('Chrome timeout, sending SIGTERM...');
|
|
705
|
+
chrome.kill('SIGTERM');
|
|
706
|
+
|
|
707
|
+
setTimeout(() => {
|
|
708
|
+
if (!processExited) {
|
|
709
|
+
chrome.kill('SIGKILL');
|
|
710
|
+
}
|
|
711
|
+
}, 1000);
|
|
712
|
+
}
|
|
713
|
+
}, config.timeout || 15000);
|
|
714
|
+
|
|
715
|
+
chrome.on('exit', async (code, signal) => {
|
|
716
|
+
processExited = true;
|
|
717
|
+
clearTimeout(killTimeout);
|
|
718
|
+
|
|
719
|
+
try {
|
|
720
|
+
await new Promise(r => setTimeout(r, 500));
|
|
721
|
+
const screenshotBuffer = await fsPromises.readFile(tempFile);
|
|
722
|
+
const base64 = screenshotBuffer.toString('base64');
|
|
723
|
+
await fsPromises.unlink(tempFile).catch(() => {});
|
|
724
|
+
resolve(`data:image/png;base64,${base64}`);
|
|
725
|
+
} catch (error) {
|
|
726
|
+
resolve(null);
|
|
727
|
+
}
|
|
728
|
+
});
|
|
729
|
+
|
|
730
|
+
chrome.on('error', (error) => {
|
|
731
|
+
clearTimeout(killTimeout);
|
|
732
|
+
resolve(null);
|
|
733
|
+
});
|
|
734
|
+
});
|
|
735
|
+
} catch (error) {
|
|
736
|
+
return null;
|
|
737
|
+
}
|
|
738
|
+
}
|
|
739
|
+
|
|
740
|
+
/**
|
|
741
|
+
* Optimized screenshot for speed
|
|
742
|
+
*/
|
|
743
|
+
async takeScreenshotOptimized(url, config, retryCount = 0) {
|
|
744
|
+
const tempFile = path.join('/tmp', `screenshot_${Date.now()}_${Math.random().toString(36).substring(7)}.png`);
|
|
745
|
+
|
|
746
|
+
try {
|
|
747
|
+
const virtualTimeBudget = retryCount === 0 ? 5000 : 8000;
|
|
748
|
+
const processTimeout = retryCount === 0 ? 8000 : 12000;
|
|
749
|
+
|
|
750
|
+
const args = [
|
|
751
|
+
'--headless=new',
|
|
752
|
+
'--disable-gpu',
|
|
753
|
+
'--no-sandbox',
|
|
754
|
+
'--disable-setuid-sandbox',
|
|
755
|
+
'--disable-dev-shm-usage',
|
|
756
|
+
'--disable-blink-features=AutomationControlled',
|
|
757
|
+
'--disable-features=TranslateUI',
|
|
758
|
+
'--disable-extensions',
|
|
759
|
+
'--disable-default-apps',
|
|
760
|
+
'--disable-sync',
|
|
761
|
+
'--metrics-recording-only',
|
|
762
|
+
'--mute-audio',
|
|
763
|
+
'--no-first-run',
|
|
764
|
+
'--disable-background-timer-throttling',
|
|
765
|
+
'--disable-backgrounding-occluded-windows',
|
|
766
|
+
'--disable-renderer-backgrounding',
|
|
767
|
+
'--user-agent=' + config.userAgent,
|
|
768
|
+
'--screenshot=' + tempFile,
|
|
769
|
+
'--window-size=1280,800',
|
|
770
|
+
'--hide-scrollbars',
|
|
771
|
+
'--run-all-compositor-stages-before-draw',
|
|
772
|
+
`--virtual-time-budget=${virtualTimeBudget}`,
|
|
773
|
+
url
|
|
774
|
+
];
|
|
775
|
+
|
|
776
|
+
const chromePath = await this.findChromePath();
|
|
777
|
+
if (!chromePath) {
|
|
778
|
+
throw new Error('Chrome/Chromium not found');
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
return new Promise((resolve) => {
|
|
782
|
+
const chrome = spawn(chromePath, args, {
|
|
783
|
+
stdio: ['ignore', 'pipe', 'pipe'],
|
|
784
|
+
detached: false
|
|
785
|
+
});
|
|
786
|
+
|
|
787
|
+
let processExited = false;
|
|
788
|
+
|
|
789
|
+
const killTimeout = setTimeout(() => {
|
|
790
|
+
if (!processExited) {
|
|
791
|
+
chrome.kill('SIGTERM');
|
|
792
|
+
setTimeout(() => {
|
|
793
|
+
if (!processExited) {
|
|
794
|
+
chrome.kill('SIGKILL');
|
|
795
|
+
}
|
|
796
|
+
}, 1000);
|
|
797
|
+
}
|
|
798
|
+
}, processTimeout);
|
|
799
|
+
|
|
800
|
+
chrome.on('exit', async (code, signal) => {
|
|
801
|
+
processExited = true;
|
|
802
|
+
clearTimeout(killTimeout);
|
|
803
|
+
|
|
804
|
+
try {
|
|
805
|
+
await new Promise(r => setTimeout(r, 500));
|
|
806
|
+
const screenshotBuffer = await fsPromises.readFile(tempFile);
|
|
807
|
+
const base64 = screenshotBuffer.toString('base64');
|
|
808
|
+
await fsPromises.unlink(tempFile).catch(() => {});
|
|
809
|
+
resolve(`data:image/png;base64,${base64}`);
|
|
810
|
+
} catch (error) {
|
|
811
|
+
if (retryCount === 0) {
|
|
812
|
+
const retryResult = await this.takeScreenshotOptimized(url, config, 1);
|
|
813
|
+
resolve(retryResult);
|
|
814
|
+
} else {
|
|
815
|
+
resolve(null);
|
|
816
|
+
}
|
|
817
|
+
}
|
|
818
|
+
});
|
|
819
|
+
|
|
820
|
+
chrome.on('error', (error) => {
|
|
821
|
+
clearTimeout(killTimeout);
|
|
822
|
+
resolve(null);
|
|
823
|
+
});
|
|
824
|
+
});
|
|
825
|
+
} catch (error) {
|
|
826
|
+
if (retryCount === 0) {
|
|
827
|
+
return this.takeScreenshotOptimized(url, config, 1);
|
|
828
|
+
}
|
|
829
|
+
return null;
|
|
830
|
+
}
|
|
831
|
+
}
|
|
832
|
+
|
|
833
|
+
/**
|
|
834
|
+
* Find Chrome/Chromium binary path
|
|
835
|
+
*/
|
|
836
|
+
async findChromePath() {
|
|
837
|
+
const chromePaths = process.platform === 'darwin' ? [
|
|
838
|
+
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
|
839
|
+
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
|
840
|
+
] : [
|
|
841
|
+
'/usr/bin/chromium-browser',
|
|
842
|
+
'/usr/bin/chromium',
|
|
843
|
+
'/usr/bin/google-chrome-stable',
|
|
844
|
+
'/usr/bin/google-chrome',
|
|
845
|
+
];
|
|
846
|
+
|
|
847
|
+
for (const path of chromePaths) {
|
|
848
|
+
try {
|
|
849
|
+
await fsPromises.access(path);
|
|
850
|
+
return path;
|
|
851
|
+
} catch (e) {
|
|
852
|
+
continue;
|
|
853
|
+
}
|
|
854
|
+
}
|
|
855
|
+
|
|
856
|
+
return null;
|
|
857
|
+
}
|
|
858
|
+
|
|
593
859
|
/**
|
|
594
860
|
* Health check for all scraping methods
|
|
595
861
|
*/
|
|
@@ -630,7 +896,7 @@ export class BNCASmartScraper {
|
|
|
630
896
|
}
|
|
631
897
|
}
|
|
632
898
|
|
|
633
|
-
// Export convenience function
|
|
899
|
+
// Export convenience functions
|
|
634
900
|
export async function smartScrape(url, options = {}) {
|
|
635
901
|
const scraper = new BNCASmartScraper(options);
|
|
636
902
|
try {
|
|
@@ -643,4 +909,24 @@ export async function smartScrape(url, options = {}) {
|
|
|
643
909
|
}
|
|
644
910
|
}
|
|
645
911
|
|
|
912
|
+
export async function smartScreenshot(url, options = {}) {
|
|
913
|
+
const scraper = new BNCASmartScraper(options);
|
|
914
|
+
try {
|
|
915
|
+
const result = await scraper.screenshot(url, options);
|
|
916
|
+
return result;
|
|
917
|
+
} catch (error) {
|
|
918
|
+
throw error;
|
|
919
|
+
}
|
|
920
|
+
}
|
|
921
|
+
|
|
922
|
+
export async function quickShot(url, options = {}) {
|
|
923
|
+
const scraper = new BNCASmartScraper(options);
|
|
924
|
+
try {
|
|
925
|
+
const result = await scraper.quickshot(url, options);
|
|
926
|
+
return result;
|
|
927
|
+
} catch (error) {
|
|
928
|
+
throw error;
|
|
929
|
+
}
|
|
930
|
+
}
|
|
931
|
+
|
|
646
932
|
export default BNCASmartScraper;
|