@monostate/node-scraper 1.6.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -9
- package/index.d.ts +149 -0
- package/index.js +280 -17
- package/package.json +4 -5
- package/bin/lightpanda +0 -0
package/README.md
CHANGED
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
[License](../../LICENSE)
|
|
8
8
|
[Node.js](https://nodejs.org/)
|
|
9
9
|
|
|
10
|
-
##
|
|
10
|
+
## Quick Start
|
|
11
11
|
|
|
12
12
|
### Installation
|
|
13
13
|
|
|
@@ -19,18 +19,24 @@ yarn add @monostate/node-scraper
|
|
|
19
19
|
pnpm add @monostate/node-scraper
|
|
20
20
|
```
|
|
21
21
|
|
|
22
|
-
|
|
22
|
+
**New in v1.8.0**: Bulk scraping with automatic request queueing, progress tracking, and streaming results! Process hundreds of URLs efficiently. Plus critical memory leak fix with browser pooling.
|
|
23
23
|
|
|
24
|
-
|
|
24
|
+
**Fixed in v1.7.0**: Critical cross-platform compatibility fix - binaries are now correctly downloaded per platform instead of being bundled.
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
**New in v1.6.0**: Method override support! Force specific scraping methods with `method` parameter for testing and optimization.
|
|
27
|
+
|
|
28
|
+
**New in v1.5.0**: AI-powered Q&A! Ask questions about any website using OpenRouter, OpenAI, or built-in AI.
|
|
29
|
+
|
|
30
|
+
**Also in v1.3.0**: PDF parsing support added! Automatically extracts text, metadata, and page count from PDF documents.
|
|
31
|
+
|
|
32
|
+
**Also in v1.2.0**: Lightpanda binary is now automatically downloaded and configured during installation! No manual setup required.
|
|
27
33
|
|
|
28
34
|
### Zero-Configuration Setup
|
|
29
35
|
|
|
30
36
|
The package now automatically:
|
|
31
|
-
-
|
|
32
|
-
-
|
|
33
|
-
-
|
|
37
|
+
- Downloads the correct Lightpanda binary for your platform (macOS, Linux, Windows/WSL)
|
|
38
|
+
- Configures binary paths and permissions
|
|
39
|
+
- Validates installation health on first use
|
|
34
40
|
|
|
35
41
|
### Basic Usage
|
|
36
42
|
|
|
@@ -72,6 +78,24 @@ console.log(result.stats); // Performance statistics
|
|
|
72
78
|
await scraper.cleanup(); // Clean up resources
|
|
73
79
|
```
|
|
74
80
|
|
|
81
|
+
### Browser Pool Configuration (New in v1.8.0)
|
|
82
|
+
|
|
83
|
+
The package now includes automatic browser instance pooling to prevent memory leaks:
|
|
84
|
+
|
|
85
|
+
```javascript
|
|
86
|
+
// Browser pool is managed automatically with these defaults:
|
|
87
|
+
// - Max 3 concurrent browser instances
|
|
88
|
+
// - 5 second idle timeout before cleanup
|
|
89
|
+
// - Automatic reuse of browser instances
|
|
90
|
+
|
|
91
|
+
// For heavy workloads, you can manually clean up:
|
|
92
|
+
const scraper = new BNCASmartScraper();
|
|
93
|
+
// ... perform multiple scrapes ...
|
|
94
|
+
await scraper.cleanup(); // Closes all browser instances
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**Important**: The convenience functions (`smartScrape`, `smartScreenshot`, etc.) automatically handle cleanup. You only need to call `cleanup()` when using the `BNCASmartScraper` class directly.
|
|
98
|
+
|
|
75
99
|
### Method Override (New in v1.6.0)
|
|
76
100
|
|
|
77
101
|
Force a specific scraping method instead of using automatic fallback:
|
|
@@ -106,7 +130,69 @@ const result = await smartScrape('https://example.com', { method: 'auto' });
|
|
|
106
130
|
}
|
|
107
131
|
```
|
|
108
132
|
|
|
109
|
-
|
|
133
|
+
### Bulk Scraping (New in v1.8.0)
|
|
134
|
+
|
|
135
|
+
Process multiple URLs efficiently with automatic request queueing and progress tracking:
|
|
136
|
+
|
|
137
|
+
```javascript
|
|
138
|
+
import { bulkScrape } from '@monostate/node-scraper';
|
|
139
|
+
|
|
140
|
+
// Basic bulk scraping
|
|
141
|
+
const urls = [
|
|
142
|
+
'https://example1.com',
|
|
143
|
+
'https://example2.com',
|
|
144
|
+
'https://example3.com',
|
|
145
|
+
// ... hundreds more
|
|
146
|
+
];
|
|
147
|
+
|
|
148
|
+
const results = await bulkScrape(urls, {
|
|
149
|
+
concurrency: 5, // Process 5 URLs at a time
|
|
150
|
+
continueOnError: true, // Don't stop on failures
|
|
151
|
+
progressCallback: (progress) => {
|
|
152
|
+
console.log(`Progress: ${progress.percentage.toFixed(1)}% (${progress.processed}/${progress.total})`);
|
|
153
|
+
}
|
|
154
|
+
});
|
|
155
|
+
|
|
156
|
+
console.log(`Success: ${results.stats.successful}, Failed: ${results.stats.failed}`);
|
|
157
|
+
console.log(`Total time: ${results.stats.totalTime}ms`);
|
|
158
|
+
console.log(`Average time per URL: ${results.stats.averageTime}ms`);
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
#### Streaming Results
|
|
162
|
+
|
|
163
|
+
For large datasets, use streaming to process results as they complete:
|
|
164
|
+
|
|
165
|
+
```javascript
|
|
166
|
+
import { bulkScrapeStream } from '@monostate/node-scraper';
|
|
167
|
+
|
|
168
|
+
await bulkScrapeStream(urls, {
|
|
169
|
+
concurrency: 10,
|
|
170
|
+
onResult: async (result) => {
|
|
171
|
+
// Process each successful result immediately
|
|
172
|
+
await saveToDatabase(result);
|
|
173
|
+
console.log(`✓ ${result.url} - ${result.duration}ms`);
|
|
174
|
+
},
|
|
175
|
+
onError: async (error) => {
|
|
176
|
+
// Handle errors as they occur
|
|
177
|
+
console.error(`✗ ${error.url} - ${error.error}`);
|
|
178
|
+
},
|
|
179
|
+
progressCallback: (progress) => {
|
|
180
|
+
process.stdout.write(`\rProcessing: ${progress.percentage.toFixed(1)}%`);
|
|
181
|
+
}
|
|
182
|
+
});
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
**Features:**
|
|
186
|
+
- Automatic request queueing (no more memory errors!)
|
|
187
|
+
- Configurable concurrency control
|
|
188
|
+
- Real-time progress tracking
|
|
189
|
+
- Continue on error or stop on first failure
|
|
190
|
+
- Detailed statistics and method tracking
|
|
191
|
+
- Browser instance pooling for efficiency
|
|
192
|
+
|
|
193
|
+
For detailed examples and advanced usage, see [BULK_SCRAPING.md](./BULK_SCRAPING.md).
|
|
194
|
+
|
|
195
|
+
## How It Works
|
|
110
196
|
|
|
111
197
|
BNCA uses a sophisticated multi-tier system with intelligent detection:
|
|
112
198
|
|
|
@@ -235,7 +321,7 @@ Clean up resources (close browser instances).
|
|
|
235
321
|
await scraper.cleanup();
|
|
236
322
|
```
|
|
237
323
|
|
|
238
|
-
###
|
|
324
|
+
### AI-Powered Q&A
|
|
239
325
|
|
|
240
326
|
Ask questions about any website and get AI-generated answers:
|
|
241
327
|
|
package/index.d.ts
CHANGED
|
@@ -139,6 +139,118 @@ export interface HealthCheckResult {
|
|
|
139
139
|
timestamp: string;
|
|
140
140
|
}
|
|
141
141
|
|
|
142
|
+
export interface BulkScrapeOptions extends ScrapingOptions {
|
|
143
|
+
/** Number of concurrent requests (default: 5) */
|
|
144
|
+
concurrency?: number;
|
|
145
|
+
/** Progress callback function */
|
|
146
|
+
progressCallback?: (progress: BulkProgress) => void;
|
|
147
|
+
/** Continue processing on error (default: true) */
|
|
148
|
+
continueOnError?: boolean;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
export interface BulkScrapeStreamOptions extends ScrapingOptions {
|
|
152
|
+
/** Number of concurrent requests (default: 5) */
|
|
153
|
+
concurrency?: number;
|
|
154
|
+
/** Callback for each successful result */
|
|
155
|
+
onResult: (result: BulkScrapeResultItem) => void | Promise<void>;
|
|
156
|
+
/** Callback for errors */
|
|
157
|
+
onError?: (error: BulkScrapeErrorItem) => void | Promise<void>;
|
|
158
|
+
/** Progress callback function */
|
|
159
|
+
progressCallback?: (progress: BulkProgress) => void;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
export interface BulkProgress {
|
|
163
|
+
/** Number of URLs processed */
|
|
164
|
+
processed: number;
|
|
165
|
+
/** Total number of URLs */
|
|
166
|
+
total: number;
|
|
167
|
+
/** Percentage complete */
|
|
168
|
+
percentage: number;
|
|
169
|
+
/** Current URL being processed */
|
|
170
|
+
current: string;
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
export interface BulkScrapeResult {
|
|
174
|
+
/** Successfully scraped results */
|
|
175
|
+
success: BulkScrapeResultItem[];
|
|
176
|
+
/** Failed scrapes */
|
|
177
|
+
failed: BulkScrapeErrorItem[];
|
|
178
|
+
/** Total number of URLs */
|
|
179
|
+
total: number;
|
|
180
|
+
/** Start timestamp */
|
|
181
|
+
startTime: number;
|
|
182
|
+
/** End timestamp */
|
|
183
|
+
endTime: number;
|
|
184
|
+
/** Aggregate statistics */
|
|
185
|
+
stats: BulkScrapeStats;
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
export interface BulkScrapeResultItem extends ScrapingResult {
|
|
189
|
+
/** The URL that was scraped */
|
|
190
|
+
url: string;
|
|
191
|
+
/** Time taken in milliseconds */
|
|
192
|
+
duration: number;
|
|
193
|
+
/** Timestamp of completion */
|
|
194
|
+
timestamp: string;
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
export interface BulkScrapeErrorItem {
|
|
198
|
+
/** The URL that failed */
|
|
199
|
+
url: string;
|
|
200
|
+
/** Success is always false for errors */
|
|
201
|
+
success: false;
|
|
202
|
+
/** Error message */
|
|
203
|
+
error: string;
|
|
204
|
+
/** Time taken in milliseconds */
|
|
205
|
+
duration: number;
|
|
206
|
+
/** Timestamp of failure */
|
|
207
|
+
timestamp: string;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
export interface BulkScrapeStats {
|
|
211
|
+
/** Number of successful scrapes */
|
|
212
|
+
successful: number;
|
|
213
|
+
/** Number of failed scrapes */
|
|
214
|
+
failed: number;
|
|
215
|
+
/** Total time taken in milliseconds */
|
|
216
|
+
totalTime: number;
|
|
217
|
+
/** Average time per URL in milliseconds */
|
|
218
|
+
averageTime: number;
|
|
219
|
+
/** Count of methods used */
|
|
220
|
+
methods: {
|
|
221
|
+
direct: number;
|
|
222
|
+
lightpanda: number;
|
|
223
|
+
puppeteer: number;
|
|
224
|
+
pdf: number;
|
|
225
|
+
};
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
export interface BulkScrapeStreamStats {
|
|
229
|
+
/** Total number of URLs */
|
|
230
|
+
total: number;
|
|
231
|
+
/** Number of URLs processed */
|
|
232
|
+
processed: number;
|
|
233
|
+
/** Number of successful scrapes */
|
|
234
|
+
successful: number;
|
|
235
|
+
/** Number of failed scrapes */
|
|
236
|
+
failed: number;
|
|
237
|
+
/** Start timestamp */
|
|
238
|
+
startTime: number;
|
|
239
|
+
/** End timestamp */
|
|
240
|
+
endTime: number;
|
|
241
|
+
/** Total time in milliseconds */
|
|
242
|
+
totalTime: number;
|
|
243
|
+
/** Average time per URL in milliseconds */
|
|
244
|
+
averageTime: number;
|
|
245
|
+
/** Count of methods used */
|
|
246
|
+
methods: {
|
|
247
|
+
direct: number;
|
|
248
|
+
lightpanda: number;
|
|
249
|
+
puppeteer: number;
|
|
250
|
+
pdf: number;
|
|
251
|
+
};
|
|
252
|
+
}
|
|
253
|
+
|
|
142
254
|
/**
|
|
143
255
|
* BNCA Smart Scraper - Intelligent web scraping with multi-level fallback
|
|
144
256
|
*/
|
|
@@ -264,6 +376,27 @@ export class BNCASmartScraper {
|
|
|
264
376
|
* @param message Message to log
|
|
265
377
|
*/
|
|
266
378
|
private log(message: string): void;
|
|
379
|
+
|
|
380
|
+
/**
|
|
381
|
+
* Clean up resources - closes all browser instances
|
|
382
|
+
*/
|
|
383
|
+
cleanup(): Promise<void>;
|
|
384
|
+
|
|
385
|
+
/**
|
|
386
|
+
* Bulk scrape multiple URLs with optimized concurrency
|
|
387
|
+
* @param urls Array of URLs to scrape
|
|
388
|
+
* @param options Bulk scraping options
|
|
389
|
+
* @returns Promise resolving to bulk scraping results
|
|
390
|
+
*/
|
|
391
|
+
bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise<BulkScrapeResult>;
|
|
392
|
+
|
|
393
|
+
/**
|
|
394
|
+
* Bulk scrape with streaming results
|
|
395
|
+
* @param urls Array of URLs to scrape
|
|
396
|
+
* @param options Bulk scraping options with callbacks
|
|
397
|
+
* @returns Promise resolving to summary statistics
|
|
398
|
+
*/
|
|
399
|
+
bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;
|
|
267
400
|
}
|
|
268
401
|
|
|
269
402
|
/**
|
|
@@ -306,6 +439,22 @@ export function askWebsiteAI(url: string, question: string, options?: ScrapingOp
|
|
|
306
439
|
processing?: 'openrouter' | 'openai' | 'backend' | 'local';
|
|
307
440
|
}>;
|
|
308
441
|
|
|
442
|
+
/**
|
|
443
|
+
* Convenience function for bulk scraping multiple URLs
|
|
444
|
+
* @param urls Array of URLs to scrape
|
|
445
|
+
* @param options Bulk scraping options
|
|
446
|
+
* @returns Promise resolving to bulk scraping results
|
|
447
|
+
*/
|
|
448
|
+
export function bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise<BulkScrapeResult>;
|
|
449
|
+
|
|
450
|
+
/**
|
|
451
|
+
* Convenience function for bulk scraping with streaming results
|
|
452
|
+
* @param urls Array of URLs to scrape
|
|
453
|
+
* @param options Bulk scraping options with callbacks
|
|
454
|
+
* @returns Promise resolving to summary statistics
|
|
455
|
+
*/
|
|
456
|
+
export function bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;
|
|
457
|
+
|
|
309
458
|
/**
|
|
310
459
|
* Default export - same as BNCASmartScraper class
|
|
311
460
|
*/
|
package/index.js
CHANGED
|
@@ -6,6 +6,7 @@ import path from 'path';
|
|
|
6
6
|
import { fileURLToPath } from 'url';
|
|
7
7
|
import { promises as fsPromises } from 'fs';
|
|
8
8
|
import pdfParse from 'pdf-parse/lib/pdf-parse.js';
|
|
9
|
+
import browserPool from './browser-pool.js';
|
|
9
10
|
|
|
10
11
|
let puppeteer = null;
|
|
11
12
|
try {
|
|
@@ -666,23 +667,13 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
666
667
|
};
|
|
667
668
|
}
|
|
668
669
|
|
|
670
|
+
let browser = null;
|
|
671
|
+
let page = null;
|
|
672
|
+
|
|
669
673
|
try {
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
args: [
|
|
674
|
-
'--no-sandbox',
|
|
675
|
-
'--disable-setuid-sandbox',
|
|
676
|
-
'--disable-dev-shm-usage',
|
|
677
|
-
'--disable-accelerated-2d-canvas',
|
|
678
|
-
'--no-first-run',
|
|
679
|
-
'--no-zygote',
|
|
680
|
-
'--disable-gpu'
|
|
681
|
-
]
|
|
682
|
-
});
|
|
683
|
-
}
|
|
684
|
-
|
|
685
|
-
const page = await this.browser.newPage();
|
|
674
|
+
// Get browser from pool
|
|
675
|
+
browser = await browserPool.getBrowser();
|
|
676
|
+
page = await browser.newPage();
|
|
686
677
|
|
|
687
678
|
// Set user agent and viewport
|
|
688
679
|
await page.setUserAgent(config.userAgent);
|
|
@@ -766,7 +757,6 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
766
757
|
};
|
|
767
758
|
});
|
|
768
759
|
|
|
769
|
-
await page.close();
|
|
770
760
|
this.stats.puppeteer.successes++;
|
|
771
761
|
|
|
772
762
|
return {
|
|
@@ -782,6 +772,26 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
782
772
|
error: `Puppeteer scraping failed: ${errorMsg}`,
|
|
783
773
|
errorType: this.categorizeError(errorMsg)
|
|
784
774
|
};
|
|
775
|
+
} finally {
|
|
776
|
+
// Always clean up page
|
|
777
|
+
if (page) {
|
|
778
|
+
try {
|
|
779
|
+
// Check if page is still connected before closing
|
|
780
|
+
if (!page.isClosed()) {
|
|
781
|
+
await page.close();
|
|
782
|
+
}
|
|
783
|
+
} catch (e) {
|
|
784
|
+
// Silently ignore protocol errors when page is already closed
|
|
785
|
+
if (!e.message.includes('Protocol error') && !e.message.includes('Target closed')) {
|
|
786
|
+
console.warn('Error closing page:', e.message);
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
|
|
791
|
+
// Release browser back to pool
|
|
792
|
+
if (browser) {
|
|
793
|
+
browserPool.releaseBrowser(browser);
|
|
794
|
+
}
|
|
785
795
|
}
|
|
786
796
|
}
|
|
787
797
|
|
|
@@ -1467,6 +1477,235 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
1467
1477
|
timestamp: new Date().toISOString()
|
|
1468
1478
|
};
|
|
1469
1479
|
}
|
|
1480
|
+
|
|
1481
|
+
/**
|
|
1482
|
+
* Clean up resources - closes all browser instances
|
|
1483
|
+
*/
|
|
1484
|
+
async cleanup() {
|
|
1485
|
+
await browserPool.closeAll();
|
|
1486
|
+
}
|
|
1487
|
+
|
|
1488
|
+
/**
|
|
1489
|
+
* Bulk scrape multiple URLs with optimized concurrency
|
|
1490
|
+
* @param {string[]} urls - Array of URLs to scrape
|
|
1491
|
+
* @param {Object} options - Scraping options
|
|
1492
|
+
* @returns {Promise<Object>} Bulk scraping results
|
|
1493
|
+
*/
|
|
1494
|
+
async bulkScrape(urls, options = {}) {
|
|
1495
|
+
const {
|
|
1496
|
+
concurrency = 5,
|
|
1497
|
+
progressCallback = null,
|
|
1498
|
+
continueOnError = true,
|
|
1499
|
+
...scrapeOptions
|
|
1500
|
+
} = options;
|
|
1501
|
+
|
|
1502
|
+
const results = {
|
|
1503
|
+
success: [],
|
|
1504
|
+
failed: [],
|
|
1505
|
+
total: urls.length,
|
|
1506
|
+
startTime: Date.now(),
|
|
1507
|
+
endTime: null,
|
|
1508
|
+
stats: {
|
|
1509
|
+
successful: 0,
|
|
1510
|
+
failed: 0,
|
|
1511
|
+
totalTime: 0,
|
|
1512
|
+
averageTime: 0,
|
|
1513
|
+
methods: {
|
|
1514
|
+
direct: 0,
|
|
1515
|
+
lightpanda: 0,
|
|
1516
|
+
puppeteer: 0,
|
|
1517
|
+
pdf: 0
|
|
1518
|
+
}
|
|
1519
|
+
}
|
|
1520
|
+
};
|
|
1521
|
+
|
|
1522
|
+
// Process URLs in batches
|
|
1523
|
+
const batches = [];
|
|
1524
|
+
for (let i = 0; i < urls.length; i += concurrency) {
|
|
1525
|
+
batches.push(urls.slice(i, i + concurrency));
|
|
1526
|
+
}
|
|
1527
|
+
|
|
1528
|
+
let processedCount = 0;
|
|
1529
|
+
|
|
1530
|
+
for (const batch of batches) {
|
|
1531
|
+
const batchPromises = batch.map(async (url) => {
|
|
1532
|
+
const startTime = Date.now();
|
|
1533
|
+
try {
|
|
1534
|
+
const result = await this.scrape(url, scrapeOptions);
|
|
1535
|
+
const endTime = Date.now();
|
|
1536
|
+
const duration = endTime - startTime;
|
|
1537
|
+
|
|
1538
|
+
const successResult = {
|
|
1539
|
+
url,
|
|
1540
|
+
...result,
|
|
1541
|
+
duration,
|
|
1542
|
+
timestamp: new Date(endTime).toISOString()
|
|
1543
|
+
};
|
|
1544
|
+
|
|
1545
|
+
results.success.push(successResult);
|
|
1546
|
+
results.stats.successful++;
|
|
1547
|
+
|
|
1548
|
+
// Track method usage
|
|
1549
|
+
if (result.method) {
|
|
1550
|
+
results.stats.methods[result.method]++;
|
|
1551
|
+
}
|
|
1552
|
+
|
|
1553
|
+
return successResult;
|
|
1554
|
+
} catch (error) {
|
|
1555
|
+
const endTime = Date.now();
|
|
1556
|
+
const duration = endTime - startTime;
|
|
1557
|
+
|
|
1558
|
+
const failedResult = {
|
|
1559
|
+
url,
|
|
1560
|
+
success: false,
|
|
1561
|
+
error: error.message,
|
|
1562
|
+
duration,
|
|
1563
|
+
timestamp: new Date(endTime).toISOString()
|
|
1564
|
+
};
|
|
1565
|
+
|
|
1566
|
+
results.failed.push(failedResult);
|
|
1567
|
+
results.stats.failed++;
|
|
1568
|
+
|
|
1569
|
+
if (!continueOnError) {
|
|
1570
|
+
throw error;
|
|
1571
|
+
}
|
|
1572
|
+
|
|
1573
|
+
return failedResult;
|
|
1574
|
+
} finally {
|
|
1575
|
+
processedCount++;
|
|
1576
|
+
if (progressCallback) {
|
|
1577
|
+
progressCallback({
|
|
1578
|
+
processed: processedCount,
|
|
1579
|
+
total: urls.length,
|
|
1580
|
+
percentage: (processedCount / urls.length) * 100,
|
|
1581
|
+
current: url
|
|
1582
|
+
});
|
|
1583
|
+
}
|
|
1584
|
+
}
|
|
1585
|
+
});
|
|
1586
|
+
|
|
1587
|
+
await Promise.all(batchPromises);
|
|
1588
|
+
}
|
|
1589
|
+
|
|
1590
|
+
results.endTime = Date.now();
|
|
1591
|
+
results.stats.totalTime = results.endTime - results.startTime;
|
|
1592
|
+
results.stats.averageTime = results.stats.totalTime / urls.length;
|
|
1593
|
+
|
|
1594
|
+
return results;
|
|
1595
|
+
}
|
|
1596
|
+
|
|
1597
|
+
/**
|
|
1598
|
+
* Bulk scrape with streaming results
|
|
1599
|
+
* @param {string[]} urls - Array of URLs to scrape
|
|
1600
|
+
* @param {Object} options - Scraping options with onResult callback
|
|
1601
|
+
* @returns {Promise<Object>} Summary statistics
|
|
1602
|
+
*/
|
|
1603
|
+
async bulkScrapeStream(urls, options = {}) {
|
|
1604
|
+
const {
|
|
1605
|
+
concurrency = 5,
|
|
1606
|
+
onResult = null,
|
|
1607
|
+
onError = null,
|
|
1608
|
+
progressCallback = null,
|
|
1609
|
+
...scrapeOptions
|
|
1610
|
+
} = options;
|
|
1611
|
+
|
|
1612
|
+
if (!onResult) {
|
|
1613
|
+
throw new Error('onResult callback is required for streaming bulk scrape');
|
|
1614
|
+
}
|
|
1615
|
+
|
|
1616
|
+
const stats = {
|
|
1617
|
+
total: urls.length,
|
|
1618
|
+
processed: 0,
|
|
1619
|
+
successful: 0,
|
|
1620
|
+
failed: 0,
|
|
1621
|
+
startTime: Date.now(),
|
|
1622
|
+
endTime: null,
|
|
1623
|
+
methods: {
|
|
1624
|
+
direct: 0,
|
|
1625
|
+
lightpanda: 0,
|
|
1626
|
+
puppeteer: 0,
|
|
1627
|
+
pdf: 0
|
|
1628
|
+
}
|
|
1629
|
+
};
|
|
1630
|
+
|
|
1631
|
+
const queue = [...urls];
|
|
1632
|
+
const inProgress = new Set();
|
|
1633
|
+
|
|
1634
|
+
const processNext = async () => {
|
|
1635
|
+
if (queue.length === 0 || inProgress.size >= concurrency) {
|
|
1636
|
+
return;
|
|
1637
|
+
}
|
|
1638
|
+
|
|
1639
|
+
const url = queue.shift();
|
|
1640
|
+
inProgress.add(url);
|
|
1641
|
+
|
|
1642
|
+
const startTime = Date.now();
|
|
1643
|
+
try {
|
|
1644
|
+
const result = await this.scrape(url, scrapeOptions);
|
|
1645
|
+
const duration = Date.now() - startTime;
|
|
1646
|
+
|
|
1647
|
+
stats.successful++;
|
|
1648
|
+
if (result.method) {
|
|
1649
|
+
stats.methods[result.method]++;
|
|
1650
|
+
}
|
|
1651
|
+
|
|
1652
|
+
await onResult({
|
|
1653
|
+
url,
|
|
1654
|
+
...result,
|
|
1655
|
+
duration,
|
|
1656
|
+
timestamp: new Date().toISOString()
|
|
1657
|
+
});
|
|
1658
|
+
} catch (error) {
|
|
1659
|
+
const duration = Date.now() - startTime;
|
|
1660
|
+
stats.failed++;
|
|
1661
|
+
|
|
1662
|
+
if (onError) {
|
|
1663
|
+
await onError({
|
|
1664
|
+
url,
|
|
1665
|
+
error: error.message,
|
|
1666
|
+
duration,
|
|
1667
|
+
timestamp: new Date().toISOString()
|
|
1668
|
+
});
|
|
1669
|
+
}
|
|
1670
|
+
} finally {
|
|
1671
|
+
inProgress.delete(url);
|
|
1672
|
+
stats.processed++;
|
|
1673
|
+
|
|
1674
|
+
if (progressCallback) {
|
|
1675
|
+
progressCallback({
|
|
1676
|
+
processed: stats.processed,
|
|
1677
|
+
total: stats.total,
|
|
1678
|
+
percentage: (stats.processed / stats.total) * 100,
|
|
1679
|
+
current: url
|
|
1680
|
+
});
|
|
1681
|
+
}
|
|
1682
|
+
|
|
1683
|
+
// Process next URL
|
|
1684
|
+
if (queue.length > 0) {
|
|
1685
|
+
processNext();
|
|
1686
|
+
}
|
|
1687
|
+
}
|
|
1688
|
+
};
|
|
1689
|
+
|
|
1690
|
+
// Start initial batch
|
|
1691
|
+
const initialBatch = Math.min(concurrency, queue.length);
|
|
1692
|
+
const promises = [];
|
|
1693
|
+
for (let i = 0; i < initialBatch; i++) {
|
|
1694
|
+
promises.push(processNext());
|
|
1695
|
+
}
|
|
1696
|
+
|
|
1697
|
+
// Wait for all to complete
|
|
1698
|
+
await Promise.all(promises);
|
|
1699
|
+
while (inProgress.size > 0) {
|
|
1700
|
+
await new Promise(resolve => setTimeout(resolve, 100));
|
|
1701
|
+
}
|
|
1702
|
+
|
|
1703
|
+
stats.endTime = Date.now();
|
|
1704
|
+
stats.totalTime = stats.endTime - stats.startTime;
|
|
1705
|
+
stats.averageTime = stats.totalTime / stats.total;
|
|
1706
|
+
|
|
1707
|
+
return stats;
|
|
1708
|
+
}
|
|
1470
1709
|
}
|
|
1471
1710
|
|
|
1472
1711
|
// Export convenience functions
|
|
@@ -1514,4 +1753,28 @@ export async function askWebsiteAI(url, question, options = {}) {
|
|
|
1514
1753
|
}
|
|
1515
1754
|
}
|
|
1516
1755
|
|
|
1756
|
+
export async function bulkScrape(urls, options = {}) {
|
|
1757
|
+
const scraper = new BNCASmartScraper(options);
|
|
1758
|
+
try {
|
|
1759
|
+
const result = await scraper.bulkScrape(urls, options);
|
|
1760
|
+
return result;
|
|
1761
|
+
} catch (error) {
|
|
1762
|
+
throw error;
|
|
1763
|
+
} finally {
|
|
1764
|
+
await scraper.cleanup();
|
|
1765
|
+
}
|
|
1766
|
+
}
|
|
1767
|
+
|
|
1768
|
+
export async function bulkScrapeStream(urls, options = {}) {
|
|
1769
|
+
const scraper = new BNCASmartScraper(options);
|
|
1770
|
+
try {
|
|
1771
|
+
const result = await scraper.bulkScrapeStream(urls, options);
|
|
1772
|
+
return result;
|
|
1773
|
+
} catch (error) {
|
|
1774
|
+
throw error;
|
|
1775
|
+
} finally {
|
|
1776
|
+
await scraper.cleanup();
|
|
1777
|
+
}
|
|
1778
|
+
}
|
|
1779
|
+
|
|
1517
1780
|
export default BNCASmartScraper;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@monostate/node-scraper",
|
|
3
|
-
"version": "1.6.0",
|
|
3
|
+
"version": "1.8.0",
|
|
4
4
|
"description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -16,8 +16,7 @@
|
|
|
16
16
|
"index.d.ts",
|
|
17
17
|
"README.md",
|
|
18
18
|
"package.json",
|
|
19
|
-
"scripts/"
|
|
20
|
-
"bin/"
|
|
19
|
+
"scripts/"
|
|
21
20
|
],
|
|
22
21
|
"scripts": {
|
|
23
22
|
"postinstall": "node scripts/install-lightpanda.js"
|
|
@@ -50,7 +49,7 @@
|
|
|
50
49
|
"pdf-parse": "^1.1.1"
|
|
51
50
|
},
|
|
52
51
|
"peerDependencies": {
|
|
53
|
-
"puppeteer": "
|
|
52
|
+
"puppeteer": "^24.11.2"
|
|
54
53
|
},
|
|
55
54
|
"peerDependenciesMeta": {
|
|
56
55
|
"puppeteer": {
|
|
@@ -76,4 +75,4 @@
|
|
|
76
75
|
"publishConfig": {
|
|
77
76
|
"access": "public"
|
|
78
77
|
}
|
|
79
|
-
}
|
|
78
|
+
}
|
package/bin/lightpanda
DELETED
|
Binary file
|