@monostate/node-scraper 1.7.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. package/README.md +82 -0
  2. package/index.d.ts +149 -0
  3. package/index.js +280 -17
  4. package/package.json +3 -3
package/README.md CHANGED
@@ -19,6 +19,8 @@ yarn add @monostate/node-scraper
19
19
  pnpm add @monostate/node-scraper
20
20
  ```
21
21
 
22
+ **New in v1.8.0**: Bulk scraping with automatic request queueing, progress tracking, and streaming results! Process hundreds of URLs efficiently. Plus critical memory leak fix with browser pooling.
23
+
22
24
  **Fixed in v1.7.0**: Critical cross-platform compatibility fix - binaries are now correctly downloaded per platform instead of being bundled.
23
25
 
24
26
  **New in v1.6.0**: Method override support! Force specific scraping methods with `method` parameter for testing and optimization.
@@ -76,6 +78,24 @@ console.log(result.stats); // Performance statistics
76
78
  await scraper.cleanup(); // Clean up resources
77
79
  ```
78
80
 
81
+ ### Browser Pool Configuration (New in v1.8.0)
82
+
83
+ The package now includes automatic browser instance pooling to prevent memory leaks:
84
+
85
+ ```javascript
86
+ // Browser pool is managed automatically with these defaults:
87
+ // - Max 3 concurrent browser instances
88
+ // - 5 second idle timeout before cleanup
89
+ // - Automatic reuse of browser instances
90
+
91
+ // For heavy workloads, you can manually clean up:
92
+ const scraper = new BNCASmartScraper();
93
+ // ... perform multiple scrapes ...
94
+ await scraper.cleanup(); // Closes all browser instances
95
+ ```
96
+
97
+ **Important**: The convenience functions (`smartScrape`, `smartScreenshot`, etc.) automatically handle cleanup. You only need to call `cleanup()` when using the `BNCASmartScraper` class directly.
98
+
79
99
  ### Method Override (New in v1.6.0)
80
100
 
81
101
  Force a specific scraping method instead of using automatic fallback:
@@ -110,6 +130,68 @@ const result = await smartScrape('https://example.com', { method: 'auto' });
110
130
  }
111
131
  ```
112
132
 
133
+ ### Bulk Scraping (New in v1.8.0)
134
+
135
+ Process multiple URLs efficiently with automatic request queueing and progress tracking:
136
+
137
+ ```javascript
138
+ import { bulkScrape } from '@monostate/node-scraper';
139
+
140
+ // Basic bulk scraping
141
+ const urls = [
142
+ 'https://example1.com',
143
+ 'https://example2.com',
144
+ 'https://example3.com',
145
+ // ... hundreds more
146
+ ];
147
+
148
+ const results = await bulkScrape(urls, {
149
+ concurrency: 5, // Process 5 URLs at a time
150
+ continueOnError: true, // Don't stop on failures
151
+ progressCallback: (progress) => {
152
+ console.log(`Progress: ${progress.percentage.toFixed(1)}% (${progress.processed}/${progress.total})`);
153
+ }
154
+ });
155
+
156
+ console.log(`Success: ${results.stats.successful}, Failed: ${results.stats.failed}`);
157
+ console.log(`Total time: ${results.stats.totalTime}ms`);
158
+ console.log(`Average time per URL: ${results.stats.averageTime}ms`);
159
+ ```
160
+
161
+ #### Streaming Results
162
+
163
+ For large datasets, use streaming to process results as they complete:
164
+
165
+ ```javascript
166
+ import { bulkScrapeStream } from '@monostate/node-scraper';
167
+
168
+ await bulkScrapeStream(urls, {
169
+ concurrency: 10,
170
+ onResult: async (result) => {
171
+ // Process each successful result immediately
172
+ await saveToDatabase(result);
173
+ console.log(`✓ ${result.url} - ${result.duration}ms`);
174
+ },
175
+ onError: async (error) => {
176
+ // Handle errors as they occur
177
+ console.error(`✗ ${error.url} - ${error.error}`);
178
+ },
179
+ progressCallback: (progress) => {
180
+ process.stdout.write(`\rProcessing: ${progress.percentage.toFixed(1)}%`);
181
+ }
182
+ });
183
+ ```
184
+
185
+ **Features:**
186
+ - Automatic request queueing (no more memory errors!)
187
+ - Configurable concurrency control
188
+ - Real-time progress tracking
189
+ - Continue on error or stop on first failure
190
+ - Detailed statistics and method tracking
191
+ - Browser instance pooling for efficiency
192
+
193
+ For detailed examples and advanced usage, see [BULK_SCRAPING.md](./BULK_SCRAPING.md).
194
+
113
195
  ## How It Works
114
196
 
115
197
  BNCA uses a sophisticated multi-tier system with intelligent detection:
package/index.d.ts CHANGED
@@ -139,6 +139,118 @@ export interface HealthCheckResult {
139
139
  timestamp: string;
140
140
  }
141
141
 
142
+ export interface BulkScrapeOptions extends ScrapingOptions {
143
+ /** Number of concurrent requests (default: 5) */
144
+ concurrency?: number;
145
+ /** Progress callback function */
146
+ progressCallback?: (progress: BulkProgress) => void;
147
+ /** Continue processing on error (default: true) */
148
+ continueOnError?: boolean;
149
+ }
150
+
151
+ export interface BulkScrapeStreamOptions extends ScrapingOptions {
152
+ /** Number of concurrent requests (default: 5) */
153
+ concurrency?: number;
154
+ /** Callback for each successful result */
155
+ onResult: (result: BulkScrapeResultItem) => void | Promise<void>;
156
+ /** Callback for errors */
157
+ onError?: (error: BulkScrapeErrorItem) => void | Promise<void>;
158
+ /** Progress callback function */
159
+ progressCallback?: (progress: BulkProgress) => void;
160
+ }
161
+
162
+ export interface BulkProgress {
163
+ /** Number of URLs processed */
164
+ processed: number;
165
+ /** Total number of URLs */
166
+ total: number;
167
+ /** Percentage complete */
168
+ percentage: number;
169
+ /** Current URL being processed */
170
+ current: string;
171
+ }
172
+
173
+ export interface BulkScrapeResult {
174
+ /** Successfully scraped results */
175
+ success: BulkScrapeResultItem[];
176
+ /** Failed scrapes */
177
+ failed: BulkScrapeErrorItem[];
178
+ /** Total number of URLs */
179
+ total: number;
180
+ /** Start timestamp */
181
+ startTime: number;
182
+ /** End timestamp */
183
+ endTime: number;
184
+ /** Aggregate statistics */
185
+ stats: BulkScrapeStats;
186
+ }
187
+
188
+ export interface BulkScrapeResultItem extends ScrapingResult {
189
+ /** The URL that was scraped */
190
+ url: string;
191
+ /** Time taken in milliseconds */
192
+ duration: number;
193
+ /** Timestamp of completion */
194
+ timestamp: string;
195
+ }
196
+
197
+ export interface BulkScrapeErrorItem {
198
+ /** The URL that failed */
199
+ url: string;
200
+ /** Success is always false for errors */
201
+ success: false;
202
+ /** Error message */
203
+ error: string;
204
+ /** Time taken in milliseconds */
205
+ duration: number;
206
+ /** Timestamp of failure */
207
+ timestamp: string;
208
+ }
209
+
210
+ export interface BulkScrapeStats {
211
+ /** Number of successful scrapes */
212
+ successful: number;
213
+ /** Number of failed scrapes */
214
+ failed: number;
215
+ /** Total time taken in milliseconds */
216
+ totalTime: number;
217
+ /** Average time per URL in milliseconds */
218
+ averageTime: number;
219
+ /** Count of methods used */
220
+ methods: {
221
+ direct: number;
222
+ lightpanda: number;
223
+ puppeteer: number;
224
+ pdf: number;
225
+ };
226
+ }
227
+
228
+ export interface BulkScrapeStreamStats {
229
+ /** Total number of URLs */
230
+ total: number;
231
+ /** Number of URLs processed */
232
+ processed: number;
233
+ /** Number of successful scrapes */
234
+ successful: number;
235
+ /** Number of failed scrapes */
236
+ failed: number;
237
+ /** Start timestamp */
238
+ startTime: number;
239
+ /** End timestamp */
240
+ endTime: number;
241
+ /** Total time in milliseconds */
242
+ totalTime: number;
243
+ /** Average time per URL in milliseconds */
244
+ averageTime: number;
245
+ /** Count of methods used */
246
+ methods: {
247
+ direct: number;
248
+ lightpanda: number;
249
+ puppeteer: number;
250
+ pdf: number;
251
+ };
252
+ }
253
+
142
254
  /**
143
255
  * BNCA Smart Scraper - Intelligent web scraping with multi-level fallback
144
256
  */
@@ -264,6 +376,27 @@ export class BNCASmartScraper {
264
376
  * @param message Message to log
265
377
  */
266
378
  private log(message: string): void;
379
+
380
+ /**
381
+ * Clean up resources - closes all browser instances
382
+ */
383
+ cleanup(): Promise<void>;
384
+
385
+ /**
386
+ * Bulk scrape multiple URLs with optimized concurrency
387
+ * @param urls Array of URLs to scrape
388
+ * @param options Bulk scraping options
389
+ * @returns Promise resolving to bulk scraping results
390
+ */
391
+ bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise<BulkScrapeResult>;
392
+
393
+ /**
394
+ * Bulk scrape with streaming results
395
+ * @param urls Array of URLs to scrape
396
+ * @param options Bulk scraping options with callbacks
397
+ * @returns Promise resolving to summary statistics
398
+ */
399
+ bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;
267
400
  }
268
401
 
269
402
  /**
@@ -306,6 +439,22 @@ export function askWebsiteAI(url: string, question: string, options?: ScrapingOp
306
439
  processing?: 'openrouter' | 'openai' | 'backend' | 'local';
307
440
  }>;
308
441
 
442
+ /**
443
+ * Convenience function for bulk scraping multiple URLs
444
+ * @param urls Array of URLs to scrape
445
+ * @param options Bulk scraping options
446
+ * @returns Promise resolving to bulk scraping results
447
+ */
448
+ export function bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise<BulkScrapeResult>;
449
+
450
+ /**
451
+ * Convenience function for bulk scraping with streaming results
452
+ * @param urls Array of URLs to scrape
453
+ * @param options Bulk scraping options with callbacks
454
+ * @returns Promise resolving to summary statistics
455
+ */
456
+ export function bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;
457
+
309
458
  /**
310
459
  * Default export - same as BNCASmartScraper class
311
460
  */
package/index.js CHANGED
@@ -6,6 +6,7 @@ import path from 'path';
6
6
  import { fileURLToPath } from 'url';
7
7
  import { promises as fsPromises } from 'fs';
8
8
  import pdfParse from 'pdf-parse/lib/pdf-parse.js';
9
+ import browserPool from './browser-pool.js';
9
10
 
10
11
  let puppeteer = null;
11
12
  try {
@@ -666,23 +667,13 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
666
667
  };
667
668
  }
668
669
 
670
+ let browser = null;
671
+ let page = null;
672
+
669
673
  try {
670
- if (!this.browser) {
671
- this.browser = await puppeteer.launch({
672
- headless: true,
673
- args: [
674
- '--no-sandbox',
675
- '--disable-setuid-sandbox',
676
- '--disable-dev-shm-usage',
677
- '--disable-accelerated-2d-canvas',
678
- '--no-first-run',
679
- '--no-zygote',
680
- '--disable-gpu'
681
- ]
682
- });
683
- }
684
-
685
- const page = await this.browser.newPage();
674
+ // Get browser from pool
675
+ browser = await browserPool.getBrowser();
676
+ page = await browser.newPage();
686
677
 
687
678
  // Set user agent and viewport
688
679
  await page.setUserAgent(config.userAgent);
@@ -766,7 +757,6 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
766
757
  };
767
758
  });
768
759
 
769
- await page.close();
770
760
  this.stats.puppeteer.successes++;
771
761
 
772
762
  return {
@@ -782,6 +772,26 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
782
772
  error: `Puppeteer scraping failed: ${errorMsg}`,
783
773
  errorType: this.categorizeError(errorMsg)
784
774
  };
775
+ } finally {
776
+ // Always clean up page
777
+ if (page) {
778
+ try {
779
+ // Check if page is still connected before closing
780
+ if (!page.isClosed()) {
781
+ await page.close();
782
+ }
783
+ } catch (e) {
784
+ // Silently ignore protocol errors when page is already closed
785
+ if (!e.message.includes('Protocol error') && !e.message.includes('Target closed')) {
786
+ console.warn('Error closing page:', e.message);
787
+ }
788
+ }
789
+ }
790
+
791
+ // Release browser back to pool
792
+ if (browser) {
793
+ browserPool.releaseBrowser(browser);
794
+ }
785
795
  }
786
796
  }
787
797
 
@@ -1467,6 +1477,235 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
1467
1477
  timestamp: new Date().toISOString()
1468
1478
  };
1469
1479
  }
1480
+
1481
+ /**
1482
+ * Clean up resources - closes all browser instances
1483
+ */
1484
+ async cleanup() {
1485
+ await browserPool.closeAll();
1486
+ }
1487
+
1488
+ /**
1489
+ * Bulk scrape multiple URLs with optimized concurrency
1490
+ * @param {string[]} urls - Array of URLs to scrape
1491
+ * @param {Object} options - Scraping options
1492
+ * @returns {Promise<Object>} Bulk scraping results
1493
+ */
1494
+ async bulkScrape(urls, options = {}) {
1495
+ const {
1496
+ concurrency = 5,
1497
+ progressCallback = null,
1498
+ continueOnError = true,
1499
+ ...scrapeOptions
1500
+ } = options;
1501
+
1502
+ const results = {
1503
+ success: [],
1504
+ failed: [],
1505
+ total: urls.length,
1506
+ startTime: Date.now(),
1507
+ endTime: null,
1508
+ stats: {
1509
+ successful: 0,
1510
+ failed: 0,
1511
+ totalTime: 0,
1512
+ averageTime: 0,
1513
+ methods: {
1514
+ direct: 0,
1515
+ lightpanda: 0,
1516
+ puppeteer: 0,
1517
+ pdf: 0
1518
+ }
1519
+ }
1520
+ };
1521
+
1522
+ // Process URLs in batches
1523
+ const batches = [];
1524
+ for (let i = 0; i < urls.length; i += concurrency) {
1525
+ batches.push(urls.slice(i, i + concurrency));
1526
+ }
1527
+
1528
+ let processedCount = 0;
1529
+
1530
+ for (const batch of batches) {
1531
+ const batchPromises = batch.map(async (url) => {
1532
+ const startTime = Date.now();
1533
+ try {
1534
+ const result = await this.scrape(url, scrapeOptions);
1535
+ const endTime = Date.now();
1536
+ const duration = endTime - startTime;
1537
+
1538
+ const successResult = {
1539
+ url,
1540
+ ...result,
1541
+ duration,
1542
+ timestamp: new Date(endTime).toISOString()
1543
+ };
1544
+
1545
+ results.success.push(successResult);
1546
+ results.stats.successful++;
1547
+
1548
+ // Track method usage
1549
+ if (result.method) {
1550
+ results.stats.methods[result.method]++;
1551
+ }
1552
+
1553
+ return successResult;
1554
+ } catch (error) {
1555
+ const endTime = Date.now();
1556
+ const duration = endTime - startTime;
1557
+
1558
+ const failedResult = {
1559
+ url,
1560
+ success: false,
1561
+ error: error.message,
1562
+ duration,
1563
+ timestamp: new Date(endTime).toISOString()
1564
+ };
1565
+
1566
+ results.failed.push(failedResult);
1567
+ results.stats.failed++;
1568
+
1569
+ if (!continueOnError) {
1570
+ throw error;
1571
+ }
1572
+
1573
+ return failedResult;
1574
+ } finally {
1575
+ processedCount++;
1576
+ if (progressCallback) {
1577
+ progressCallback({
1578
+ processed: processedCount,
1579
+ total: urls.length,
1580
+ percentage: (processedCount / urls.length) * 100,
1581
+ current: url
1582
+ });
1583
+ }
1584
+ }
1585
+ });
1586
+
1587
+ await Promise.all(batchPromises);
1588
+ }
1589
+
1590
+ results.endTime = Date.now();
1591
+ results.stats.totalTime = results.endTime - results.startTime;
1592
+ results.stats.averageTime = results.stats.totalTime / urls.length;
1593
+
1594
+ return results;
1595
+ }
1596
+
1597
+ /**
1598
+ * Bulk scrape with streaming results
1599
+ * @param {string[]} urls - Array of URLs to scrape
1600
+ * @param {Object} options - Scraping options with onResult callback
1601
+ * @returns {Promise<Object>} Summary statistics
1602
+ */
1603
+ async bulkScrapeStream(urls, options = {}) {
1604
+ const {
1605
+ concurrency = 5,
1606
+ onResult = null,
1607
+ onError = null,
1608
+ progressCallback = null,
1609
+ ...scrapeOptions
1610
+ } = options;
1611
+
1612
+ if (!onResult) {
1613
+ throw new Error('onResult callback is required for streaming bulk scrape');
1614
+ }
1615
+
1616
+ const stats = {
1617
+ total: urls.length,
1618
+ processed: 0,
1619
+ successful: 0,
1620
+ failed: 0,
1621
+ startTime: Date.now(),
1622
+ endTime: null,
1623
+ methods: {
1624
+ direct: 0,
1625
+ lightpanda: 0,
1626
+ puppeteer: 0,
1627
+ pdf: 0
1628
+ }
1629
+ };
1630
+
1631
+ const queue = [...urls];
1632
+ const inProgress = new Set();
1633
+
1634
+ const processNext = async () => {
1635
+ if (queue.length === 0 || inProgress.size >= concurrency) {
1636
+ return;
1637
+ }
1638
+
1639
+ const url = queue.shift();
1640
+ inProgress.add(url);
1641
+
1642
+ const startTime = Date.now();
1643
+ try {
1644
+ const result = await this.scrape(url, scrapeOptions);
1645
+ const duration = Date.now() - startTime;
1646
+
1647
+ stats.successful++;
1648
+ if (result.method) {
1649
+ stats.methods[result.method]++;
1650
+ }
1651
+
1652
+ await onResult({
1653
+ url,
1654
+ ...result,
1655
+ duration,
1656
+ timestamp: new Date().toISOString()
1657
+ });
1658
+ } catch (error) {
1659
+ const duration = Date.now() - startTime;
1660
+ stats.failed++;
1661
+
1662
+ if (onError) {
1663
+ await onError({
1664
+ url,
1665
+ error: error.message,
1666
+ duration,
1667
+ timestamp: new Date().toISOString()
1668
+ });
1669
+ }
1670
+ } finally {
1671
+ inProgress.delete(url);
1672
+ stats.processed++;
1673
+
1674
+ if (progressCallback) {
1675
+ progressCallback({
1676
+ processed: stats.processed,
1677
+ total: stats.total,
1678
+ percentage: (stats.processed / stats.total) * 100,
1679
+ current: url
1680
+ });
1681
+ }
1682
+
1683
+ // Process next URL
1684
+ if (queue.length > 0) {
1685
+ processNext();
1686
+ }
1687
+ }
1688
+ };
1689
+
1690
+ // Start initial batch
1691
+ const initialBatch = Math.min(concurrency, queue.length);
1692
+ const promises = [];
1693
+ for (let i = 0; i < initialBatch; i++) {
1694
+ promises.push(processNext());
1695
+ }
1696
+
1697
+ // Wait for all to complete
1698
+ await Promise.all(promises);
1699
+ while (inProgress.size > 0) {
1700
+ await new Promise(resolve => setTimeout(resolve, 100));
1701
+ }
1702
+
1703
+ stats.endTime = Date.now();
1704
+ stats.totalTime = stats.endTime - stats.startTime;
1705
+ stats.averageTime = stats.totalTime / stats.total;
1706
+
1707
+ return stats;
1708
+ }
1470
1709
  }
1471
1710
 
1472
1711
  // Export convenience functions
@@ -1514,4 +1753,28 @@ export async function askWebsiteAI(url, question, options = {}) {
1514
1753
  }
1515
1754
  }
1516
1755
 
1756
+ export async function bulkScrape(urls, options = {}) {
1757
+ const scraper = new BNCASmartScraper(options);
1758
+ try {
1759
+ const result = await scraper.bulkScrape(urls, options);
1760
+ return result;
1761
+ } catch (error) {
1762
+ throw error;
1763
+ } finally {
1764
+ await scraper.cleanup();
1765
+ }
1766
+ }
1767
+
1768
+ export async function bulkScrapeStream(urls, options = {}) {
1769
+ const scraper = new BNCASmartScraper(options);
1770
+ try {
1771
+ const result = await scraper.bulkScrapeStream(urls, options);
1772
+ return result;
1773
+ } catch (error) {
1774
+ throw error;
1775
+ } finally {
1776
+ await scraper.cleanup();
1777
+ }
1778
+ }
1779
+
1517
1780
  export default BNCASmartScraper;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@monostate/node-scraper",
3
- "version": "1.7.0",
3
+ "version": "1.8.0",
4
4
  "description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -49,7 +49,7 @@
49
49
  "pdf-parse": "^1.1.1"
50
50
  },
51
51
  "peerDependencies": {
52
- "puppeteer": ">=20.0.0"
52
+ "puppeteer": "^24.11.2"
53
53
  },
54
54
  "peerDependenciesMeta": {
55
55
  "puppeteer": {
@@ -75,4 +75,4 @@
75
75
  "publishConfig": {
76
76
  "access": "public"
77
77
  }
78
- }
78
+ }