@monostate/node-scraper 1.7.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -6,6 +6,7 @@ import path from 'path';
6
6
  import { fileURLToPath } from 'url';
7
7
  import { promises as fsPromises } from 'fs';
8
8
  import pdfParse from 'pdf-parse/lib/pdf-parse.js';
9
+ import browserPool from './browser-pool.js';
9
10
 
10
11
  let puppeteer = null;
11
12
  try {
@@ -666,23 +667,13 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
666
667
  };
667
668
  }
668
669
 
670
+ let browser = null;
671
+ let page = null;
672
+
669
673
  try {
670
- if (!this.browser) {
671
- this.browser = await puppeteer.launch({
672
- headless: true,
673
- args: [
674
- '--no-sandbox',
675
- '--disable-setuid-sandbox',
676
- '--disable-dev-shm-usage',
677
- '--disable-accelerated-2d-canvas',
678
- '--no-first-run',
679
- '--no-zygote',
680
- '--disable-gpu'
681
- ]
682
- });
683
- }
684
-
685
- const page = await this.browser.newPage();
674
+ // Get browser from pool
675
+ browser = await browserPool.getBrowser();
676
+ page = await browser.newPage();
686
677
 
687
678
  // Set user agent and viewport
688
679
  await page.setUserAgent(config.userAgent);
@@ -766,7 +757,6 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
766
757
  };
767
758
  });
768
759
 
769
- await page.close();
770
760
  this.stats.puppeteer.successes++;
771
761
 
772
762
  return {
@@ -782,6 +772,26 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
782
772
  error: `Puppeteer scraping failed: ${errorMsg}`,
783
773
  errorType: this.categorizeError(errorMsg)
784
774
  };
775
+ } finally {
776
+ // Always clean up page
777
+ if (page) {
778
+ try {
779
+ // Check if page is still connected before closing
780
+ if (!page.isClosed()) {
781
+ await page.close();
782
+ }
783
+ } catch (e) {
784
+ // Silently ignore protocol errors when page is already closed
785
+ if (!e.message.includes('Protocol error') && !e.message.includes('Target closed')) {
786
+ console.warn('Error closing page:', e.message);
787
+ }
788
+ }
789
+ }
790
+
791
+ // Release browser back to pool
792
+ if (browser) {
793
+ browserPool.releaseBrowser(browser);
794
+ }
785
795
  }
786
796
  }
787
797
 
@@ -1467,6 +1477,235 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
1467
1477
  timestamp: new Date().toISOString()
1468
1478
  };
1469
1479
  }
1480
+
1481
+ /**
1482
+ * Clean up resources - closes all browser instances
1483
+ */
1484
+ async cleanup() {
1485
+ await browserPool.closeAll();
1486
+ }
1487
+
1488
+ /**
1489
+ * Bulk scrape multiple URLs with optimized concurrency
1490
+ * @param {string[]} urls - Array of URLs to scrape
1491
+ * @param {Object} options - Scraping options
1492
+ * @returns {Promise<Object>} Bulk scraping results
1493
+ */
1494
+ async bulkScrape(urls, options = {}) {
1495
+ const {
1496
+ concurrency = 5,
1497
+ progressCallback = null,
1498
+ continueOnError = true,
1499
+ ...scrapeOptions
1500
+ } = options;
1501
+
1502
+ const results = {
1503
+ success: [],
1504
+ failed: [],
1505
+ total: urls.length,
1506
+ startTime: Date.now(),
1507
+ endTime: null,
1508
+ stats: {
1509
+ successful: 0,
1510
+ failed: 0,
1511
+ totalTime: 0,
1512
+ averageTime: 0,
1513
+ methods: {
1514
+ direct: 0,
1515
+ lightpanda: 0,
1516
+ puppeteer: 0,
1517
+ pdf: 0
1518
+ }
1519
+ }
1520
+ };
1521
+
1522
+ // Process URLs in batches
1523
+ const batches = [];
1524
+ for (let i = 0; i < urls.length; i += concurrency) {
1525
+ batches.push(urls.slice(i, i + concurrency));
1526
+ }
1527
+
1528
+ let processedCount = 0;
1529
+
1530
+ for (const batch of batches) {
1531
+ const batchPromises = batch.map(async (url) => {
1532
+ const startTime = Date.now();
1533
+ try {
1534
+ const result = await this.scrape(url, scrapeOptions);
1535
+ const endTime = Date.now();
1536
+ const duration = endTime - startTime;
1537
+
1538
+ const successResult = {
1539
+ url,
1540
+ ...result,
1541
+ duration,
1542
+ timestamp: new Date(endTime).toISOString()
1543
+ };
1544
+
1545
+ results.success.push(successResult);
1546
+ results.stats.successful++;
1547
+
1548
+ // Track method usage
1549
+ if (result.method) {
1550
+ results.stats.methods[result.method]++;
1551
+ }
1552
+
1553
+ return successResult;
1554
+ } catch (error) {
1555
+ const endTime = Date.now();
1556
+ const duration = endTime - startTime;
1557
+
1558
+ const failedResult = {
1559
+ url,
1560
+ success: false,
1561
+ error: error.message,
1562
+ duration,
1563
+ timestamp: new Date(endTime).toISOString()
1564
+ };
1565
+
1566
+ results.failed.push(failedResult);
1567
+ results.stats.failed++;
1568
+
1569
+ if (!continueOnError) {
1570
+ throw error;
1571
+ }
1572
+
1573
+ return failedResult;
1574
+ } finally {
1575
+ processedCount++;
1576
+ if (progressCallback) {
1577
+ progressCallback({
1578
+ processed: processedCount,
1579
+ total: urls.length,
1580
+ percentage: (processedCount / urls.length) * 100,
1581
+ current: url
1582
+ });
1583
+ }
1584
+ }
1585
+ });
1586
+
1587
+ await Promise.all(batchPromises);
1588
+ }
1589
+
1590
+ results.endTime = Date.now();
1591
+ results.stats.totalTime = results.endTime - results.startTime;
1592
+ results.stats.averageTime = results.stats.totalTime / urls.length;
1593
+
1594
+ return results;
1595
+ }
1596
+
1597
+ /**
1598
+ * Bulk scrape with streaming results
1599
+ * @param {string[]} urls - Array of URLs to scrape
1600
+ * @param {Object} options - Scraping options with onResult callback
1601
+ * @returns {Promise<Object>} Summary statistics
1602
+ */
1603
+ async bulkScrapeStream(urls, options = {}) {
1604
+ const {
1605
+ concurrency = 5,
1606
+ onResult = null,
1607
+ onError = null,
1608
+ progressCallback = null,
1609
+ ...scrapeOptions
1610
+ } = options;
1611
+
1612
+ if (!onResult) {
1613
+ throw new Error('onResult callback is required for streaming bulk scrape');
1614
+ }
1615
+
1616
+ const stats = {
1617
+ total: urls.length,
1618
+ processed: 0,
1619
+ successful: 0,
1620
+ failed: 0,
1621
+ startTime: Date.now(),
1622
+ endTime: null,
1623
+ methods: {
1624
+ direct: 0,
1625
+ lightpanda: 0,
1626
+ puppeteer: 0,
1627
+ pdf: 0
1628
+ }
1629
+ };
1630
+
1631
+ const queue = [...urls];
1632
+ const inProgress = new Set();
1633
+
1634
+ const processNext = async () => {
1635
+ if (queue.length === 0 || inProgress.size >= concurrency) {
1636
+ return;
1637
+ }
1638
+
1639
+ const url = queue.shift();
1640
+ inProgress.add(url);
1641
+
1642
+ const startTime = Date.now();
1643
+ try {
1644
+ const result = await this.scrape(url, scrapeOptions);
1645
+ const duration = Date.now() - startTime;
1646
+
1647
+ stats.successful++;
1648
+ if (result.method) {
1649
+ stats.methods[result.method]++;
1650
+ }
1651
+
1652
+ await onResult({
1653
+ url,
1654
+ ...result,
1655
+ duration,
1656
+ timestamp: new Date().toISOString()
1657
+ });
1658
+ } catch (error) {
1659
+ const duration = Date.now() - startTime;
1660
+ stats.failed++;
1661
+
1662
+ if (onError) {
1663
+ await onError({
1664
+ url,
1665
+ error: error.message,
1666
+ duration,
1667
+ timestamp: new Date().toISOString()
1668
+ });
1669
+ }
1670
+ } finally {
1671
+ inProgress.delete(url);
1672
+ stats.processed++;
1673
+
1674
+ if (progressCallback) {
1675
+ progressCallback({
1676
+ processed: stats.processed,
1677
+ total: stats.total,
1678
+ percentage: (stats.processed / stats.total) * 100,
1679
+ current: url
1680
+ });
1681
+ }
1682
+
1683
+ // Process next URL
1684
+ if (queue.length > 0) {
1685
+ processNext();
1686
+ }
1687
+ }
1688
+ };
1689
+
1690
+ // Start initial batch
1691
+ const initialBatch = Math.min(concurrency, queue.length);
1692
+ const promises = [];
1693
+ for (let i = 0; i < initialBatch; i++) {
1694
+ promises.push(processNext());
1695
+ }
1696
+
1697
+ // Wait for all to complete
1698
+ await Promise.all(promises);
1699
+ while (inProgress.size > 0) {
1700
+ await new Promise(resolve => setTimeout(resolve, 100));
1701
+ }
1702
+
1703
+ stats.endTime = Date.now();
1704
+ stats.totalTime = stats.endTime - stats.startTime;
1705
+ stats.averageTime = stats.totalTime / stats.total;
1706
+
1707
+ return stats;
1708
+ }
1470
1709
  }
1471
1710
 
1472
1711
  // Export convenience functions
@@ -1514,4 +1753,28 @@ export async function askWebsiteAI(url, question, options = {}) {
1514
1753
  }
1515
1754
  }
1516
1755
 
1756
/**
 * Convenience wrapper: scrape many URLs with a throwaway scraper instance.
 *
 * Builds a `BNCASmartScraper` from `options`, runs its `bulkScrape`, and always
 * calls `cleanup()` afterwards, whether the scrape resolved or rejected.
 * The same `options` object is passed to both the constructor and `bulkScrape`.
 *
 * @param {string[]} urls - Array of URLs to scrape
 * @param {Object} [options] - Scraper and bulk-scrape options
 * @returns {Promise<Object>} Whatever `BNCASmartScraper#bulkScrape` resolves with
 */
export async function bulkScrape(urls, options = {}) {
  const scraper = new BNCASmartScraper(options);
  try {
    // `return await` keeps the call inside the try so `finally` runs after it settles.
    return await scraper.bulkScrape(urls, options);
  } finally {
    await scraper.cleanup();
  }
}
1767
+
1768
/**
 * Convenience wrapper: stream-scrape many URLs with a throwaway scraper instance.
 *
 * Builds a `BNCASmartScraper` from `options`, runs its `bulkScrapeStream`, and
 * always calls `cleanup()` afterwards, whether the scrape resolved or rejected.
 * The same `options` object is passed to both the constructor and `bulkScrapeStream`.
 *
 * @param {string[]} urls - Array of URLs to scrape
 * @param {Object} [options] - Scraper and streaming options (callbacks included)
 * @returns {Promise<Object>} Whatever `BNCASmartScraper#bulkScrapeStream` resolves with
 */
export async function bulkScrapeStream(urls, options = {}) {
  const scraper = new BNCASmartScraper(options);
  try {
    // `return await` keeps the call inside the try so `finally` runs after it settles.
    return await scraper.bulkScrapeStream(urls, options);
  } finally {
    await scraper.cleanup();
  }
}
1779
+
1517
1780
  export default BNCASmartScraper;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@monostate/node-scraper",
3
- "version": "1.7.0",
3
+ "version": "1.8.1",
4
4
  "description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -14,7 +14,9 @@
14
14
  "files": [
15
15
  "index.js",
16
16
  "index.d.ts",
17
+ "browser-pool.js",
17
18
  "README.md",
19
+ "BULK_SCRAPING.md",
18
20
  "package.json",
19
21
  "scripts/"
20
22
  ],
@@ -49,7 +51,7 @@
49
51
  "pdf-parse": "^1.1.1"
50
52
  },
51
53
  "peerDependencies": {
52
- "puppeteer": ">=20.0.0"
54
+ "puppeteer": "^24.11.2"
53
55
  },
54
56
  "peerDependenciesMeta": {
55
57
  "puppeteer": {
@@ -75,4 +77,4 @@
75
77
  "publishConfig": {
76
78
  "access": "public"
77
79
  }
78
- }
80
+ }