@monostate/node-scraper 1.7.0 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/BULK_SCRAPING.md +626 -0
- package/README.md +84 -0
- package/browser-pool.js +229 -0
- package/index.d.ts +149 -0
- package/index.js +280 -17
- package/package.json +5 -3
package/index.js
CHANGED
|
@@ -6,6 +6,7 @@ import path from 'path';
|
|
|
6
6
|
import { fileURLToPath } from 'url';
|
|
7
7
|
import { promises as fsPromises } from 'fs';
|
|
8
8
|
import pdfParse from 'pdf-parse/lib/pdf-parse.js';
|
|
9
|
+
import browserPool from './browser-pool.js';
|
|
9
10
|
|
|
10
11
|
let puppeteer = null;
|
|
11
12
|
try {
|
|
@@ -666,23 +667,13 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
666
667
|
};
|
|
667
668
|
}
|
|
668
669
|
|
|
670
|
+
let browser = null;
|
|
671
|
+
let page = null;
|
|
672
|
+
|
|
669
673
|
try {
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
args: [
|
|
674
|
-
'--no-sandbox',
|
|
675
|
-
'--disable-setuid-sandbox',
|
|
676
|
-
'--disable-dev-shm-usage',
|
|
677
|
-
'--disable-accelerated-2d-canvas',
|
|
678
|
-
'--no-first-run',
|
|
679
|
-
'--no-zygote',
|
|
680
|
-
'--disable-gpu'
|
|
681
|
-
]
|
|
682
|
-
});
|
|
683
|
-
}
|
|
684
|
-
|
|
685
|
-
const page = await this.browser.newPage();
|
|
674
|
+
// Get browser from pool
|
|
675
|
+
browser = await browserPool.getBrowser();
|
|
676
|
+
page = await browser.newPage();
|
|
686
677
|
|
|
687
678
|
// Set user agent and viewport
|
|
688
679
|
await page.setUserAgent(config.userAgent);
|
|
@@ -766,7 +757,6 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
766
757
|
};
|
|
767
758
|
});
|
|
768
759
|
|
|
769
|
-
await page.close();
|
|
770
760
|
this.stats.puppeteer.successes++;
|
|
771
761
|
|
|
772
762
|
return {
|
|
@@ -782,6 +772,26 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
782
772
|
error: `Puppeteer scraping failed: ${errorMsg}`,
|
|
783
773
|
errorType: this.categorizeError(errorMsg)
|
|
784
774
|
};
|
|
775
|
+
} finally {
|
|
776
|
+
// Always clean up page
|
|
777
|
+
if (page) {
|
|
778
|
+
try {
|
|
779
|
+
// Check if page is still connected before closing
|
|
780
|
+
if (!page.isClosed()) {
|
|
781
|
+
await page.close();
|
|
782
|
+
}
|
|
783
|
+
} catch (e) {
|
|
784
|
+
// Silently ignore protocol errors when page is already closed
|
|
785
|
+
if (!e.message.includes('Protocol error') && !e.message.includes('Target closed')) {
|
|
786
|
+
console.warn('Error closing page:', e.message);
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
|
|
791
|
+
// Release browser back to pool
|
|
792
|
+
if (browser) {
|
|
793
|
+
browserPool.releaseBrowser(browser);
|
|
794
|
+
}
|
|
785
795
|
}
|
|
786
796
|
}
|
|
787
797
|
|
|
@@ -1467,6 +1477,235 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
1467
1477
|
timestamp: new Date().toISOString()
|
|
1468
1478
|
};
|
|
1469
1479
|
}
|
|
1480
|
+
|
|
1481
|
+
/**
|
|
1482
|
+
* Clean up resources - closes all browser instances
|
|
1483
|
+
*/
|
|
1484
|
+
async cleanup() {
|
|
1485
|
+
await browserPool.closeAll();
|
|
1486
|
+
}
|
|
1487
|
+
|
|
1488
|
+
/**
|
|
1489
|
+
* Bulk scrape multiple URLs with optimized concurrency
|
|
1490
|
+
* @param {string[]} urls - Array of URLs to scrape
|
|
1491
|
+
* @param {Object} options - Scraping options
|
|
1492
|
+
* @returns {Promise<Object>} Bulk scraping results
|
|
1493
|
+
*/
|
|
1494
|
+
async bulkScrape(urls, options = {}) {
|
|
1495
|
+
const {
|
|
1496
|
+
concurrency = 5,
|
|
1497
|
+
progressCallback = null,
|
|
1498
|
+
continueOnError = true,
|
|
1499
|
+
...scrapeOptions
|
|
1500
|
+
} = options;
|
|
1501
|
+
|
|
1502
|
+
const results = {
|
|
1503
|
+
success: [],
|
|
1504
|
+
failed: [],
|
|
1505
|
+
total: urls.length,
|
|
1506
|
+
startTime: Date.now(),
|
|
1507
|
+
endTime: null,
|
|
1508
|
+
stats: {
|
|
1509
|
+
successful: 0,
|
|
1510
|
+
failed: 0,
|
|
1511
|
+
totalTime: 0,
|
|
1512
|
+
averageTime: 0,
|
|
1513
|
+
methods: {
|
|
1514
|
+
direct: 0,
|
|
1515
|
+
lightpanda: 0,
|
|
1516
|
+
puppeteer: 0,
|
|
1517
|
+
pdf: 0
|
|
1518
|
+
}
|
|
1519
|
+
}
|
|
1520
|
+
};
|
|
1521
|
+
|
|
1522
|
+
// Process URLs in batches
|
|
1523
|
+
const batches = [];
|
|
1524
|
+
for (let i = 0; i < urls.length; i += concurrency) {
|
|
1525
|
+
batches.push(urls.slice(i, i + concurrency));
|
|
1526
|
+
}
|
|
1527
|
+
|
|
1528
|
+
let processedCount = 0;
|
|
1529
|
+
|
|
1530
|
+
for (const batch of batches) {
|
|
1531
|
+
const batchPromises = batch.map(async (url) => {
|
|
1532
|
+
const startTime = Date.now();
|
|
1533
|
+
try {
|
|
1534
|
+
const result = await this.scrape(url, scrapeOptions);
|
|
1535
|
+
const endTime = Date.now();
|
|
1536
|
+
const duration = endTime - startTime;
|
|
1537
|
+
|
|
1538
|
+
const successResult = {
|
|
1539
|
+
url,
|
|
1540
|
+
...result,
|
|
1541
|
+
duration,
|
|
1542
|
+
timestamp: new Date(endTime).toISOString()
|
|
1543
|
+
};
|
|
1544
|
+
|
|
1545
|
+
results.success.push(successResult);
|
|
1546
|
+
results.stats.successful++;
|
|
1547
|
+
|
|
1548
|
+
// Track method usage
|
|
1549
|
+
if (result.method) {
|
|
1550
|
+
results.stats.methods[result.method]++;
|
|
1551
|
+
}
|
|
1552
|
+
|
|
1553
|
+
return successResult;
|
|
1554
|
+
} catch (error) {
|
|
1555
|
+
const endTime = Date.now();
|
|
1556
|
+
const duration = endTime - startTime;
|
|
1557
|
+
|
|
1558
|
+
const failedResult = {
|
|
1559
|
+
url,
|
|
1560
|
+
success: false,
|
|
1561
|
+
error: error.message,
|
|
1562
|
+
duration,
|
|
1563
|
+
timestamp: new Date(endTime).toISOString()
|
|
1564
|
+
};
|
|
1565
|
+
|
|
1566
|
+
results.failed.push(failedResult);
|
|
1567
|
+
results.stats.failed++;
|
|
1568
|
+
|
|
1569
|
+
if (!continueOnError) {
|
|
1570
|
+
throw error;
|
|
1571
|
+
}
|
|
1572
|
+
|
|
1573
|
+
return failedResult;
|
|
1574
|
+
} finally {
|
|
1575
|
+
processedCount++;
|
|
1576
|
+
if (progressCallback) {
|
|
1577
|
+
progressCallback({
|
|
1578
|
+
processed: processedCount,
|
|
1579
|
+
total: urls.length,
|
|
1580
|
+
percentage: (processedCount / urls.length) * 100,
|
|
1581
|
+
current: url
|
|
1582
|
+
});
|
|
1583
|
+
}
|
|
1584
|
+
}
|
|
1585
|
+
});
|
|
1586
|
+
|
|
1587
|
+
await Promise.all(batchPromises);
|
|
1588
|
+
}
|
|
1589
|
+
|
|
1590
|
+
results.endTime = Date.now();
|
|
1591
|
+
results.stats.totalTime = results.endTime - results.startTime;
|
|
1592
|
+
results.stats.averageTime = results.stats.totalTime / urls.length;
|
|
1593
|
+
|
|
1594
|
+
return results;
|
|
1595
|
+
}
|
|
1596
|
+
|
|
1597
|
+
/**
|
|
1598
|
+
* Bulk scrape with streaming results
|
|
1599
|
+
* @param {string[]} urls - Array of URLs to scrape
|
|
1600
|
+
* @param {Object} options - Scraping options with onResult callback
|
|
1601
|
+
* @returns {Promise<Object>} Summary statistics
|
|
1602
|
+
*/
|
|
1603
|
+
async bulkScrapeStream(urls, options = {}) {
|
|
1604
|
+
const {
|
|
1605
|
+
concurrency = 5,
|
|
1606
|
+
onResult = null,
|
|
1607
|
+
onError = null,
|
|
1608
|
+
progressCallback = null,
|
|
1609
|
+
...scrapeOptions
|
|
1610
|
+
} = options;
|
|
1611
|
+
|
|
1612
|
+
if (!onResult) {
|
|
1613
|
+
throw new Error('onResult callback is required for streaming bulk scrape');
|
|
1614
|
+
}
|
|
1615
|
+
|
|
1616
|
+
const stats = {
|
|
1617
|
+
total: urls.length,
|
|
1618
|
+
processed: 0,
|
|
1619
|
+
successful: 0,
|
|
1620
|
+
failed: 0,
|
|
1621
|
+
startTime: Date.now(),
|
|
1622
|
+
endTime: null,
|
|
1623
|
+
methods: {
|
|
1624
|
+
direct: 0,
|
|
1625
|
+
lightpanda: 0,
|
|
1626
|
+
puppeteer: 0,
|
|
1627
|
+
pdf: 0
|
|
1628
|
+
}
|
|
1629
|
+
};
|
|
1630
|
+
|
|
1631
|
+
const queue = [...urls];
|
|
1632
|
+
const inProgress = new Set();
|
|
1633
|
+
|
|
1634
|
+
const processNext = async () => {
|
|
1635
|
+
if (queue.length === 0 || inProgress.size >= concurrency) {
|
|
1636
|
+
return;
|
|
1637
|
+
}
|
|
1638
|
+
|
|
1639
|
+
const url = queue.shift();
|
|
1640
|
+
inProgress.add(url);
|
|
1641
|
+
|
|
1642
|
+
const startTime = Date.now();
|
|
1643
|
+
try {
|
|
1644
|
+
const result = await this.scrape(url, scrapeOptions);
|
|
1645
|
+
const duration = Date.now() - startTime;
|
|
1646
|
+
|
|
1647
|
+
stats.successful++;
|
|
1648
|
+
if (result.method) {
|
|
1649
|
+
stats.methods[result.method]++;
|
|
1650
|
+
}
|
|
1651
|
+
|
|
1652
|
+
await onResult({
|
|
1653
|
+
url,
|
|
1654
|
+
...result,
|
|
1655
|
+
duration,
|
|
1656
|
+
timestamp: new Date().toISOString()
|
|
1657
|
+
});
|
|
1658
|
+
} catch (error) {
|
|
1659
|
+
const duration = Date.now() - startTime;
|
|
1660
|
+
stats.failed++;
|
|
1661
|
+
|
|
1662
|
+
if (onError) {
|
|
1663
|
+
await onError({
|
|
1664
|
+
url,
|
|
1665
|
+
error: error.message,
|
|
1666
|
+
duration,
|
|
1667
|
+
timestamp: new Date().toISOString()
|
|
1668
|
+
});
|
|
1669
|
+
}
|
|
1670
|
+
} finally {
|
|
1671
|
+
inProgress.delete(url);
|
|
1672
|
+
stats.processed++;
|
|
1673
|
+
|
|
1674
|
+
if (progressCallback) {
|
|
1675
|
+
progressCallback({
|
|
1676
|
+
processed: stats.processed,
|
|
1677
|
+
total: stats.total,
|
|
1678
|
+
percentage: (stats.processed / stats.total) * 100,
|
|
1679
|
+
current: url
|
|
1680
|
+
});
|
|
1681
|
+
}
|
|
1682
|
+
|
|
1683
|
+
// Process next URL
|
|
1684
|
+
if (queue.length > 0) {
|
|
1685
|
+
processNext();
|
|
1686
|
+
}
|
|
1687
|
+
}
|
|
1688
|
+
};
|
|
1689
|
+
|
|
1690
|
+
// Start initial batch
|
|
1691
|
+
const initialBatch = Math.min(concurrency, queue.length);
|
|
1692
|
+
const promises = [];
|
|
1693
|
+
for (let i = 0; i < initialBatch; i++) {
|
|
1694
|
+
promises.push(processNext());
|
|
1695
|
+
}
|
|
1696
|
+
|
|
1697
|
+
// Wait for all to complete
|
|
1698
|
+
await Promise.all(promises);
|
|
1699
|
+
while (inProgress.size > 0) {
|
|
1700
|
+
await new Promise(resolve => setTimeout(resolve, 100));
|
|
1701
|
+
}
|
|
1702
|
+
|
|
1703
|
+
stats.endTime = Date.now();
|
|
1704
|
+
stats.totalTime = stats.endTime - stats.startTime;
|
|
1705
|
+
stats.averageTime = stats.totalTime / stats.total;
|
|
1706
|
+
|
|
1707
|
+
return stats;
|
|
1708
|
+
}
|
|
1470
1709
|
}
|
|
1471
1710
|
|
|
1472
1711
|
// Export convenience functions
|
|
@@ -1514,4 +1753,28 @@ export async function askWebsiteAI(url, question, options = {}) {
|
|
|
1514
1753
|
}
|
|
1515
1754
|
}
|
|
1516
1755
|
|
|
1756
|
+
/**
 * Convenience wrapper: scrape many URLs with a throwaway scraper instance.
 *
 * Creates a `BNCASmartScraper` from `options`, runs its `bulkScrape` method
 * with the same `options`, and always calls `cleanup()` afterwards, whether
 * the bulk run resolved or rejected.
 *
 * @param {string[]} urls - Array of URLs to scrape
 * @param {Object} [options] - Passed to both the scraper constructor and
 *   `scraper.bulkScrape`
 * @returns {Promise<Object>} Whatever `scraper.bulkScrape` resolves to
 */
export async function bulkScrape(urls, options = {}) {
  const scraper = new BNCASmartScraper(options);
  try {
    // The await is required: returning the bare promise would let the finally
    // block run cleanup() before the bulk run has finished.
    return await scraper.bulkScrape(urls, options);
  } finally {
    await scraper.cleanup();
  }
}
|
|
1767
|
+
|
|
1768
|
+
/**
 * Convenience wrapper: stream-scrape many URLs with a throwaway scraper
 * instance.
 *
 * Creates a `BNCASmartScraper` from `options`, runs its `bulkScrapeStream`
 * method with the same `options`, and always calls `cleanup()` afterwards,
 * whether the run resolved or rejected.
 *
 * @param {string[]} urls - Array of URLs to scrape
 * @param {Object} [options] - Passed to both the scraper constructor and
 *   `scraper.bulkScrapeStream`
 * @returns {Promise<Object>} Whatever `scraper.bulkScrapeStream` resolves to
 */
export async function bulkScrapeStream(urls, options = {}) {
  const scraper = new BNCASmartScraper(options);
  try {
    // The await is required: returning the bare promise would let the finally
    // block run cleanup() before the streaming run has finished.
    return await scraper.bulkScrapeStream(urls, options);
  } finally {
    await scraper.cleanup();
  }
}
|
|
1779
|
+
|
|
1517
1780
|
export default BNCASmartScraper;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@monostate/node-scraper",
|
|
3
|
-
"version": "1.7.0",
|
|
3
|
+
"version": "1.8.1",
|
|
4
4
|
"description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -14,7 +14,9 @@
|
|
|
14
14
|
"files": [
|
|
15
15
|
"index.js",
|
|
16
16
|
"index.d.ts",
|
|
17
|
+
"browser-pool.js",
|
|
17
18
|
"README.md",
|
|
19
|
+
"BULK_SCRAPING.md",
|
|
18
20
|
"package.json",
|
|
19
21
|
"scripts/"
|
|
20
22
|
],
|
|
@@ -49,7 +51,7 @@
|
|
|
49
51
|
"pdf-parse": "^1.1.1"
|
|
50
52
|
},
|
|
51
53
|
"peerDependencies": {
|
|
52
|
-
"puppeteer": "
|
|
54
|
+
"puppeteer": "^24.11.2"
|
|
53
55
|
},
|
|
54
56
|
"peerDependenciesMeta": {
|
|
55
57
|
"puppeteer": {
|
|
@@ -75,4 +77,4 @@
|
|
|
75
77
|
"publishConfig": {
|
|
76
78
|
"access": "public"
|
|
77
79
|
}
|
|
78
|
-
}
|
|
80
|
+
}
|