@monostate/node-scraper 1.7.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -19,6 +19,10 @@ yarn add @monostate/node-scraper
19
19
  pnpm add @monostate/node-scraper
20
20
  ```
21
21
 
22
+ **Fixed in v1.8.1**: Critical production fix — `browser-pool.js` is now included in the npm package.
23
+
24
+ **New in v1.8.0**: Bulk scraping with automatic request queueing, progress tracking, and streaming results! Process hundreds of URLs efficiently. Plus a critical memory-leak fix via browser instance pooling.
25
+
22
26
  **Fixed in v1.7.0**: Critical cross-platform compatibility fix - binaries are now correctly downloaded per platform instead of being bundled.
23
27
 
24
28
  **New in v1.6.0**: Method override support! Force specific scraping methods with `method` parameter for testing and optimization.
@@ -76,6 +80,24 @@ console.log(result.stats); // Performance statistics
76
80
  await scraper.cleanup(); // Clean up resources
77
81
  ```
78
82
 
83
+ ### Browser Pool Configuration (New in v1.8.0)
84
+
85
+ The package now includes automatic browser instance pooling to prevent memory leaks:
86
+
87
+ ```javascript
88
+ // Browser pool is managed automatically with these defaults:
89
+ // - Max 3 concurrent browser instances
90
+ // - 5 second idle timeout before cleanup
91
+ // - Automatic reuse of browser instances
92
+
93
+ // For heavy workloads, you can manually clean up:
94
+ const scraper = new BNCASmartScraper();
95
+ // ... perform multiple scrapes ...
96
+ await scraper.cleanup(); // Closes all browser instances
97
+ ```
98
+
99
+ **Important**: The convenience functions (`smartScrape`, `smartScreenshot`, etc.) automatically handle cleanup. You only need to call `cleanup()` when using the `BNCASmartScraper` class directly.
100
+
79
101
  ### Method Override (New in v1.6.0)
80
102
 
81
103
  Force a specific scraping method instead of using automatic fallback:
@@ -110,6 +132,68 @@ const result = await smartScrape('https://example.com', { method: 'auto' });
110
132
  }
111
133
  ```
112
134
 
135
+ ### Bulk Scraping (New in v1.8.0)
136
+
137
+ Process multiple URLs efficiently with automatic request queueing and progress tracking:
138
+
139
+ ```javascript
140
+ import { bulkScrape } from '@monostate/node-scraper';
141
+
142
+ // Basic bulk scraping
143
+ const urls = [
144
+ 'https://example1.com',
145
+ 'https://example2.com',
146
+ 'https://example3.com',
147
+ // ... hundreds more
148
+ ];
149
+
150
+ const results = await bulkScrape(urls, {
151
+ concurrency: 5, // Process 5 URLs at a time
152
+ continueOnError: true, // Don't stop on failures
153
+ progressCallback: (progress) => {
154
+ console.log(`Progress: ${progress.percentage.toFixed(1)}% (${progress.processed}/${progress.total})`);
155
+ }
156
+ });
157
+
158
+ console.log(`Success: ${results.stats.successful}, Failed: ${results.stats.failed}`);
159
+ console.log(`Total time: ${results.stats.totalTime}ms`);
160
+ console.log(`Average time per URL: ${results.stats.averageTime}ms`);
161
+ ```
162
+
163
+ #### Streaming Results
164
+
165
+ For large datasets, use streaming to process results as they complete:
166
+
167
+ ```javascript
168
+ import { bulkScrapeStream } from '@monostate/node-scraper';
169
+
170
+ await bulkScrapeStream(urls, {
171
+ concurrency: 10,
172
+ onResult: async (result) => {
173
+ // Process each successful result immediately
174
+ await saveToDatabase(result);
175
+ console.log(`✓ ${result.url} - ${result.duration}ms`);
176
+ },
177
+ onError: async (error) => {
178
+ // Handle errors as they occur
179
+ console.error(`✗ ${error.url} - ${error.error}`);
180
+ },
181
+ progressCallback: (progress) => {
182
+ process.stdout.write(`\rProcessing: ${progress.percentage.toFixed(1)}%`);
183
+ }
184
+ });
185
+ ```
186
+
187
+ **Features:**
188
+ - Automatic request queueing (no more memory errors!)
189
+ - Configurable concurrency control
190
+ - Real-time progress tracking
191
+ - Continue on error or stop on first failure
192
+ - Detailed statistics and method tracking
193
+ - Browser instance pooling for efficiency
194
+
195
+ For detailed examples and advanced usage, see [BULK_SCRAPING.md](./BULK_SCRAPING.md).
196
+
113
197
  ## How It Works
114
198
 
115
199
  BNCA uses a sophisticated multi-tier system with intelligent detection:
@@ -0,0 +1,229 @@
1
+ class BrowserPool {
2
+ constructor(maxInstances = 3, idleTimeout = 5000) {
3
+ this.maxInstances = maxInstances;
4
+ this.idleTimeout = idleTimeout;
5
+ this.pool = [];
6
+ this.busyBrowsers = new Set();
7
+ this.cleanupTimer = null;
8
+ this.requestQueue = [];
9
+ this.stats = {
10
+ created: 0,
11
+ reused: 0,
12
+ queued: 0,
13
+ cleaned: 0
14
+ };
15
+ }
16
+
17
+ async getBrowser() {
18
+ // Try to get an idle browser from pool
19
+ let browser = this.pool.find(b => !this.busyBrowsers.has(b.instance));
20
+
21
+ if (browser) {
22
+ browser.lastUsed = Date.now();
23
+ this.busyBrowsers.add(browser.instance);
24
+ this.stats.reused++;
25
+ return browser.instance;
26
+ }
27
+
28
+ // Create new browser if under limit
29
+ if (this.pool.length < this.maxInstances) {
30
+ browser = await this.createBrowser();
31
+ this.pool.push(browser);
32
+ this.busyBrowsers.add(browser.instance);
33
+ this.stats.created++;
34
+ return browser.instance;
35
+ }
36
+
37
+ // Queue the request and wait for available browser
38
+ this.stats.queued++;
39
+ return this.queueRequest();
40
+ }
41
+
42
+ async createBrowser() {
43
+ const puppeteer = await this.getPuppeteer();
44
+ const instance = await puppeteer.launch({
45
+ headless: 'new',
46
+ args: [
47
+ '--no-sandbox',
48
+ '--disable-setuid-sandbox',
49
+ '--disable-dev-shm-usage',
50
+ '--disable-gpu',
51
+ '--disable-web-security',
52
+ '--disable-features=VizDisplayCompositor',
53
+ '--disable-background-timer-throttling',
54
+ '--disable-backgrounding-occluded-windows',
55
+ '--disable-renderer-backgrounding',
56
+ '--disable-extensions',
57
+ '--disable-default-apps',
58
+ '--disable-sync',
59
+ '--metrics-recording-only',
60
+ '--mute-audio',
61
+ '--no-first-run'
62
+ ]
63
+ });
64
+
65
+ const browser = {
66
+ instance,
67
+ created: Date.now(),
68
+ lastUsed: Date.now(),
69
+ pageCount: 0
70
+ };
71
+
72
+ // Handle browser disconnect
73
+ instance.on('disconnected', () => {
74
+ this.removeBrowser(browser);
75
+ this.processQueue();
76
+ });
77
+
78
+ return browser;
79
+ }
80
+
81
+ async getPuppeteer() {
82
+ try {
83
+ const puppeteer = await import('puppeteer');
84
+ return puppeteer.default || puppeteer;
85
+ } catch (error) {
86
+ throw new Error('Puppeteer is not installed. Please install it to use Puppeteer-based scraping.');
87
+ }
88
+ }
89
+
90
+ async queueRequest() {
91
+ return new Promise((resolve) => {
92
+ this.requestQueue.push({ resolve, timestamp: Date.now() });
93
+ });
94
+ }
95
+
96
+ processQueue() {
97
+ if (this.requestQueue.length === 0) return;
98
+
99
+ // Find available browser
100
+ const available = this.pool.find(b => !this.busyBrowsers.has(b.instance));
101
+ if (!available) return;
102
+
103
+ // Process oldest request in queue
104
+ const request = this.requestQueue.shift();
105
+ if (request) {
106
+ available.lastUsed = Date.now();
107
+ this.busyBrowsers.add(available.instance);
108
+ request.resolve(available.instance);
109
+ }
110
+ }
111
+
112
+ releaseBrowser(browser) {
113
+ this.busyBrowsers.delete(browser);
114
+
115
+ // Process any queued requests
116
+ this.processQueue();
117
+
118
+ // Start cleanup timer if not already running
119
+ if (!this.cleanupTimer) {
120
+ this.cleanupTimer = setTimeout(() => this.cleanup(), this.idleTimeout);
121
+ }
122
+ }
123
+
124
+ removeBrowser(browserObj) {
125
+ const index = this.pool.findIndex(b => b.instance === browserObj.instance);
126
+ if (index !== -1) {
127
+ this.pool.splice(index, 1);
128
+ this.busyBrowsers.delete(browserObj.instance);
129
+ }
130
+ }
131
+
132
+ async cleanup() {
133
+ this.cleanupTimer = null;
134
+ const now = Date.now();
135
+ const toRemove = [];
136
+
137
+ // Keep at least one browser if there are queued requests
138
+ const minBrowsers = this.requestQueue.length > 0 ? 1 : 0;
139
+
140
+ for (const browser of this.pool) {
141
+ // Skip if we need to keep minimum browsers
142
+ if (this.pool.length - toRemove.length <= minBrowsers) break;
143
+
144
+ // Remove idle browsers
145
+ const isIdle = !this.busyBrowsers.has(browser.instance);
146
+ const idleTime = now - browser.lastUsed;
147
+
148
+ if (isIdle && idleTime > this.idleTimeout) {
149
+ toRemove.push(browser);
150
+ }
151
+ }
152
+
153
+ // Close idle browsers
154
+ for (const browser of toRemove) {
155
+ try {
156
+ // Check if browser is still connected
157
+ if (browser.instance && browser.instance.isConnected()) {
158
+ await browser.instance.close();
159
+ }
160
+ this.removeBrowser(browser);
161
+ this.stats.cleaned++;
162
+ } catch (error) {
163
+ // Silently ignore protocol errors and disconnection errors
164
+ if (!error.message.includes('Protocol error') &&
165
+ !error.message.includes('Target closed') &&
166
+ !error.message.includes('Connection closed')) {
167
+ console.warn('Error closing browser:', error.message);
168
+ }
169
+ // Remove browser even if close failed
170
+ this.removeBrowser(browser);
171
+ }
172
+ }
173
+
174
+ // Schedule next cleanup if there are still browsers
175
+ if (this.pool.length > 0) {
176
+ this.cleanupTimer = setTimeout(() => this.cleanup(), this.idleTimeout);
177
+ }
178
+ }
179
+
180
+ async closeAll() {
181
+ if (this.cleanupTimer) {
182
+ clearTimeout(this.cleanupTimer);
183
+ this.cleanupTimer = null;
184
+ }
185
+
186
+ // Clear the queue
187
+ this.requestQueue = [];
188
+
189
+ const closePromises = this.pool.map(async (browser) => {
190
+ try {
191
+ // Check if browser is still connected
192
+ if (browser.instance && browser.instance.isConnected()) {
193
+ await browser.instance.close();
194
+ }
195
+ } catch (error) {
196
+ // Silently ignore protocol errors and disconnection errors
197
+ if (!error.message.includes('Protocol error') &&
198
+ !error.message.includes('Target closed') &&
199
+ !error.message.includes('Connection closed')) {
200
+ console.warn('Error closing browser:', error.message);
201
+ }
202
+ }
203
+ });
204
+
205
+ await Promise.all(closePromises);
206
+ this.pool = [];
207
+ this.busyBrowsers.clear();
208
+ }
209
+
210
+ getStats() {
211
+ return {
212
+ ...this.stats,
213
+ poolSize: this.pool.length,
214
+ busyCount: this.busyBrowsers.size,
215
+ idleCount: this.pool.length - this.busyBrowsers.size,
216
+ queueLength: this.requestQueue.length
217
+ };
218
+ }
219
+ }
220
+
221
+ // Global browser pool instance
222
+ const browserPool = new BrowserPool(3, 5000);
223
+
224
+ // Graceful shutdown
225
+ process.on('SIGTERM', () => browserPool.closeAll());
226
+ process.on('SIGINT', () => browserPool.closeAll());
227
+ process.on('beforeExit', () => browserPool.closeAll());
228
+
229
+ export default browserPool;
package/index.d.ts CHANGED
@@ -139,6 +139,118 @@ export interface HealthCheckResult {
139
139
  timestamp: string;
140
140
  }
141
141
 
142
+ export interface BulkScrapeOptions extends ScrapingOptions {
143
+ /** Number of concurrent requests (default: 5) */
144
+ concurrency?: number;
145
+ /** Progress callback function */
146
+ progressCallback?: (progress: BulkProgress) => void;
147
+ /** Continue processing on error (default: true) */
148
+ continueOnError?: boolean;
149
+ }
150
+
151
+ export interface BulkScrapeStreamOptions extends ScrapingOptions {
152
+ /** Number of concurrent requests (default: 5) */
153
+ concurrency?: number;
154
+ /** Callback for each successful result */
155
+ onResult: (result: BulkScrapeResultItem) => void | Promise<void>;
156
+ /** Callback for errors */
157
+ onError?: (error: BulkScrapeErrorItem) => void | Promise<void>;
158
+ /** Progress callback function */
159
+ progressCallback?: (progress: BulkProgress) => void;
160
+ }
161
+
162
+ export interface BulkProgress {
163
+ /** Number of URLs processed */
164
+ processed: number;
165
+ /** Total number of URLs */
166
+ total: number;
167
+ /** Percentage complete */
168
+ percentage: number;
169
+ /** Current URL being processed */
170
+ current: string;
171
+ }
172
+
173
+ export interface BulkScrapeResult {
174
+ /** Successfully scraped results */
175
+ success: BulkScrapeResultItem[];
176
+ /** Failed scrapes */
177
+ failed: BulkScrapeErrorItem[];
178
+ /** Total number of URLs */
179
+ total: number;
180
+ /** Start timestamp */
181
+ startTime: number;
182
+ /** End timestamp */
183
+ endTime: number;
184
+ /** Aggregate statistics */
185
+ stats: BulkScrapeStats;
186
+ }
187
+
188
+ export interface BulkScrapeResultItem extends ScrapingResult {
189
+ /** The URL that was scraped */
190
+ url: string;
191
+ /** Time taken in milliseconds */
192
+ duration: number;
193
+ /** Timestamp of completion */
194
+ timestamp: string;
195
+ }
196
+
197
+ export interface BulkScrapeErrorItem {
198
+ /** The URL that failed */
199
+ url: string;
200
+ /** Success is always false for errors */
201
+ success: false;
202
+ /** Error message */
203
+ error: string;
204
+ /** Time taken in milliseconds */
205
+ duration: number;
206
+ /** Timestamp of failure */
207
+ timestamp: string;
208
+ }
209
+
210
+ export interface BulkScrapeStats {
211
+ /** Number of successful scrapes */
212
+ successful: number;
213
+ /** Number of failed scrapes */
214
+ failed: number;
215
+ /** Total time taken in milliseconds */
216
+ totalTime: number;
217
+ /** Average time per URL in milliseconds */
218
+ averageTime: number;
219
+ /** Count of methods used */
220
+ methods: {
221
+ direct: number;
222
+ lightpanda: number;
223
+ puppeteer: number;
224
+ pdf: number;
225
+ };
226
+ }
227
+
228
+ export interface BulkScrapeStreamStats {
229
+ /** Total number of URLs */
230
+ total: number;
231
+ /** Number of URLs processed */
232
+ processed: number;
233
+ /** Number of successful scrapes */
234
+ successful: number;
235
+ /** Number of failed scrapes */
236
+ failed: number;
237
+ /** Start timestamp */
238
+ startTime: number;
239
+ /** End timestamp */
240
+ endTime: number;
241
+ /** Total time in milliseconds */
242
+ totalTime: number;
243
+ /** Average time per URL in milliseconds */
244
+ averageTime: number;
245
+ /** Count of methods used */
246
+ methods: {
247
+ direct: number;
248
+ lightpanda: number;
249
+ puppeteer: number;
250
+ pdf: number;
251
+ };
252
+ }
253
+
142
254
  /**
143
255
  * BNCA Smart Scraper - Intelligent web scraping with multi-level fallback
144
256
  */
@@ -264,6 +376,27 @@ export class BNCASmartScraper {
264
376
  * @param message Message to log
265
377
  */
266
378
  private log(message: string): void;
379
+
380
+ /**
381
+ * Clean up resources - closes all browser instances
382
+ */
383
+ cleanup(): Promise<void>;
384
+
385
+ /**
386
+ * Bulk scrape multiple URLs with optimized concurrency
387
+ * @param urls Array of URLs to scrape
388
+ * @param options Bulk scraping options
389
+ * @returns Promise resolving to bulk scraping results
390
+ */
391
+ bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise<BulkScrapeResult>;
392
+
393
+ /**
394
+ * Bulk scrape with streaming results
395
+ * @param urls Array of URLs to scrape
396
+ * @param options Bulk scraping options with callbacks
397
+ * @returns Promise resolving to summary statistics
398
+ */
399
+ bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;
267
400
  }
268
401
 
269
402
  /**
@@ -306,6 +439,22 @@ export function askWebsiteAI(url: string, question: string, options?: ScrapingOp
306
439
  processing?: 'openrouter' | 'openai' | 'backend' | 'local';
307
440
  }>;
308
441
 
442
+ /**
443
+ * Convenience function for bulk scraping multiple URLs
444
+ * @param urls Array of URLs to scrape
445
+ * @param options Bulk scraping options
446
+ * @returns Promise resolving to bulk scraping results
447
+ */
448
+ export function bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise<BulkScrapeResult>;
449
+
450
+ /**
451
+ * Convenience function for bulk scraping with streaming results
452
+ * @param urls Array of URLs to scrape
453
+ * @param options Bulk scraping options with callbacks
454
+ * @returns Promise resolving to summary statistics
455
+ */
456
+ export function bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;
457
+
309
458
  /**
310
459
  * Default export - same as BNCASmartScraper class
311
460
  */