@monostate/node-scraper 1.7.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -19,6 +19,10 @@ yarn add @monostate/node-scraper
19
19
  pnpm add @monostate/node-scraper
20
20
  ```
21
21
 
22
+ **Fixed in v1.8.1**: Critical production fix — `browser-pool.js` is now included in the npm package.
23
+
24
+ **New in v1.8.0**: Bulk scraping with automatic request queueing, progress tracking, and streaming results! Process hundreds of URLs efficiently. Plus a critical memory-leak fix via browser instance pooling.
25
+
22
26
  **Fixed in v1.7.0**: Critical cross-platform compatibility fix - binaries are now correctly downloaded per platform instead of being bundled.
23
27
 
24
28
  **New in v1.6.0**: Method override support! Force specific scraping methods with `method` parameter for testing and optimization.
@@ -76,6 +80,24 @@ console.log(result.stats); // Performance statistics
76
80
  await scraper.cleanup(); // Clean up resources
77
81
  ```
78
82
 
83
+ ### Browser Pool Configuration (New in v1.8.0)
84
+
85
+ The package now includes automatic browser instance pooling to prevent memory leaks:
86
+
87
+ ```javascript
88
+ // Browser pool is managed automatically with these defaults:
89
+ // - Max 3 concurrent browser instances
90
+ // - 5 second idle timeout before cleanup
91
+ // - Automatic reuse of browser instances
92
+
93
+ // For heavy workloads, you can manually clean up:
94
+ const scraper = new BNCASmartScraper();
95
+ // ... perform multiple scrapes ...
96
+ await scraper.cleanup(); // Closes all browser instances
97
+ ```
98
+
99
+ **Important**: The convenience functions (`smartScrape`, `smartScreenshot`, etc.) automatically handle cleanup. You only need to call `cleanup()` when using the `BNCASmartScraper` class directly.
100
+
79
101
  ### Method Override (New in v1.6.0)
80
102
 
81
103
  Force a specific scraping method instead of using automatic fallback:
@@ -110,6 +132,68 @@ const result = await smartScrape('https://example.com', { method: 'auto' });
110
132
  }
111
133
  ```
112
134
 
135
+ ### Bulk Scraping (New in v1.8.0)
136
+
137
+ Process multiple URLs efficiently with automatic request queueing and progress tracking:
138
+
139
+ ```javascript
140
+ import { bulkScrape } from '@monostate/node-scraper';
141
+
142
+ // Basic bulk scraping
143
+ const urls = [
144
+ 'https://example1.com',
145
+ 'https://example2.com',
146
+ 'https://example3.com',
147
+ // ... hundreds more
148
+ ];
149
+
150
+ const results = await bulkScrape(urls, {
151
+ concurrency: 5, // Process 5 URLs at a time
152
+ continueOnError: true, // Don't stop on failures
153
+ progressCallback: (progress) => {
154
+ console.log(`Progress: ${progress.percentage.toFixed(1)}% (${progress.processed}/${progress.total})`);
155
+ }
156
+ });
157
+
158
+ console.log(`Success: ${results.stats.successful}, Failed: ${results.stats.failed}`);
159
+ console.log(`Total time: ${results.stats.totalTime}ms`);
160
+ console.log(`Average time per URL: ${results.stats.averageTime}ms`);
161
+ ```
162
+
163
+ #### Streaming Results
164
+
165
+ For large datasets, use streaming to process results as they complete:
166
+
167
+ ```javascript
168
+ import { bulkScrapeStream } from '@monostate/node-scraper';
169
+
170
+ await bulkScrapeStream(urls, {
171
+ concurrency: 10,
172
+ onResult: async (result) => {
173
+ // Process each successful result immediately
174
+ await saveToDatabase(result);
175
+ console.log(`✓ ${result.url} - ${result.duration}ms`);
176
+ },
177
+ onError: async (error) => {
178
+ // Handle errors as they occur
179
+ console.error(`✗ ${error.url} - ${error.error}`);
180
+ },
181
+ progressCallback: (progress) => {
182
+ process.stdout.write(`\rProcessing: ${progress.percentage.toFixed(1)}%`);
183
+ }
184
+ });
185
+ ```
186
+
187
+ **Features:**
188
+ - Automatic request queueing (no more memory errors!)
189
+ - Configurable concurrency control
190
+ - Real-time progress tracking
191
+ - Continue on error or stop on first failure
192
+ - Detailed statistics and method tracking
193
+ - Browser instance pooling for efficiency
194
+
195
+ For detailed examples and advanced usage, see [BULK_SCRAPING.md](./BULK_SCRAPING.md).
196
+
113
197
  ## How It Works
114
198
 
115
199
  BNCA uses a sophisticated multi-tier system with intelligent detection:
@@ -0,0 +1,229 @@
1
+ class BrowserPool {
2
+ constructor(maxInstances = 3, idleTimeout = 5000) {
3
+ this.maxInstances = maxInstances;
4
+ this.idleTimeout = idleTimeout;
5
+ this.pool = [];
6
+ this.busyBrowsers = new Set();
7
+ this.cleanupTimer = null;
8
+ this.requestQueue = [];
9
+ this.stats = {
10
+ created: 0,
11
+ reused: 0,
12
+ queued: 0,
13
+ cleaned: 0
14
+ };
15
+ }
16
+
17
+ async getBrowser() {
18
+ // Try to get an idle browser from pool
19
+ let browser = this.pool.find(b => !this.busyBrowsers.has(b.instance));
20
+
21
+ if (browser) {
22
+ browser.lastUsed = Date.now();
23
+ this.busyBrowsers.add(browser.instance);
24
+ this.stats.reused++;
25
+ return browser.instance;
26
+ }
27
+
28
+ // Create new browser if under limit
29
+ if (this.pool.length < this.maxInstances) {
30
+ browser = await this.createBrowser();
31
+ this.pool.push(browser);
32
+ this.busyBrowsers.add(browser.instance);
33
+ this.stats.created++;
34
+ return browser.instance;
35
+ }
36
+
37
+ // Queue the request and wait for available browser
38
+ this.stats.queued++;
39
+ return this.queueRequest();
40
+ }
41
+
42
+ async createBrowser() {
43
+ const puppeteer = await this.getPuppeteer();
44
+ const instance = await puppeteer.launch({
45
+ headless: 'new',
46
+ args: [
47
+ '--no-sandbox',
48
+ '--disable-setuid-sandbox',
49
+ '--disable-dev-shm-usage',
50
+ '--disable-gpu',
51
+ '--disable-web-security',
52
+ '--disable-features=VizDisplayCompositor',
53
+ '--disable-background-timer-throttling',
54
+ '--disable-backgrounding-occluded-windows',
55
+ '--disable-renderer-backgrounding',
56
+ '--disable-extensions',
57
+ '--disable-default-apps',
58
+ '--disable-sync',
59
+ '--metrics-recording-only',
60
+ '--mute-audio',
61
+ '--no-first-run'
62
+ ]
63
+ });
64
+
65
+ const browser = {
66
+ instance,
67
+ created: Date.now(),
68
+ lastUsed: Date.now(),
69
+ pageCount: 0
70
+ };
71
+
72
+ // Handle browser disconnect
73
+ instance.on('disconnected', () => {
74
+ this.removeBrowser(browser);
75
+ this.processQueue();
76
+ });
77
+
78
+ return browser;
79
+ }
80
+
81
+ async getPuppeteer() {
82
+ try {
83
+ const puppeteer = await import('puppeteer');
84
+ return puppeteer.default || puppeteer;
85
+ } catch (error) {
86
+ throw new Error('Puppeteer is not installed. Please install it to use Puppeteer-based scraping.');
87
+ }
88
+ }
89
+
90
+ async queueRequest() {
91
+ return new Promise((resolve) => {
92
+ this.requestQueue.push({ resolve, timestamp: Date.now() });
93
+ });
94
+ }
95
+
96
+ processQueue() {
97
+ if (this.requestQueue.length === 0) return;
98
+
99
+ // Find available browser
100
+ const available = this.pool.find(b => !this.busyBrowsers.has(b.instance));
101
+ if (!available) return;
102
+
103
+ // Process oldest request in queue
104
+ const request = this.requestQueue.shift();
105
+ if (request) {
106
+ available.lastUsed = Date.now();
107
+ this.busyBrowsers.add(available.instance);
108
+ request.resolve(available.instance);
109
+ }
110
+ }
111
+
112
+ releaseBrowser(browser) {
113
+ this.busyBrowsers.delete(browser);
114
+
115
+ // Process any queued requests
116
+ this.processQueue();
117
+
118
+ // Start cleanup timer if not already running
119
+ if (!this.cleanupTimer) {
120
+ this.cleanupTimer = setTimeout(() => this.cleanup(), this.idleTimeout);
121
+ }
122
+ }
123
+
124
+ removeBrowser(browserObj) {
125
+ const index = this.pool.findIndex(b => b.instance === browserObj.instance);
126
+ if (index !== -1) {
127
+ this.pool.splice(index, 1);
128
+ this.busyBrowsers.delete(browserObj.instance);
129
+ }
130
+ }
131
+
132
+ async cleanup() {
133
+ this.cleanupTimer = null;
134
+ const now = Date.now();
135
+ const toRemove = [];
136
+
137
+ // Keep at least one browser if there are queued requests
138
+ const minBrowsers = this.requestQueue.length > 0 ? 1 : 0;
139
+
140
+ for (const browser of this.pool) {
141
+ // Skip if we need to keep minimum browsers
142
+ if (this.pool.length - toRemove.length <= minBrowsers) break;
143
+
144
+ // Remove idle browsers
145
+ const isIdle = !this.busyBrowsers.has(browser.instance);
146
+ const idleTime = now - browser.lastUsed;
147
+
148
+ if (isIdle && idleTime > this.idleTimeout) {
149
+ toRemove.push(browser);
150
+ }
151
+ }
152
+
153
+ // Close idle browsers
154
+ for (const browser of toRemove) {
155
+ try {
156
+ // Check if browser is still connected
157
+ if (browser.instance && browser.instance.isConnected()) {
158
+ await browser.instance.close();
159
+ }
160
+ this.removeBrowser(browser);
161
+ this.stats.cleaned++;
162
+ } catch (error) {
163
+ // Silently ignore protocol errors and disconnection errors
164
+ if (!error.message.includes('Protocol error') &&
165
+ !error.message.includes('Target closed') &&
166
+ !error.message.includes('Connection closed')) {
167
+ console.warn('Error closing browser:', error.message);
168
+ }
169
+ // Remove browser even if close failed
170
+ this.removeBrowser(browser);
171
+ }
172
+ }
173
+
174
+ // Schedule next cleanup if there are still browsers
175
+ if (this.pool.length > 0) {
176
+ this.cleanupTimer = setTimeout(() => this.cleanup(), this.idleTimeout);
177
+ }
178
+ }
179
+
180
+ async closeAll() {
181
+ if (this.cleanupTimer) {
182
+ clearTimeout(this.cleanupTimer);
183
+ this.cleanupTimer = null;
184
+ }
185
+
186
+ // Clear the queue
187
+ this.requestQueue = [];
188
+
189
+ const closePromises = this.pool.map(async (browser) => {
190
+ try {
191
+ // Check if browser is still connected
192
+ if (browser.instance && browser.instance.isConnected()) {
193
+ await browser.instance.close();
194
+ }
195
+ } catch (error) {
196
+ // Silently ignore protocol errors and disconnection errors
197
+ if (!error.message.includes('Protocol error') &&
198
+ !error.message.includes('Target closed') &&
199
+ !error.message.includes('Connection closed')) {
200
+ console.warn('Error closing browser:', error.message);
201
+ }
202
+ }
203
+ });
204
+
205
+ await Promise.all(closePromises);
206
+ this.pool = [];
207
+ this.busyBrowsers.clear();
208
+ }
209
+
210
+ getStats() {
211
+ return {
212
+ ...this.stats,
213
+ poolSize: this.pool.length,
214
+ busyCount: this.busyBrowsers.size,
215
+ idleCount: this.pool.length - this.busyBrowsers.size,
216
+ queueLength: this.requestQueue.length
217
+ };
218
+ }
219
+ }
220
+
221
+ // Global browser pool instance
222
+ const browserPool = new BrowserPool(3, 5000);
223
+
224
+ // Graceful shutdown
225
+ process.on('SIGTERM', () => browserPool.closeAll());
226
+ process.on('SIGINT', () => browserPool.closeAll());
227
+ process.on('beforeExit', () => browserPool.closeAll());
228
+
229
+ export default browserPool;
package/index.d.ts CHANGED
@@ -139,6 +139,118 @@ export interface HealthCheckResult {
139
139
  timestamp: string;
140
140
  }
141
141
 
142
+ export interface BulkScrapeOptions extends ScrapingOptions {
143
+ /** Number of concurrent requests (default: 5) */
144
+ concurrency?: number;
145
+ /** Progress callback function */
146
+ progressCallback?: (progress: BulkProgress) => void;
147
+ /** Continue processing on error (default: true) */
148
+ continueOnError?: boolean;
149
+ }
150
+
151
+ export interface BulkScrapeStreamOptions extends ScrapingOptions {
152
+ /** Number of concurrent requests (default: 5) */
153
+ concurrency?: number;
154
+ /** Callback for each successful result */
155
+ onResult: (result: BulkScrapeResultItem) => void | Promise<void>;
156
+ /** Callback for errors */
157
+ onError?: (error: BulkScrapeErrorItem) => void | Promise<void>;
158
+ /** Progress callback function */
159
+ progressCallback?: (progress: BulkProgress) => void;
160
+ }
161
+
162
+ export interface BulkProgress {
163
+ /** Number of URLs processed */
164
+ processed: number;
165
+ /** Total number of URLs */
166
+ total: number;
167
+ /** Percentage complete */
168
+ percentage: number;
169
+ /** Current URL being processed */
170
+ current: string;
171
+ }
172
+
173
+ export interface BulkScrapeResult {
174
+ /** Successfully scraped results */
175
+ success: BulkScrapeResultItem[];
176
+ /** Failed scrapes */
177
+ failed: BulkScrapeErrorItem[];
178
+ /** Total number of URLs */
179
+ total: number;
180
+ /** Start timestamp */
181
+ startTime: number;
182
+ /** End timestamp */
183
+ endTime: number;
184
+ /** Aggregate statistics */
185
+ stats: BulkScrapeStats;
186
+ }
187
+
188
+ export interface BulkScrapeResultItem extends ScrapingResult {
189
+ /** The URL that was scraped */
190
+ url: string;
191
+ /** Time taken in milliseconds */
192
+ duration: number;
193
+ /** Timestamp of completion */
194
+ timestamp: string;
195
+ }
196
+
197
+ export interface BulkScrapeErrorItem {
198
+ /** The URL that failed */
199
+ url: string;
200
+ /** Success is always false for errors */
201
+ success: false;
202
+ /** Error message */
203
+ error: string;
204
+ /** Time taken in milliseconds */
205
+ duration: number;
206
+ /** Timestamp of failure */
207
+ timestamp: string;
208
+ }
209
+
210
+ export interface BulkScrapeStats {
211
+ /** Number of successful scrapes */
212
+ successful: number;
213
+ /** Number of failed scrapes */
214
+ failed: number;
215
+ /** Total time taken in milliseconds */
216
+ totalTime: number;
217
+ /** Average time per URL in milliseconds */
218
+ averageTime: number;
219
+ /** Count of methods used */
220
+ methods: {
221
+ direct: number;
222
+ lightpanda: number;
223
+ puppeteer: number;
224
+ pdf: number;
225
+ };
226
+ }
227
+
228
+ export interface BulkScrapeStreamStats {
229
+ /** Total number of URLs */
230
+ total: number;
231
+ /** Number of URLs processed */
232
+ processed: number;
233
+ /** Number of successful scrapes */
234
+ successful: number;
235
+ /** Number of failed scrapes */
236
+ failed: number;
237
+ /** Start timestamp */
238
+ startTime: number;
239
+ /** End timestamp */
240
+ endTime: number;
241
+ /** Total time in milliseconds */
242
+ totalTime: number;
243
+ /** Average time per URL in milliseconds */
244
+ averageTime: number;
245
+ /** Count of methods used */
246
+ methods: {
247
+ direct: number;
248
+ lightpanda: number;
249
+ puppeteer: number;
250
+ pdf: number;
251
+ };
252
+ }
253
+
142
254
  /**
143
255
  * BNCA Smart Scraper - Intelligent web scraping with multi-level fallback
144
256
  */
@@ -264,6 +376,27 @@ export class BNCASmartScraper {
264
376
  * @param message Message to log
265
377
  */
266
378
  private log(message: string): void;
379
+
380
+ /**
381
+ * Clean up resources - closes all browser instances
382
+ */
383
+ cleanup(): Promise<void>;
384
+
385
+ /**
386
+ * Bulk scrape multiple URLs with optimized concurrency
387
+ * @param urls Array of URLs to scrape
388
+ * @param options Bulk scraping options
389
+ * @returns Promise resolving to bulk scraping results
390
+ */
391
+ bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise<BulkScrapeResult>;
392
+
393
+ /**
394
+ * Bulk scrape with streaming results
395
+ * @param urls Array of URLs to scrape
396
+ * @param options Bulk scraping options with callbacks
397
+ * @returns Promise resolving to summary statistics
398
+ */
399
+ bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;
267
400
  }
268
401
 
269
402
  /**
@@ -306,6 +439,22 @@ export function askWebsiteAI(url: string, question: string, options?: ScrapingOp
306
439
  processing?: 'openrouter' | 'openai' | 'backend' | 'local';
307
440
  }>;
308
441
 
442
+ /**
443
+ * Convenience function for bulk scraping multiple URLs
444
+ * @param urls Array of URLs to scrape
445
+ * @param options Bulk scraping options
446
+ * @returns Promise resolving to bulk scraping results
447
+ */
448
+ export function bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise<BulkScrapeResult>;
449
+
450
+ /**
451
+ * Convenience function for bulk scraping with streaming results
452
+ * @param urls Array of URLs to scrape
453
+ * @param options Bulk scraping options with callbacks
454
+ * @returns Promise resolving to summary statistics
455
+ */
456
+ export function bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;
457
+
309
458
  /**
310
459
  * Default export - same as BNCASmartScraper class
311
460
  */