npm - @monostate/node-scraper - Versions diffs - 1.8.0 → 2.0.0 - Mend

@monostate/node-scraper 1.8.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/BULK_SCRAPING.md +626 -0
package/README.md +106 -556
package/browser-pool.js +229 -0
package/index.js +46 -28
package/package.json +7 -5
package/scripts/install-lightpanda.js +20 -7

package/browser-pool.js ADDED Viewed

@@ -0,0 +1,229 @@
+class BrowserPool {
+    constructor(maxInstances = 3, idleTimeout = 5000) {
+        this.maxInstances = maxInstances;
+        this.idleTimeout = idleTimeout;
+        this.pool = [];
+        this.busyBrowsers = new Set();
+        this.cleanupTimer = null;
+        this.requestQueue = [];
+        this.stats = {
+            created: 0,
+            reused: 0,
+            queued: 0,
+            cleaned: 0
+        };
+    }
+    async getBrowser() {
+        // Try to get an idle browser from pool
+        let browser = this.pool.find(b => !this.busyBrowsers.has(b.instance));
+        if (browser) {
+            browser.lastUsed = Date.now();
+            this.busyBrowsers.add(browser.instance);
+            this.stats.reused++;
+            return browser.instance;
+        }
+        // Create new browser if under limit
+        if (this.pool.length < this.maxInstances) {
+            browser = await this.createBrowser();
+            this.pool.push(browser);
+            this.busyBrowsers.add(browser.instance);
+            this.stats.created++;
+            return browser.instance;
+        }
+        // Queue the request and wait for available browser
+        this.stats.queued++;
+        return this.queueRequest();
+    }
+    async createBrowser() {
+        const puppeteer = await this.getPuppeteer();
+        const instance = await puppeteer.launch({
+            headless: true,
+            args: [
+                '--no-sandbox',
+                '--disable-setuid-sandbox',
+                '--disable-dev-shm-usage',
+                '--disable-gpu',
+                '--disable-web-security',
+                '--disable-features=VizDisplayCompositor',
+                '--disable-background-timer-throttling',
+                '--disable-backgrounding-occluded-windows',
+                '--disable-renderer-backgrounding',
+                '--disable-extensions',
+                '--disable-default-apps',
+                '--disable-sync',
+                '--metrics-recording-only',
+                '--mute-audio',
+                '--no-first-run'
+            ]
+        });
+        const browser = {
+            instance,
+            created: Date.now(),
+            lastUsed: Date.now(),
+            pageCount: 0
+        };
+        // Handle browser disconnect
+        instance.on('disconnected', () => {
+            this.removeBrowser(browser);
+            this.processQueue();
+        });
+        return browser;
+    }
+    async getPuppeteer() {
+        try {
+            const puppeteer = await import('puppeteer');
+            return puppeteer.default || puppeteer;
+        } catch (error) {
+            throw new Error('Puppeteer is not installed. Please install it to use Puppeteer-based scraping.');
+        }
+    }
+    async queueRequest() {
+        return new Promise((resolve) => {
+            this.requestQueue.push({ resolve, timestamp: Date.now() });
+        });
+    }
+    processQueue() {
+        if (this.requestQueue.length === 0) return;
+        // Find available browser
+        const available = this.pool.find(b => !this.busyBrowsers.has(b.instance));
+        if (!available) return;
+        // Process oldest request in queue
+        const request = this.requestQueue.shift();
+        if (request) {
+            available.lastUsed = Date.now();
+            this.busyBrowsers.add(available.instance);
+            request.resolve(available.instance);
+        }
+    }
+    releaseBrowser(browser) {
+        this.busyBrowsers.delete(browser);
+        // Process any queued requests
+        this.processQueue();
+        // Start cleanup timer if not already running
+        if (!this.cleanupTimer) {
+            this.cleanupTimer = setTimeout(() => this.cleanup(), this.idleTimeout);
+        }
+    }
+    removeBrowser(browserObj) {
+        const index = this.pool.findIndex(b => b.instance === browserObj.instance);
+        if (index !== -1) {
+            this.pool.splice(index, 1);
+            this.busyBrowsers.delete(browserObj.instance);
+        }
+    }
+    async cleanup() {
+        this.cleanupTimer = null;
+        const now = Date.now();
+        const toRemove = [];
+        // Keep at least one browser if there are queued requests
+        const minBrowsers = this.requestQueue.length > 0 ? 1 : 0;
+        for (const browser of this.pool) {
+            // Skip if we need to keep minimum browsers
+            if (this.pool.length - toRemove.length <= minBrowsers) break;
+            // Remove idle browsers
+            const isIdle = !this.busyBrowsers.has(browser.instance);
+            const idleTime = now - browser.lastUsed;
+            if (isIdle && idleTime > this.idleTimeout) {
+                toRemove.push(browser);
+            }
+        }
+        // Close idle browsers
+        for (const browser of toRemove) {
+            try {
+                // Check if browser is still connected
+                if (browser.instance && browser.instance.isConnected()) {
+                    await browser.instance.close();
+                }
+                this.removeBrowser(browser);
+                this.stats.cleaned++;
+            } catch (error) {
+                // Silently ignore protocol errors and disconnection errors
+                if (!error.message.includes('Protocol error') &&
+                    !error.message.includes('Target closed') &&
+                    !error.message.includes('Connection closed')) {
+                    console.warn('Error closing browser:', error.message);
+                }
+                // Remove browser even if close failed
+                this.removeBrowser(browser);
+            }
+        }
+        // Schedule next cleanup if there are still browsers
+        if (this.pool.length > 0) {
+            this.cleanupTimer = setTimeout(() => this.cleanup(), this.idleTimeout);
+        }
+    }
+    async closeAll() {
+        if (this.cleanupTimer) {
+            clearTimeout(this.cleanupTimer);
+            this.cleanupTimer = null;
+        }
+        // Clear the queue
+        this.requestQueue = [];
+        const closePromises = this.pool.map(async (browser) => {
+            try {
+                // Check if browser is still connected
+                if (browser.instance && browser.instance.isConnected()) {
+                    await browser.instance.close();
+                }
+            } catch (error) {
+                // Silently ignore protocol errors and disconnection errors
+                if (!error.message.includes('Protocol error') &&
+                    !error.message.includes('Target closed') &&
+                    !error.message.includes('Connection closed')) {
+                    console.warn('Error closing browser:', error.message);
+                }
+            }
+        });
+        await Promise.all(closePromises);
+        this.pool = [];
+        this.busyBrowsers.clear();
+    }
+    getStats() {
+        return {
+            ...this.stats,
+            poolSize: this.pool.length,
+            busyCount: this.busyBrowsers.size,
+            idleCount: this.pool.length - this.busyBrowsers.size,
+            queueLength: this.requestQueue.length
+        };
+    }
+}
+// Global browser pool instance
+const browserPool = new BrowserPool(3, 5000);
+// Graceful shutdown
+process.on('SIGTERM', () => browserPool.closeAll());
+process.on('SIGINT', () => browserPool.closeAll());
+process.on('beforeExit', () => browserPool.closeAll());
+export default browserPool;

package/index.js CHANGED Viewed

@@ -1,11 +1,10 @@
-import fetch from 'node-fetch';
 import { spawn, execSync } from 'child_process';
 import fs from 'fs/promises';
 import { existsSync, statSync } from 'fs';
 import path from 'path';
 import { fileURLToPath } from 'url';
 import { promises as fsPromises } from 'fs';
-import pdfParse from 'pdf-parse/lib/pdf-parse.js';
+import { PDFParse } from 'pdf-parse';
 import browserPool from './browser-pool.js';
 let puppeteer = null;
@@ -604,27 +603,41 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
     }
     return new Promise((resolve) => {
-      const args = ['fetch', '--dump', url];
+      const format = config.lightpandaFormat || 'html';
+      const args = [
+        'fetch',
+        '--dump', format,
+        '--with_frames',
+        '--http_timeout', String(config.timeout),
+        url
+      ];
       const process = spawn(this.options.lightpandaPath, args, {
-        timeout: config.timeout + 1000 // Add buffer for process timeout only
+        timeout: config.timeout + 2000 // Buffer above http_timeout
       });
       let output = '';
       let errorOutput = '';
       process.stdout.on('data', (data) => {
         output += data.toString();
       });
       process.stderr.on('data', (data) => {
         errorOutput += data.toString();
       });
       process.on('close', (code) => {
         if (code === 0 && output.length > 0) {
-          const content = this.extractContentFromHTML(output);
+          // Markdown output is already clean text, no HTML extraction needed
+          const content = format === 'markdown'
+            ? JSON.stringify({
+                title: output.match(/^#\s+(.+)$/m)?.[1] || '',
+                content: output,
+                extractedAt: new Date().toISOString()
+              }, null, 2)
+            : this.extractContentFromHTML(output);
           this.stats.lightpanda.successes++;
           resolve({
             success: true,
             content,
@@ -642,7 +655,7 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
           });
         }
       });
       process.on('error', (error) => {
         resolve({
           success: false,
@@ -847,25 +860,30 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
         };
       }
-      // Parse PDF
-      const pdfData = await pdfParse(buffer);
+      // Parse PDF with pdf-parse v2 API
+      const parser = new PDFParse({ data: new Uint8Array(buffer) });
+      await parser.load();
+      const textResult = await parser.getText();
+      const infoResult = await parser.getInfo();
+      parser.destroy();
       // Extract structured content
+      const pdfInfo = infoResult.info || {};
       const content = {
-        title: pdfData.info?.Title || 'Untitled PDF',
-        author: pdfData.info?.Author || '',
-        subject: pdfData.info?.Subject || '',
-        keywords: pdfData.info?.Keywords || '',
-        creator: pdfData.info?.Creator || '',
-        producer: pdfData.info?.Producer || '',
-        creationDate: pdfData.info?.CreationDate || '',
-        modificationDate: pdfData.info?.ModificationDate || '',
-        pages: pdfData.numpages || 0,
-        text: pdfData.text || '',
-        metadata: pdfData.metadata || null,
+        title: pdfInfo.Title || infoResult.outline?.[0]?.title || 'Untitled PDF',
+        author: pdfInfo.Author || '',
+        subject: pdfInfo.Subject || '',
+        keywords: pdfInfo.Keywords || '',
+        creator: pdfInfo.Creator || '',
+        producer: pdfInfo.Producer || '',
+        creationDate: pdfInfo.CreationDate || '',
+        modificationDate: pdfInfo.ModDate || '',
+        pages: textResult.total || 0,
+        text: textResult.text || '',
+        metadata: infoResult.metadata || null,
         url: url
       };
       this.stats.pdf.successes++;
       return {
@@ -1008,11 +1026,11 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
       });
       // Extract window state data
-      const windowDataMatch = html.match(/window\.__(?:INITIAL_STATE__|INITIAL_DATA__|NEXT_DATA__)__\s*=\s*({[\s\S]*?});/);
+      const windowDataMatch = html.match(/window\.__(INITIAL_STATE|INITIAL_DATA|NEXT_DATA)__\s*=\s*({[\s\S]*?});/);
       let windowData = null;
       if (windowDataMatch) {
         try {
-          windowData = JSON.parse(windowDataMatch[1]);
+          windowData = JSON.parse(windowDataMatch[2]);
         } catch {
           windowData = 'Found but unparseable';
         }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@monostate/node-scraper",
-  "version": "1.8.0",
+  "version": "2.0.0",
   "description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
   "type": "module",
   "main": "index.js",
@@ -14,11 +14,14 @@
   "files": [
     "index.js",
     "index.d.ts",
+    "browser-pool.js",
     "README.md",
+    "BULK_SCRAPING.md",
     "package.json",
     "scripts/"
   ],
   "scripts": {
+    "test": "node --test test/",
     "postinstall": "node scripts/install-lightpanda.js"
   },
   "keywords": [
@@ -45,11 +48,10 @@
   "author": "BNCA Team",
   "license": "MIT",
   "dependencies": {
-    "node-fetch": "^3.3.2",
-    "pdf-parse": "^1.1.1"
+    "pdf-parse": "^2.4.5"
   },
   "peerDependencies": {
-    "puppeteer": "^24.11.2"
+    "puppeteer": "^24.38.0"
   },
   "peerDependenciesMeta": {
     "puppeteer": {
@@ -57,7 +59,7 @@
     }
   },
   "engines": {
-    "node": ">=18.0.0"
+    "node": ">=20.0.0"
   },
   "repository": {
     "type": "git",

package/scripts/install-lightpanda.js CHANGED Viewed

@@ -6,17 +6,30 @@ import path from 'path';
 import { createWriteStream } from 'fs';
 import { execSync } from 'child_process';
-const LIGHTPANDA_VERSION = 'nightly';
+const LIGHTPANDA_VERSION = 'v0.2.5';
 const BINARY_DIR = path.join(path.dirname(path.dirname(new URL(import.meta.url).pathname)), 'bin');
 const BINARY_NAME = 'lightpanda';
 const BINARY_PATH = path.join(BINARY_DIR, BINARY_NAME);
-// Platform-specific download URLs (matching official Lightpanda instructions)
-const DOWNLOAD_URLS = {
-  'darwin': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-aarch64-macos`,
-  'linux': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-x86_64-linux`,
-  'wsl': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-x86_64-linux` // WSL uses Linux binary
-};
+function detectArch() {
+  const arch = process.arch;
+  if (arch === 'arm64' || arch === 'aarch64') return 'aarch64';
+  if (arch === 'x64' || arch === 'x86_64') return 'x86_64';
+  return arch;
+}
+// Platform-specific download URLs (matching official Lightpanda releases)
+function getDownloadUrls() {
+  const arch = detectArch();
+  const base = `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}`;
+  return {
+    'darwin': `${base}/lightpanda-${arch}-macos`,
+    'linux': `${base}/lightpanda-${arch}-linux`,
+    'wsl': `${base}/lightpanda-x86_64-linux`
+  };
+}
+const DOWNLOAD_URLS = getDownloadUrls();
 function detectPlatform() {
   const platform = process.platform;