npm - @monostate/node-scraper - Version diff - 1.8.1 → 2.1.0 - Mend

@monostate/node-scraper 1.8.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/README.md +106 -558
package/browser-pool.js +1 -1
package/browser-session.js +551 -0
package/index.d.ts +97 -0
package/index.js +50 -28
package/lightpanda-server.js +151 -0
package/package.json +10 -5
package/scripts/install-lightpanda.js +20 -7

package/index.js CHANGED

@@ -1,11 +1,10 @@
-import fetch from 'node-fetch';
 import { spawn, execSync } from 'child_process';
 import fs from 'fs/promises';
 import { existsSync, statSync } from 'fs';
 import path from 'path';
 import { fileURLToPath } from 'url';
 import { promises as fsPromises } from 'fs';
-import pdfParse from 'pdf-parse/lib/pdf-parse.js';
+import { PDFParse } from 'pdf-parse';
 import browserPool from './browser-pool.js';
 let puppeteer = null;
@@ -604,27 +603,41 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
     }
     return new Promise((resolve) => {
-      const args = ['fetch', '--dump', url];
+      const format = config.lightpandaFormat || 'html';
+      const args = [
+        'fetch',
+        '--dump', format,
+        '--with_frames',
+        '--http_timeout', String(config.timeout),
+        url
+      ];
       const process = spawn(this.options.lightpandaPath, args, {
-        timeout: config.timeout + 1000 // Add buffer for process timeout only
+        timeout: config.timeout + 2000 // Buffer above http_timeout
       });
       let output = '';
       let errorOutput = '';
       process.stdout.on('data', (data) => {
         output += data.toString();
       });
       process.stderr.on('data', (data) => {
         errorOutput += data.toString();
       });
       process.on('close', (code) => {
         if (code === 0 && output.length > 0) {
-          const content = this.extractContentFromHTML(output);
+          // Markdown output is already clean text, no HTML extraction needed
+          const content = format === 'markdown'
+            ? JSON.stringify({
+                title: output.match(/^#\s+(.+)$/m)?.[1] || '',
+                content: output,
+                extractedAt: new Date().toISOString()
+              }, null, 2)
+            : this.extractContentFromHTML(output);
           this.stats.lightpanda.successes++;
           resolve({
             success: true,
             content,
@@ -642,7 +655,7 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
           });
         }
       });
       process.on('error', (error) => {
         resolve({
           success: false,
@@ -847,25 +860,30 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
         };
       }
-      // Parse PDF
-      const pdfData = await pdfParse(buffer);
+      // Parse PDF with pdf-parse v2 API
+      const parser = new PDFParse({ data: new Uint8Array(buffer) });
+      await parser.load();
+      const textResult = await parser.getText();
+      const infoResult = await parser.getInfo();
+      parser.destroy();
       // Extract structured content
+      const pdfInfo = infoResult.info || {};
       const content = {
-        title: pdfData.info?.Title || 'Untitled PDF',
-        author: pdfData.info?.Author || '',
-        subject: pdfData.info?.Subject || '',
-        keywords: pdfData.info?.Keywords || '',
-        creator: pdfData.info?.Creator || '',
-        producer: pdfData.info?.Producer || '',
-        creationDate: pdfData.info?.CreationDate || '',
-        modificationDate: pdfData.info?.ModificationDate || '',
-        pages: pdfData.numpages || 0,
-        text: pdfData.text || '',
-        metadata: pdfData.metadata || null,
+        title: pdfInfo.Title || infoResult.outline?.[0]?.title || 'Untitled PDF',
+        author: pdfInfo.Author || '',
+        subject: pdfInfo.Subject || '',
+        keywords: pdfInfo.Keywords || '',
+        creator: pdfInfo.Creator || '',
+        producer: pdfInfo.Producer || '',
+        creationDate: pdfInfo.CreationDate || '',
+        modificationDate: pdfInfo.ModDate || '',
+        pages: textResult.total || 0,
+        text: textResult.text || '',
+        metadata: infoResult.metadata || null,
         url: url
       };
       this.stats.pdf.successes++;
       return {
@@ -1008,11 +1026,11 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
       });
       // Extract window state data
-      const windowDataMatch = html.match(/window\.__(?:INITIAL_STATE__|INITIAL_DATA__|NEXT_DATA__)__\s*=\s*({[\s\S]*?});/);
+      const windowDataMatch = html.match(/window\.__(INITIAL_STATE|INITIAL_DATA|NEXT_DATA)__\s*=\s*({[\s\S]*?});/);
       let windowData = null;
       if (windowDataMatch) {
         try {
-          windowData = JSON.parse(windowDataMatch[1]);
+          windowData = JSON.parse(windowDataMatch[2]);
         } catch {
           windowData = 'Found but unparseable';
         }
@@ -1777,4 +1795,8 @@ export async function bulkScrapeStream(urls, options = {}) {
   }
 }
+// Browser session exports
+export { BrowserSession, createSession } from './browser-session.js';
+export { default as LightPandaServer, getLightPandaServer, stopLightPandaServer } from './lightpanda-server.js';
 export default BNCASmartScraper;

package/lightpanda-server.js ADDED

@@ -0,0 +1,151 @@
+import { spawn } from 'child_process';
+import { createServer } from 'net';
+import path from 'path';
+import fs from 'fs';
+class LightPandaServer {
+  constructor(binaryPath) {
+    this.binaryPath = binaryPath || this._findBinary();
+    this.process = null;
+    this.host = '127.0.0.1';
+    this.port = null;
+    this.ready = false;
+  }
+  async start(port) {
+    if (this.process && this.ready) return this.getEndpoint();
+    this.port = port || await this._findAvailablePort();
+    return new Promise((resolve, reject) => {
+      const args = [
+        'serve',
+        '--host', this.host,
+        '--port', String(this.port),
+        '--cdp_max_connections', '16',
+      ];
+      this.process = spawn(this.binaryPath, args, {
+        stdio: ['ignore', 'pipe', 'pipe'],
+      });
+      let stderr = '';
+      const onReady = () => {
+        this.ready = true;
+        resolve(this.getEndpoint());
+      };
+      // LP prints to stderr when ready — wait for it or poll /json/version
+      this.process.stderr.on('data', (data) => {
+        stderr += data.toString();
+        // LightPanda logs server start to stderr
+        if (stderr.includes('Listening on') || stderr.includes('server started')) {
+          onReady();
+        }
+      });
+      this.process.on('error', (err) => {
+        this.ready = false;
+        reject(new Error(`Failed to start LightPanda: ${err.message}`));
+      });
+      this.process.on('exit', (code) => {
+        this.ready = false;
+        this.process = null;
+        if (!this.ready) {
+          reject(new Error(`LightPanda exited with code ${code}: ${stderr}`));
+        }
+      });
+      // Fallback: poll /json/version if no stderr signal within 3s
+      setTimeout(async () => {
+        if (this.ready) return;
+        try {
+          const res = await fetch(`http://${this.host}:${this.port}/json/version`);
+          if (res.ok) onReady();
+        } catch {
+          // Still starting up, give it more time
+        }
+      }, 1500);
+      // Hard timeout
+      setTimeout(() => {
+        if (!this.ready) {
+          this.stop();
+          reject(new Error(`LightPanda failed to start within 5s. stderr: ${stderr}`));
+        }
+      }, 5000);
+    });
+  }
+  getEndpoint() {
+    return `ws://${this.host}:${this.port}`;
+  }
+  isRunning() {
+    return this.ready && this.process !== null;
+  }
+  stop() {
+    if (this.process) {
+      try {
+        this.process.kill('SIGTERM');
+      } catch {
+        // already dead
+      }
+      this.process = null;
+    }
+    this.ready = false;
+    this.port = null;
+  }
+  async _findAvailablePort() {
+    return new Promise((resolve, reject) => {
+      const server = createServer();
+      server.listen(0, '127.0.0.1', () => {
+        const port = server.address().port;
+        server.close(() => resolve(port));
+      });
+      server.on('error', reject);
+    });
+  }
+  _findBinary() {
+    // Check common locations
+    const candidates = [
+      path.join(path.dirname(new URL(import.meta.url).pathname), 'bin', 'lightpanda'),
+      '/usr/local/bin/lightpanda',
+      '/usr/bin/lightpanda',
+    ];
+    for (const p of candidates) {
+      if (fs.existsSync(p)) return p;
+    }
+    return 'lightpanda'; // hope it's on PATH
+  }
+}
+// Singleton instance — shared across all sessions
+let _instance = null;
+export function getLightPandaServer(binaryPath) {
+  if (!_instance) {
+    _instance = new LightPandaServer(binaryPath);
+  }
+  return _instance;
+}
+export function stopLightPandaServer() {
+  if (_instance) {
+    _instance.stop();
+    _instance = null;
+  }
+}
+process.on('SIGTERM', stopLightPandaServer);
+process.on('SIGINT', stopLightPandaServer);
+process.on('beforeExit', stopLightPandaServer);
+export default LightPandaServer;

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@monostate/node-scraper",
-  "version": "1.8.1",
+  "version": "2.1.0",
   "description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
   "type": "module",
   "main": "index.js",
@@ -15,12 +15,15 @@
     "index.js",
     "index.d.ts",
     "browser-pool.js",
+    "browser-session.js",
+    "lightpanda-server.js",
     "README.md",
     "BULK_SCRAPING.md",
     "package.json",
     "scripts/"
   ],
   "scripts": {
+    "test": "node --test test/",
     "postinstall": "node scripts/install-lightpanda.js"
   },
   "keywords": [
@@ -33,6 +36,9 @@
     "data-extraction",
     "automation",
     "browser",
+    "browser-use",
+    "cdp",
+    "ai-agent",
     "ai-powered",
     "question-answering",
     "pdf-parsing",
@@ -47,11 +53,10 @@
   "author": "BNCA Team",
   "license": "MIT",
   "dependencies": {
-    "node-fetch": "^3.3.2",
-    "pdf-parse": "^1.1.1"
+    "pdf-parse": "^2.4.5"
   },
   "peerDependencies": {
-    "puppeteer": "^24.11.2"
+    "puppeteer": "^24.38.0"
   },
   "peerDependenciesMeta": {
     "puppeteer": {
@@ -59,7 +64,7 @@
     }
   },
   "engines": {
-    "node": ">=18.0.0"
+    "node": ">=20.0.0"
   },
   "repository": {
     "type": "git",

package/scripts/install-lightpanda.js CHANGED

@@ -6,17 +6,30 @@ import path from 'path';
 import { createWriteStream } from 'fs';
 import { execSync } from 'child_process';
-const LIGHTPANDA_VERSION = 'nightly';
+const LIGHTPANDA_VERSION = 'v0.2.5';
 const BINARY_DIR = path.join(path.dirname(path.dirname(new URL(import.meta.url).pathname)), 'bin');
 const BINARY_NAME = 'lightpanda';
 const BINARY_PATH = path.join(BINARY_DIR, BINARY_NAME);
-// Platform-specific download URLs (matching official Lightpanda instructions)
-const DOWNLOAD_URLS = {
-  'darwin': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-aarch64-macos`,
-  'linux': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-x86_64-linux`,
-  'wsl': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-x86_64-linux` // WSL uses Linux binary
-};
+function detectArch() {
+  const arch = process.arch;
+  if (arch === 'arm64' || arch === 'aarch64') return 'aarch64';
+  if (arch === 'x64' || arch === 'x86_64') return 'x86_64';
+  return arch;
+}
+// Platform-specific download URLs (matching official Lightpanda releases)
+function getDownloadUrls() {
+  const arch = detectArch();
+  const base = `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}`;
+  return {
+    'darwin': `${base}/lightpanda-${arch}-macos`,
+    'linux': `${base}/lightpanda-${arch}-linux`,
+    'wsl': `${base}/lightpanda-x86_64-linux`
+  };
+}
+const DOWNLOAD_URLS = getDownloadUrls();
 function detectPlatform() {
   const platform = process.platform;