@monostate/node-scraper 1.8.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,229 @@
1
class BrowserPool {
  /**
   * Bounded pool of reusable Puppeteer browser instances with a FIFO
   * waiting queue and an idle-timeout cleanup sweep.
   *
   * @param {number} maxInstances - Maximum concurrent browser processes.
   * @param {number} idleTimeout - Milliseconds a browser may sit idle before
   *   the cleanup sweep closes it.
   */
  constructor(maxInstances = 3, idleTimeout = 5000) {
    this.maxInstances = maxInstances;
    this.idleTimeout = idleTimeout;
    this.pool = [];                 // entries: { instance, created, lastUsed, pageCount }
    this.busyBrowsers = new Set();  // raw instances currently checked out
    this.cleanupTimer = null;       // handle for the pending idle sweep, or null
    this.requestQueue = [];         // FIFO of { resolve, reject, timestamp }
    this.pendingCreates = 0;        // launches in flight; counts toward capacity
    this.stats = {
      created: 0,
      reused: 0,
      queued: 0,
      cleaned: 0
    };
  }

  /**
   * Check out a browser. Reuses an idle pooled instance, launches a new one
   * while under capacity, otherwise queues until a browser is released.
   *
   * @returns {Promise<object>} a Puppeteer Browser instance. Rejects if a
   *   launch is required and Puppeteer is not installed.
   */
  async getBrowser() {
    // Prefer an idle browser already in the pool.
    const idle = this.pool.find((b) => !this.busyBrowsers.has(b.instance));
    if (idle) {
      idle.lastUsed = Date.now();
      this.busyBrowsers.add(idle.instance);
      this.stats.reused++;
      return idle.instance;
    }

    // Launch a new browser while under the instance limit. pendingCreates is
    // counted so concurrent getBrowser() calls cannot race past the limit
    // during the async launch (pool.push happens only after the await).
    if (this.pool.length + this.pendingCreates < this.maxInstances) {
      this.pendingCreates++;
      try {
        const browser = await this.createBrowser();
        this.pool.push(browser);
        this.busyBrowsers.add(browser.instance);
        this.stats.created++;
        return browser.instance;
      } finally {
        this.pendingCreates--;
      }
    }

    // At capacity: park the caller until releaseBrowser()/processQueue()
    // hands it an instance.
    this.stats.queued++;
    return this.queueRequest();
  }

  /**
   * Launch a new headless browser and wrap it in a pool entry.
   * @returns {Promise<object>} pool entry { instance, created, lastUsed, pageCount }
   */
  async createBrowser() {
    const puppeteer = await this.getPuppeteer();
    const instance = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-web-security',
        '--disable-features=VizDisplayCompositor',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-extensions',
        '--disable-default-apps',
        '--disable-sync',
        '--metrics-recording-only',
        '--mute-audio',
        '--no-first-run'
      ]
    });

    const browser = {
      instance,
      created: Date.now(),
      lastUsed: Date.now(),
      pageCount: 0
    };

    // If the browser process dies/disconnects, drop it from the pool and let
    // the queue refill capacity so waiters are not stranded.
    instance.on('disconnected', () => {
      this.removeBrowser(browser);
      this.processQueue();
    });

    return browser;
  }

  /**
   * Resolve the puppeteer module lazily (it is an optional peer dependency).
   * @throws {Error} when puppeteer is not installed.
   */
  async getPuppeteer() {
    try {
      const puppeteer = await import('puppeteer');
      return puppeteer.default || puppeteer;
    } catch (error) {
      throw new Error('Puppeteer is not installed. Please install it to use Puppeteer-based scraping.');
    }
  }

  /**
   * Park the caller until processQueue() hands it a browser (or fails it).
   * @returns {Promise<object>}
   */
  queueRequest() {
    return new Promise((resolve, reject) => {
      this.requestQueue.push({ resolve, reject, timestamp: Date.now() });
    });
  }

  /**
   * Hand the oldest queued request an idle browser. If none is idle but a
   * pool slot has freed up (e.g. a browser crashed or disconnected), launch a
   * replacement so queued callers do not starve forever.
   */
  processQueue() {
    if (this.requestQueue.length === 0) return;

    const available = this.pool.find((b) => !this.busyBrowsers.has(b.instance));
    if (available) {
      const request = this.requestQueue.shift();
      available.lastUsed = Date.now();
      this.busyBrowsers.add(available.instance);
      request.resolve(available.instance);
      return;
    }

    // No idle browser. Refill the pool if there is spare capacity; otherwise
    // the next releaseBrowser() call will drain the queue.
    if (this.pool.length + this.pendingCreates < this.maxInstances) {
      this.pendingCreates++;
      this.createBrowser()
        .then((browser) => {
          this.pendingCreates--;
          this.pool.push(browser);
          this.stats.created++;
          this.processQueue(); // the new browser is idle; hand it to a waiter
        })
        .catch((error) => {
          this.pendingCreates--;
          // Launch failed (e.g. puppeteer missing): fail the oldest waiter
          // instead of leaving its promise pending forever.
          this.requestQueue.shift()?.reject?.(error);
        });
    }
  }

  /**
   * Return a checked-out browser instance to the pool.
   * @param {object} browser - The raw instance returned by getBrowser().
   */
  releaseBrowser(browser) {
    this.busyBrowsers.delete(browser);

    // Restart the idle clock from the moment of release, not acquisition,
    // so a browser that just finished a long task is not reaped immediately.
    const entry = this.pool.find((b) => b.instance === browser);
    if (entry) entry.lastUsed = Date.now();

    // Wake up the oldest waiter, if any.
    this.processQueue();

    // Arm the idle-cleanup sweep if it isn't already scheduled.
    if (!this.cleanupTimer) {
      this.cleanupTimer = setTimeout(() => this.cleanup(), this.idleTimeout);
    }
  }

  /**
   * Remove a pool entry (by its wrapped instance) from bookkeeping.
   * Does not close the browser; callers close it themselves when needed.
   */
  removeBrowser(browserObj) {
    const index = this.pool.findIndex((b) => b.instance === browserObj.instance);
    if (index !== -1) {
      this.pool.splice(index, 1);
      this.busyBrowsers.delete(browserObj.instance);
    }
  }

  /**
   * Close browsers that have been idle longer than idleTimeout, then
   * reschedule itself while any browsers remain.
   */
  async cleanup() {
    this.cleanupTimer = null;
    const now = Date.now();
    const toRemove = [];

    // Keep at least one browser alive if requests are still queued.
    const minBrowsers = this.requestQueue.length > 0 ? 1 : 0;

    for (const browser of this.pool) {
      // Stop once removing more would drop below the minimum.
      if (this.pool.length - toRemove.length <= minBrowsers) break;

      const isIdle = !this.busyBrowsers.has(browser.instance);
      const idleTime = now - browser.lastUsed;

      if (isIdle && idleTime > this.idleTimeout) {
        toRemove.push(browser);
      }
    }

    for (const browser of toRemove) {
      try {
        // Only attempt close on a live connection.
        if (browser.instance && browser.instance.isConnected()) {
          await browser.instance.close();
        }
        this.removeBrowser(browser);
        this.stats.cleaned++;
      } catch (error) {
        // Disconnect-style errors are expected during teardown; warn on others.
        if (!error.message.includes('Protocol error') &&
            !error.message.includes('Target closed') &&
            !error.message.includes('Connection closed')) {
          console.warn('Error closing browser:', error.message);
        }
        // Drop the entry from bookkeeping even if close failed.
        this.removeBrowser(browser);
      }
    }

    // Schedule the next sweep while any browsers remain pooled.
    if (this.pool.length > 0) {
      this.cleanupTimer = setTimeout(() => this.cleanup(), this.idleTimeout);
    }
  }

  /**
   * Shut the pool down: cancel the sweep, fail queued waiters, and close
   * every pooled browser.
   */
  async closeAll() {
    if (this.cleanupTimer) {
      clearTimeout(this.cleanupTimer);
      this.cleanupTimer = null;
    }

    // Fail any still-queued waiters rather than leaving them pending forever.
    const abandoned = this.requestQueue;
    this.requestQueue = [];
    for (const waiter of abandoned) {
      waiter.reject?.(new Error('Browser pool is shutting down'));
    }

    const closePromises = this.pool.map(async (browser) => {
      try {
        if (browser.instance && browser.instance.isConnected()) {
          await browser.instance.close();
        }
      } catch (error) {
        // Disconnect-style errors are expected during teardown; warn on others.
        if (!error.message.includes('Protocol error') &&
            !error.message.includes('Target closed') &&
            !error.message.includes('Connection closed')) {
          console.warn('Error closing browser:', error.message);
        }
      }
    });

    await Promise.all(closePromises);
    this.pool = [];
    this.busyBrowsers.clear();
  }

  /**
   * Snapshot of pool counters for monitoring.
   * @returns {{created: number, reused: number, queued: number, cleaned: number,
   *   poolSize: number, busyCount: number, idleCount: number, queueLength: number}}
   */
  getStats() {
    return {
      ...this.stats,
      poolSize: this.pool.length,
      busyCount: this.busyBrowsers.size,
      idleCount: this.pool.length - this.busyBrowsers.size,
      queueLength: this.requestQueue.length
    };
  }
}
220
+
221
// Global browser pool instance shared by the scraper.
const browserPool = new BrowserPool(3, 5000);

// Graceful shutdown. Registering SIGINT/SIGTERM listeners suppresses Node's
// default terminate-on-signal behavior, so after cleanup we must re-raise the
// signal ourselves ('once' ensures the re-raise takes the default path).
const shutdown = async (signal) => {
  try {
    await browserPool.closeAll();
  } finally {
    process.kill(process.pid, signal);
  }
};
process.once('SIGTERM', () => { void shutdown('SIGTERM'); });
process.once('SIGINT', () => { void shutdown('SIGINT'); });
// beforeExit may fire more than once; closeAll() is idempotent on an empty pool.
process.on('beforeExit', () => { void browserPool.closeAll(); });

export default browserPool;
package/index.js CHANGED
@@ -1,11 +1,10 @@
1
- import fetch from 'node-fetch';
2
1
  import { spawn, execSync } from 'child_process';
3
2
  import fs from 'fs/promises';
4
3
  import { existsSync, statSync } from 'fs';
5
4
  import path from 'path';
6
5
  import { fileURLToPath } from 'url';
7
6
  import { promises as fsPromises } from 'fs';
8
- import pdfParse from 'pdf-parse/lib/pdf-parse.js';
7
+ import { PDFParse } from 'pdf-parse';
9
8
  import browserPool from './browser-pool.js';
10
9
 
11
10
  let puppeteer = null;
@@ -604,27 +603,41 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
604
603
  }
605
604
 
606
605
  return new Promise((resolve) => {
607
- const args = ['fetch', '--dump', url];
606
+ const format = config.lightpandaFormat || 'html';
607
+ const args = [
608
+ 'fetch',
609
+ '--dump', format,
610
+ '--with_frames',
611
+ '--http_timeout', String(config.timeout),
612
+ url
613
+ ];
608
614
  const process = spawn(this.options.lightpandaPath, args, {
609
- timeout: config.timeout + 1000 // Add buffer for process timeout only
615
+ timeout: config.timeout + 2000 // Buffer above http_timeout
610
616
  });
611
-
617
+
612
618
  let output = '';
613
619
  let errorOutput = '';
614
-
620
+
615
621
  process.stdout.on('data', (data) => {
616
622
  output += data.toString();
617
623
  });
618
-
624
+
619
625
  process.stderr.on('data', (data) => {
620
626
  errorOutput += data.toString();
621
627
  });
622
-
628
+
623
629
  process.on('close', (code) => {
624
630
  if (code === 0 && output.length > 0) {
625
- const content = this.extractContentFromHTML(output);
631
+ // Markdown output is already clean text, no HTML extraction needed
632
+ const content = format === 'markdown'
633
+ ? JSON.stringify({
634
+ title: output.match(/^#\s+(.+)$/m)?.[1] || '',
635
+ content: output,
636
+ extractedAt: new Date().toISOString()
637
+ }, null, 2)
638
+ : this.extractContentFromHTML(output);
626
639
  this.stats.lightpanda.successes++;
627
-
640
+
628
641
  resolve({
629
642
  success: true,
630
643
  content,
@@ -642,7 +655,7 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
642
655
  });
643
656
  }
644
657
  });
645
-
658
+
646
659
  process.on('error', (error) => {
647
660
  resolve({
648
661
  success: false,
@@ -847,25 +860,30 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
847
860
  };
848
861
  }
849
862
 
850
- // Parse PDF
851
- const pdfData = await pdfParse(buffer);
852
-
863
+ // Parse PDF with pdf-parse v2 API
864
+ const parser = new PDFParse({ data: new Uint8Array(buffer) });
865
+ await parser.load();
866
+ const textResult = await parser.getText();
867
+ const infoResult = await parser.getInfo();
868
+ parser.destroy();
869
+
853
870
  // Extract structured content
871
+ const pdfInfo = infoResult.info || {};
854
872
  const content = {
855
- title: pdfData.info?.Title || 'Untitled PDF',
856
- author: pdfData.info?.Author || '',
857
- subject: pdfData.info?.Subject || '',
858
- keywords: pdfData.info?.Keywords || '',
859
- creator: pdfData.info?.Creator || '',
860
- producer: pdfData.info?.Producer || '',
861
- creationDate: pdfData.info?.CreationDate || '',
862
- modificationDate: pdfData.info?.ModificationDate || '',
863
- pages: pdfData.numpages || 0,
864
- text: pdfData.text || '',
865
- metadata: pdfData.metadata || null,
873
+ title: pdfInfo.Title || infoResult.outline?.[0]?.title || 'Untitled PDF',
874
+ author: pdfInfo.Author || '',
875
+ subject: pdfInfo.Subject || '',
876
+ keywords: pdfInfo.Keywords || '',
877
+ creator: pdfInfo.Creator || '',
878
+ producer: pdfInfo.Producer || '',
879
+ creationDate: pdfInfo.CreationDate || '',
880
+ modificationDate: pdfInfo.ModDate || '',
881
+ pages: textResult.total || 0,
882
+ text: textResult.text || '',
883
+ metadata: infoResult.metadata || null,
866
884
  url: url
867
885
  };
868
-
886
+
869
887
  this.stats.pdf.successes++;
870
888
 
871
889
  return {
@@ -1008,11 +1026,11 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
1008
1026
  });
1009
1027
 
1010
1028
  // Extract window state data
1011
- const windowDataMatch = html.match(/window\.__(?:INITIAL_STATE__|INITIAL_DATA__|NEXT_DATA__)__\s*=\s*({[\s\S]*?});/);
1029
+ const windowDataMatch = html.match(/window\.__(INITIAL_STATE|INITIAL_DATA|NEXT_DATA)__\s*=\s*({[\s\S]*?});/);
1012
1030
  let windowData = null;
1013
1031
  if (windowDataMatch) {
1014
1032
  try {
1015
- windowData = JSON.parse(windowDataMatch[1]);
1033
+ windowData = JSON.parse(windowDataMatch[2]);
1016
1034
  } catch {
1017
1035
  windowData = 'Found but unparseable';
1018
1036
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@monostate/node-scraper",
3
- "version": "1.8.0",
3
+ "version": "2.0.0",
4
4
  "description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -14,11 +14,14 @@
14
14
  "files": [
15
15
  "index.js",
16
16
  "index.d.ts",
17
+ "browser-pool.js",
17
18
  "README.md",
19
+ "BULK_SCRAPING.md",
18
20
  "package.json",
19
21
  "scripts/"
20
22
  ],
21
23
  "scripts": {
24
+ "test": "node --test test/",
22
25
  "postinstall": "node scripts/install-lightpanda.js"
23
26
  },
24
27
  "keywords": [
@@ -45,11 +48,10 @@
45
48
  "author": "BNCA Team",
46
49
  "license": "MIT",
47
50
  "dependencies": {
48
- "node-fetch": "^3.3.2",
49
- "pdf-parse": "^1.1.1"
51
+ "pdf-parse": "^2.4.5"
50
52
  },
51
53
  "peerDependencies": {
52
- "puppeteer": "^24.11.2"
54
+ "puppeteer": "^24.38.0"
53
55
  },
54
56
  "peerDependenciesMeta": {
55
57
  "puppeteer": {
@@ -57,7 +59,7 @@
57
59
  }
58
60
  },
59
61
  "engines": {
60
- "node": ">=18.0.0"
62
+ "node": ">=20.0.0"
61
63
  },
62
64
  "repository": {
63
65
  "type": "git",
@@ -6,17 +6,30 @@ import path from 'path';
6
6
  import { createWriteStream } from 'fs';
7
7
  import { execSync } from 'child_process';
8
8
 
9
- const LIGHTPANDA_VERSION = 'nightly';
9
+ const LIGHTPANDA_VERSION = 'v0.2.5';
10
10
  const BINARY_DIR = path.join(path.dirname(path.dirname(new URL(import.meta.url).pathname)), 'bin');
11
11
  const BINARY_NAME = 'lightpanda';
12
12
  const BINARY_PATH = path.join(BINARY_DIR, BINARY_NAME);
13
13
 
14
- // Platform-specific download URLs (matching official Lightpanda instructions)
15
- const DOWNLOAD_URLS = {
16
- 'darwin': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-aarch64-macos`,
17
- 'linux': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-x86_64-linux`,
18
- 'wsl': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-x86_64-linux` // WSL uses Linux binary
19
- };
14
+ function detectArch() {
15
+ const arch = process.arch;
16
+ if (arch === 'arm64' || arch === 'aarch64') return 'aarch64';
17
+ if (arch === 'x64' || arch === 'x86_64') return 'x86_64';
18
+ return arch;
19
+ }
20
+
21
+ // Platform-specific download URLs (matching official Lightpanda releases)
22
+ function getDownloadUrls() {
23
+ const arch = detectArch();
24
+ const base = `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}`;
25
+ return {
26
+ 'darwin': `${base}/lightpanda-${arch}-macos`,
27
+ 'linux': `${base}/lightpanda-${arch}-linux`,
28
+ 'wsl': `${base}/lightpanda-x86_64-linux`
29
+ };
30
+ }
31
+
32
+ const DOWNLOAD_URLS = getDownloadUrls();
20
33
 
21
34
  function detectPlatform() {
22
35
  const platform = process.platform;