@monostate/node-scraper 1.8.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/BULK_SCRAPING.md +626 -0
- package/README.md +106 -556
- package/browser-pool.js +229 -0
- package/index.js +46 -28
- package/package.json +7 -5
- package/scripts/install-lightpanda.js +20 -7
package/browser-pool.js
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
/**
 * Bounded pool of headless Puppeteer browsers.
 *
 * Callers lease a browser with `getBrowser()` and hand it back with
 * `releaseBrowser()`. At most `maxInstances` browsers exist (or are being
 * launched) at any time; extra requests wait in a FIFO queue. Browsers that
 * stay idle longer than `idleTimeout` milliseconds are closed by a periodic
 * cleanup pass.
 */
class BrowserPool {
  /**
   * @param {number} [maxInstances=3] Maximum number of browsers alive or launching.
   * @param {number} [idleTimeout=5000] Idle time (ms) after which a browser is closed;
   *   also the interval between cleanup passes.
   */
  constructor(maxInstances = 3, idleTimeout = 5000) {
    this.maxInstances = maxInstances;
    this.idleTimeout = idleTimeout;
    this.pool = [];
    this.busyBrowsers = new Set();
    this.cleanupTimer = null;
    this.requestQueue = [];
    // Launches that have started but not yet been added to `pool`. Counted
    // against `maxInstances` so concurrent callers cannot overshoot the cap
    // while `puppeteer.launch()` is still pending.
    this.pendingLaunches = 0;
    this.stats = {
      created: 0,
      reused: 0,
      queued: 0,
      cleaned: 0
    };
  }

  /**
   * Lease a browser: reuse an idle one, launch a new one while under the
   * limit, or wait in the queue until one becomes available.
   *
   * @returns {Promise<object>} The Puppeteer browser instance.
   * @throws {Error} If Puppeteer cannot be loaded or launched, or if the pool
   *   is closed while the request is still queued.
   */
  async getBrowser() {
    // Try to get an idle browser from pool
    const idle = this.findIdleBrowser();
    if (idle) {
      idle.lastUsed = Date.now();
      this.busyBrowsers.add(idle.instance);
      this.stats.reused++;
      return idle.instance;
    }

    // Create new browser if under limit (in-flight launches included)
    if (this.hasCapacity()) {
      return this.launchAndLease();
    }

    // Queue the request and wait for available browser
    this.stats.queued++;
    return this.queueRequest();
  }

  /**
   * @returns {object|undefined} The first pool entry that is not leased out.
   */
  findIdleBrowser() {
    return this.pool.find((entry) => !this.busyBrowsers.has(entry.instance));
  }

  /**
   * @returns {boolean} True when another browser may be launched without
   *   exceeding `maxInstances`.
   */
  hasCapacity() {
    return this.pool.length + this.pendingLaunches < this.maxInstances;
  }

  /**
   * Launch a browser, add it to the pool and mark it as leased.
   * The slot is reserved synchronously, before the first `await`.
   *
   * @returns {Promise<object>} The new Puppeteer browser instance.
   */
  async launchAndLease() {
    this.pendingLaunches++;
    try {
      const browser = await this.createBrowser();
      this.pool.push(browser);
      this.busyBrowsers.add(browser.instance);
      this.stats.created++;
      return browser.instance;
    } finally {
      this.pendingLaunches--;
    }
  }

  /**
   * Launch a headless browser and wrap it in a pool entry.
   *
   * @returns {Promise<{instance: object, created: number, lastUsed: number, pageCount: number}>}
   */
  async createBrowser() {
    const puppeteer = await this.getPuppeteer();
    // NOTE(review): '--no-sandbox' and '--disable-web-security' weaken browser
    // isolation; kept as-is because callers may depend on them.
    const instance = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-web-security',
        '--disable-features=VizDisplayCompositor',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-extensions',
        '--disable-default-apps',
        '--disable-sync',
        '--metrics-recording-only',
        '--mute-audio',
        '--no-first-run'
      ]
    });

    const browser = {
      instance,
      created: Date.now(),
      lastUsed: Date.now(),
      pageCount: 0
    };

    // Handle browser disconnect: forget the browser, then let waiting
    // requests claim the freed slot.
    instance.on('disconnected', () => {
      this.removeBrowser(browser);
      this.processQueue();
    });

    return browser;
  }

  /**
   * Lazily import Puppeteer (an optional peer dependency).
   *
   * @returns {Promise<object>} The Puppeteer module.
   * @throws {Error} If Puppeteer cannot be imported; the original failure is
   *   attached as `cause`.
   */
  async getPuppeteer() {
    try {
      const puppeteer = await import('puppeteer');
      return puppeteer.default || puppeteer;
    } catch (error) {
      throw new Error(
        'Puppeteer is not installed. Please install it to use Puppeteer-based scraping.',
        { cause: error }
      );
    }
  }

  /**
   * Park a request until a browser is available.
   *
   * @returns {Promise<object>} Resolves with a browser instance, or rejects
   *   if the pool is closed first.
   */
  async queueRequest() {
    return new Promise((resolve, reject) => {
      this.requestQueue.push({ resolve, reject, timestamp: Date.now() });
    });
  }

  /**
   * Serve queued requests, oldest first: hand out idle browsers, then launch
   * new ones while there is spare capacity. Launching matters after a browser
   * disconnects, when a slot is free but no idle browser exists and waiting
   * requests would otherwise never be served.
   */
  processQueue() {
    while (this.requestQueue.length > 0) {
      const available = this.findIdleBrowser();
      if (!available) break;

      const request = this.requestQueue.shift();
      available.lastUsed = Date.now();
      this.busyBrowsers.add(available.instance);
      request.resolve(available.instance);
    }

    while (this.requestQueue.length > 0 && this.hasCapacity()) {
      const request = this.requestQueue.shift();
      // launchAndLease() reserves its slot synchronously, so this loop ends.
      this.launchAndLease().then(request.resolve, request.reject);
    }
  }

  /**
   * Return a leased browser to the pool.
   *
   * @param {object} browser The instance previously returned by `getBrowser()`.
   */
  releaseBrowser(browser) {
    this.busyBrowsers.delete(browser);

    // Idle time is measured from release, not from acquisition; otherwise a
    // browser used for a long task could be closed the moment it is returned.
    const entry = this.pool.find((candidate) => candidate.instance === browser);
    if (entry) {
      entry.lastUsed = Date.now();
    }

    // Process any queued requests
    this.processQueue();

    // Start cleanup timer if not already running
    if (!this.cleanupTimer) {
      this.cleanupTimer = setTimeout(() => this.cleanup(), this.idleTimeout);
    }
  }

  /**
   * Drop a pool entry (and its busy mark) without closing the browser.
   *
   * @param {{instance: object}} browserObj Pool entry to remove.
   */
  removeBrowser(browserObj) {
    const index = this.pool.findIndex((entry) => entry.instance === browserObj.instance);
    if (index !== -1) {
      this.pool.splice(index, 1);
      this.busyBrowsers.delete(browserObj.instance);
    }
  }

  /**
   * Close a pool entry's browser if it is still connected.
   * Protocol/disconnection errors are ignored; other errors are logged.
   *
   * @param {{instance: object}} browser Pool entry to close.
   * @returns {Promise<boolean>} True if no error occurred.
   */
  async closeQuietly(browser) {
    try {
      // Check if browser is still connected
      if (browser.instance && browser.instance.isConnected()) {
        await browser.instance.close();
      }
      return true;
    } catch (error) {
      const message = String(error?.message ?? error);
      const expected = ['Protocol error', 'Target closed', 'Connection closed']
        .some((fragment) => message.includes(fragment));
      if (!expected) {
        console.warn('Error closing browser:', message);
      }
      return false;
    }
  }

  /**
   * Close browsers that have been idle longer than `idleTimeout`, then
   * re-arm the cleanup timer while any browser remains.
   *
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (this.cleanupTimer) {
      clearTimeout(this.cleanupTimer);
    }
    this.cleanupTimer = null;
    const now = Date.now();
    const toRemove = [];

    // Keep at least one browser if there are queued requests
    const minBrowsers = this.requestQueue.length > 0 ? 1 : 0;

    for (const browser of [...this.pool]) {
      if (this.pool.length <= minBrowsers) break;

      const isIdle = !this.busyBrowsers.has(browser.instance);
      const idleTime = now - browser.lastUsed;

      if (isIdle && idleTime > this.idleTimeout) {
        // Remove from the pool *before* awaiting close(), so a concurrent
        // getBrowser() can never be handed a browser that is shutting down.
        this.removeBrowser(browser);
        toRemove.push(browser);
      }
    }

    // Close idle browsers
    for (const browser of toRemove) {
      if (await this.closeQuietly(browser)) {
        this.stats.cleaned++;
      }
    }

    // Schedule next cleanup if there are still browsers (unless a release
    // during the awaits above already armed the timer)
    if (this.pool.length > 0 && !this.cleanupTimer) {
      this.cleanupTimer = setTimeout(() => this.cleanup(), this.idleTimeout);
    }
  }

  /**
   * Shut the pool down: stop the cleanup timer, reject every queued request
   * and close all browsers.
   *
   * @returns {Promise<void>}
   */
  async closeAll() {
    if (this.cleanupTimer) {
      clearTimeout(this.cleanupTimer);
      this.cleanupTimer = null;
    }

    // Clear the queue, settling each waiter so callers do not hang forever
    const abandoned = this.requestQueue;
    this.requestQueue = [];
    for (const request of abandoned) {
      request.reject(new Error('Browser pool was closed before a browser became available.'));
    }

    // Detach the current browsers before awaiting, so anything added to the
    // pool while they close is not silently discarded afterwards.
    const closing = this.pool;
    this.pool = [];
    this.busyBrowsers.clear();

    await Promise.all(closing.map((browser) => this.closeQuietly(browser)));
  }

  /**
   * @returns {{created: number, reused: number, queued: number, cleaned: number,
   *   poolSize: number, busyCount: number, idleCount: number, queueLength: number}}
   *   Lifetime counters plus a snapshot of the current pool state.
   */
  getStats() {
    return {
      ...this.stats,
      poolSize: this.pool.length,
      busyCount: this.busyBrowsers.size,
      idleCount: this.pool.length - this.busyBrowsers.size,
      queueLength: this.requestQueue.length
    };
  }
}
|
|
220
|
+
|
|
221
|
+
// Global browser pool instance
|
|
222
|
+
const browserPool = new BrowserPool(3, 5000);
|
|
223
|
+
|
|
224
|
+
// Graceful shutdown
// Registering a listener for SIGTERM/SIGINT disables Node's default
// "terminate the process" action. So after the browsers are closed we
// re-raise the signal — but only when no other listener remains, so an
// application that installs its own handlers stays in charge of exiting.
// `once` removes our listener before it runs, which lets the re-raised
// signal fall through to the default action.
for (const signal of ['SIGTERM', 'SIGINT']) {
  process.once(signal, () => {
    browserPool
      .closeAll()
      .catch((error) => {
        console.warn('Error closing browser pool:', String(error?.message ?? error));
      })
      .finally(() => {
        if (process.listenerCount(signal) === 0) {
          process.kill(process.pid, signal);
        }
      });
  });
}
// Best-effort close when the event loop drains; failures are logged rather
// than surfacing as unhandled rejections.
process.on('beforeExit', () => {
  browserPool.closeAll().catch((error) => {
    console.warn('Error closing browser pool:', String(error?.message ?? error));
  });
});
|
|
228
|
+
|
|
229
|
+
export default browserPool;
|
package/index.js
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
|
-
import fetch from 'node-fetch';
|
|
2
1
|
import { spawn, execSync } from 'child_process';
|
|
3
2
|
import fs from 'fs/promises';
|
|
4
3
|
import { existsSync, statSync } from 'fs';
|
|
5
4
|
import path from 'path';
|
|
6
5
|
import { fileURLToPath } from 'url';
|
|
7
6
|
import { promises as fsPromises } from 'fs';
|
|
8
|
-
import
|
|
7
|
+
import { PDFParse } from 'pdf-parse';
|
|
9
8
|
import browserPool from './browser-pool.js';
|
|
10
9
|
|
|
11
10
|
let puppeteer = null;
|
|
@@ -604,27 +603,41 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
604
603
|
}
|
|
605
604
|
|
|
606
605
|
return new Promise((resolve) => {
|
|
607
|
-
const
|
|
606
|
+
const format = config.lightpandaFormat || 'html';
|
|
607
|
+
const args = [
|
|
608
|
+
'fetch',
|
|
609
|
+
'--dump', format,
|
|
610
|
+
'--with_frames',
|
|
611
|
+
'--http_timeout', String(config.timeout),
|
|
612
|
+
url
|
|
613
|
+
];
|
|
608
614
|
const process = spawn(this.options.lightpandaPath, args, {
|
|
609
|
-
timeout: config.timeout +
|
|
615
|
+
timeout: config.timeout + 2000 // Buffer above http_timeout
|
|
610
616
|
});
|
|
611
|
-
|
|
617
|
+
|
|
612
618
|
let output = '';
|
|
613
619
|
let errorOutput = '';
|
|
614
|
-
|
|
620
|
+
|
|
615
621
|
process.stdout.on('data', (data) => {
|
|
616
622
|
output += data.toString();
|
|
617
623
|
});
|
|
618
|
-
|
|
624
|
+
|
|
619
625
|
process.stderr.on('data', (data) => {
|
|
620
626
|
errorOutput += data.toString();
|
|
621
627
|
});
|
|
622
|
-
|
|
628
|
+
|
|
623
629
|
process.on('close', (code) => {
|
|
624
630
|
if (code === 0 && output.length > 0) {
|
|
625
|
-
|
|
631
|
+
// Markdown output is already clean text, no HTML extraction needed
|
|
632
|
+
const content = format === 'markdown'
|
|
633
|
+
? JSON.stringify({
|
|
634
|
+
title: output.match(/^#\s+(.+)$/m)?.[1] || '',
|
|
635
|
+
content: output,
|
|
636
|
+
extractedAt: new Date().toISOString()
|
|
637
|
+
}, null, 2)
|
|
638
|
+
: this.extractContentFromHTML(output);
|
|
626
639
|
this.stats.lightpanda.successes++;
|
|
627
|
-
|
|
640
|
+
|
|
628
641
|
resolve({
|
|
629
642
|
success: true,
|
|
630
643
|
content,
|
|
@@ -642,7 +655,7 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
642
655
|
});
|
|
643
656
|
}
|
|
644
657
|
});
|
|
645
|
-
|
|
658
|
+
|
|
646
659
|
process.on('error', (error) => {
|
|
647
660
|
resolve({
|
|
648
661
|
success: false,
|
|
@@ -847,25 +860,30 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
847
860
|
};
|
|
848
861
|
}
|
|
849
862
|
|
|
850
|
-
// Parse PDF
|
|
851
|
-
const
|
|
852
|
-
|
|
863
|
+
// Parse PDF with pdf-parse v2 API
|
|
864
|
+
const parser = new PDFParse({ data: new Uint8Array(buffer) });
|
|
865
|
+
await parser.load();
|
|
866
|
+
const textResult = await parser.getText();
|
|
867
|
+
const infoResult = await parser.getInfo();
|
|
868
|
+
parser.destroy();
|
|
869
|
+
|
|
853
870
|
// Extract structured content
|
|
871
|
+
const pdfInfo = infoResult.info || {};
|
|
854
872
|
const content = {
|
|
855
|
-
title:
|
|
856
|
-
author:
|
|
857
|
-
subject:
|
|
858
|
-
keywords:
|
|
859
|
-
creator:
|
|
860
|
-
producer:
|
|
861
|
-
creationDate:
|
|
862
|
-
modificationDate:
|
|
863
|
-
pages:
|
|
864
|
-
text:
|
|
865
|
-
metadata:
|
|
873
|
+
title: pdfInfo.Title || infoResult.outline?.[0]?.title || 'Untitled PDF',
|
|
874
|
+
author: pdfInfo.Author || '',
|
|
875
|
+
subject: pdfInfo.Subject || '',
|
|
876
|
+
keywords: pdfInfo.Keywords || '',
|
|
877
|
+
creator: pdfInfo.Creator || '',
|
|
878
|
+
producer: pdfInfo.Producer || '',
|
|
879
|
+
creationDate: pdfInfo.CreationDate || '',
|
|
880
|
+
modificationDate: pdfInfo.ModDate || '',
|
|
881
|
+
pages: textResult.total || 0,
|
|
882
|
+
text: textResult.text || '',
|
|
883
|
+
metadata: infoResult.metadata || null,
|
|
866
884
|
url: url
|
|
867
885
|
};
|
|
868
|
-
|
|
886
|
+
|
|
869
887
|
this.stats.pdf.successes++;
|
|
870
888
|
|
|
871
889
|
return {
|
|
@@ -1008,11 +1026,11 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
1008
1026
|
});
|
|
1009
1027
|
|
|
1010
1028
|
// Extract window state data
|
|
1011
|
-
const windowDataMatch = html.match(/window\.__(
|
|
1029
|
+
const windowDataMatch = html.match(/window\.__(INITIAL_STATE|INITIAL_DATA|NEXT_DATA)__\s*=\s*({[\s\S]*?});/);
|
|
1012
1030
|
let windowData = null;
|
|
1013
1031
|
if (windowDataMatch) {
|
|
1014
1032
|
try {
|
|
1015
|
-
windowData = JSON.parse(windowDataMatch[
|
|
1033
|
+
windowData = JSON.parse(windowDataMatch[2]);
|
|
1016
1034
|
} catch {
|
|
1017
1035
|
windowData = 'Found but unparseable';
|
|
1018
1036
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@monostate/node-scraper",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "2.0.0",
|
|
4
4
|
"description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -14,11 +14,14 @@
|
|
|
14
14
|
"files": [
|
|
15
15
|
"index.js",
|
|
16
16
|
"index.d.ts",
|
|
17
|
+
"browser-pool.js",
|
|
17
18
|
"README.md",
|
|
19
|
+
"BULK_SCRAPING.md",
|
|
18
20
|
"package.json",
|
|
19
21
|
"scripts/"
|
|
20
22
|
],
|
|
21
23
|
"scripts": {
|
|
24
|
+
"test": "node --test test/",
|
|
22
25
|
"postinstall": "node scripts/install-lightpanda.js"
|
|
23
26
|
},
|
|
24
27
|
"keywords": [
|
|
@@ -45,11 +48,10 @@
|
|
|
45
48
|
"author": "BNCA Team",
|
|
46
49
|
"license": "MIT",
|
|
47
50
|
"dependencies": {
|
|
48
|
-
"
|
|
49
|
-
"pdf-parse": "^1.1.1"
|
|
51
|
+
"pdf-parse": "^2.4.5"
|
|
50
52
|
},
|
|
51
53
|
"peerDependencies": {
|
|
52
|
-
"puppeteer": "^24.
|
|
54
|
+
"puppeteer": "^24.38.0"
|
|
53
55
|
},
|
|
54
56
|
"peerDependenciesMeta": {
|
|
55
57
|
"puppeteer": {
|
|
@@ -57,7 +59,7 @@
|
|
|
57
59
|
}
|
|
58
60
|
},
|
|
59
61
|
"engines": {
|
|
60
|
-
"node": ">=
|
|
62
|
+
"node": ">=20.0.0"
|
|
61
63
|
},
|
|
62
64
|
"repository": {
|
|
63
65
|
"type": "git",
|
|
@@ -6,17 +6,30 @@ import path from 'path';
|
|
|
6
6
|
import { createWriteStream } from 'fs';
|
|
7
7
|
import { execSync } from 'child_process';
|
|
8
8
|
|
|
9
|
-
const LIGHTPANDA_VERSION = '
|
|
9
|
+
const LIGHTPANDA_VERSION = 'v0.2.5';
|
|
10
10
|
const BINARY_DIR = path.join(path.dirname(path.dirname(new URL(import.meta.url).pathname)), 'bin');
|
|
11
11
|
const BINARY_NAME = 'lightpanda';
|
|
12
12
|
const BINARY_PATH = path.join(BINARY_DIR, BINARY_NAME);
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
const
|
|
16
|
-
'
|
|
17
|
-
'
|
|
18
|
-
|
|
19
|
-
}
|
|
14
|
+
/**
 * Map the running process's CPU architecture to the architecture token
 * used when building Lightpanda download URLs.
 *
 * @returns {string} 'aarch64' or 'x86_64' for the recognised architectures;
 *   any other value of `process.arch` is returned unchanged.
 */
function detectArch() {
  switch (process.arch) {
    case 'arm64':
    case 'aarch64':
      return 'aarch64';
    case 'x64':
    case 'x86_64':
      return 'x86_64';
    default:
      return process.arch;
  }
}
|
|
20
|
+
|
|
21
|
+
// Platform-specific download URLs (matching official Lightpanda releases)
|
|
22
|
+
/**
 * Build the per-platform download URLs for the Lightpanda binary.
 *
 * @param {string} [arch] Architecture token used in the asset name
 *   (e.g. 'x86_64', 'aarch64'). Defaults to the detected architecture.
 * @param {string} [version] Release tag to download from. Defaults to
 *   LIGHTPANDA_VERSION.
 * @returns {{darwin: string, linux: string, wsl: string}} Download URL keyed
 *   by platform identifier.
 */
function getDownloadUrls(arch = detectArch(), version = LIGHTPANDA_VERSION) {
  const base = `https://github.com/lightpanda-io/browser/releases/download/${version}`;
  const linuxUrl = `${base}/lightpanda-${arch}-linux`;
  return {
    'darwin': `${base}/lightpanda-${arch}-macos`,
    'linux': linuxUrl,
    // WSL uses the Linux build, so it must follow the detected architecture
    // too; hard-coding x86_64 fetched the wrong binary on ARM64 machines.
    'wsl': linuxUrl
  };
}
|
|
31
|
+
|
|
32
|
+
const DOWNLOAD_URLS = getDownloadUrls();
|
|
20
33
|
|
|
21
34
|
function detectPlatform() {
|
|
22
35
|
const platform = process.platform;
|