@monostate/node-scraper 1.8.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +106 -558
- package/browser-pool.js +1 -1
- package/browser-session.js +551 -0
- package/index.d.ts +97 -0
- package/index.js +50 -28
- package/lightpanda-server.js +151 -0
- package/package.json +10 -5
- package/scripts/install-lightpanda.js +20 -7
package/index.js
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
|
-
import fetch from 'node-fetch';
|
|
2
1
|
import { spawn, execSync } from 'child_process';
|
|
3
2
|
import fs from 'fs/promises';
|
|
4
3
|
import { existsSync, statSync } from 'fs';
|
|
5
4
|
import path from 'path';
|
|
6
5
|
import { fileURLToPath } from 'url';
|
|
7
6
|
import { promises as fsPromises } from 'fs';
|
|
8
|
-
import
|
|
7
|
+
import { PDFParse } from 'pdf-parse';
|
|
9
8
|
import browserPool from './browser-pool.js';
|
|
10
9
|
|
|
11
10
|
let puppeteer = null;
|
|
@@ -604,27 +603,41 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
604
603
|
}
|
|
605
604
|
|
|
606
605
|
return new Promise((resolve) => {
|
|
607
|
-
const
|
|
606
|
+
const format = config.lightpandaFormat || 'html';
|
|
607
|
+
const args = [
|
|
608
|
+
'fetch',
|
|
609
|
+
'--dump', format,
|
|
610
|
+
'--with_frames',
|
|
611
|
+
'--http_timeout', String(config.timeout),
|
|
612
|
+
url
|
|
613
|
+
];
|
|
608
614
|
const process = spawn(this.options.lightpandaPath, args, {
|
|
609
|
-
timeout: config.timeout +
|
|
615
|
+
timeout: config.timeout + 2000 // Buffer above http_timeout
|
|
610
616
|
});
|
|
611
|
-
|
|
617
|
+
|
|
612
618
|
let output = '';
|
|
613
619
|
let errorOutput = '';
|
|
614
|
-
|
|
620
|
+
|
|
615
621
|
process.stdout.on('data', (data) => {
|
|
616
622
|
output += data.toString();
|
|
617
623
|
});
|
|
618
|
-
|
|
624
|
+
|
|
619
625
|
process.stderr.on('data', (data) => {
|
|
620
626
|
errorOutput += data.toString();
|
|
621
627
|
});
|
|
622
|
-
|
|
628
|
+
|
|
623
629
|
process.on('close', (code) => {
|
|
624
630
|
if (code === 0 && output.length > 0) {
|
|
625
|
-
|
|
631
|
+
// Markdown output is already clean text, no HTML extraction needed
|
|
632
|
+
const content = format === 'markdown'
|
|
633
|
+
? JSON.stringify({
|
|
634
|
+
title: output.match(/^#\s+(.+)$/m)?.[1] || '',
|
|
635
|
+
content: output,
|
|
636
|
+
extractedAt: new Date().toISOString()
|
|
637
|
+
}, null, 2)
|
|
638
|
+
: this.extractContentFromHTML(output);
|
|
626
639
|
this.stats.lightpanda.successes++;
|
|
627
|
-
|
|
640
|
+
|
|
628
641
|
resolve({
|
|
629
642
|
success: true,
|
|
630
643
|
content,
|
|
@@ -642,7 +655,7 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
642
655
|
});
|
|
643
656
|
}
|
|
644
657
|
});
|
|
645
|
-
|
|
658
|
+
|
|
646
659
|
process.on('error', (error) => {
|
|
647
660
|
resolve({
|
|
648
661
|
success: false,
|
|
@@ -847,25 +860,30 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
847
860
|
};
|
|
848
861
|
}
|
|
849
862
|
|
|
850
|
-
// Parse PDF
|
|
851
|
-
const
|
|
852
|
-
|
|
863
|
+
// Parse PDF with pdf-parse v2 API
|
|
864
|
+
const parser = new PDFParse({ data: new Uint8Array(buffer) });
|
|
865
|
+
await parser.load();
|
|
866
|
+
const textResult = await parser.getText();
|
|
867
|
+
const infoResult = await parser.getInfo();
|
|
868
|
+
parser.destroy();
|
|
869
|
+
|
|
853
870
|
// Extract structured content
|
|
871
|
+
const pdfInfo = infoResult.info || {};
|
|
854
872
|
const content = {
|
|
855
|
-
title:
|
|
856
|
-
author:
|
|
857
|
-
subject:
|
|
858
|
-
keywords:
|
|
859
|
-
creator:
|
|
860
|
-
producer:
|
|
861
|
-
creationDate:
|
|
862
|
-
modificationDate:
|
|
863
|
-
pages:
|
|
864
|
-
text:
|
|
865
|
-
metadata:
|
|
873
|
+
title: pdfInfo.Title || infoResult.outline?.[0]?.title || 'Untitled PDF',
|
|
874
|
+
author: pdfInfo.Author || '',
|
|
875
|
+
subject: pdfInfo.Subject || '',
|
|
876
|
+
keywords: pdfInfo.Keywords || '',
|
|
877
|
+
creator: pdfInfo.Creator || '',
|
|
878
|
+
producer: pdfInfo.Producer || '',
|
|
879
|
+
creationDate: pdfInfo.CreationDate || '',
|
|
880
|
+
modificationDate: pdfInfo.ModDate || '',
|
|
881
|
+
pages: textResult.total || 0,
|
|
882
|
+
text: textResult.text || '',
|
|
883
|
+
metadata: infoResult.metadata || null,
|
|
866
884
|
url: url
|
|
867
885
|
};
|
|
868
|
-
|
|
886
|
+
|
|
869
887
|
this.stats.pdf.successes++;
|
|
870
888
|
|
|
871
889
|
return {
|
|
@@ -1008,11 +1026,11 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
|
|
|
1008
1026
|
});
|
|
1009
1027
|
|
|
1010
1028
|
// Extract window state data
|
|
1011
|
-
const windowDataMatch = html.match(/window\.__(
|
|
1029
|
+
const windowDataMatch = html.match(/window\.__(INITIAL_STATE|INITIAL_DATA|NEXT_DATA)__\s*=\s*({[\s\S]*?});/);
|
|
1012
1030
|
let windowData = null;
|
|
1013
1031
|
if (windowDataMatch) {
|
|
1014
1032
|
try {
|
|
1015
|
-
windowData = JSON.parse(windowDataMatch[
|
|
1033
|
+
windowData = JSON.parse(windowDataMatch[2]);
|
|
1016
1034
|
} catch {
|
|
1017
1035
|
windowData = 'Found but unparseable';
|
|
1018
1036
|
}
|
|
@@ -1777,4 +1795,8 @@ export async function bulkScrapeStream(urls, options = {}) {
|
|
|
1777
1795
|
}
|
|
1778
1796
|
}
|
|
1779
1797
|
|
|
1798
|
+
// Browser session exports
|
|
1799
|
+
export { BrowserSession, createSession } from './browser-session.js';
|
|
1800
|
+
export { default as LightPandaServer, getLightPandaServer, stopLightPandaServer } from './lightpanda-server.js';
|
|
1801
|
+
|
|
1780
1802
|
export default BNCASmartScraper;
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import { spawn } from 'child_process';
|
|
2
|
+
import { createServer } from 'net';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import fs from 'fs';
|
|
5
|
+
|
|
6
|
+
/**
 * Supervises one LightPanda `serve` child process and exposes its WebSocket
 * endpoint (`ws://host:port`).
 *
 * State visible to callers:
 *  - `binaryPath`: executable that will be spawned
 *  - `process`:    the ChildProcess while one is alive, otherwise `null`
 *  - `host`/`port`: address passed to `--host` / `--port`
 *  - `ready`:      `true` once startup was detected, `false` otherwise
 */
class LightPandaServer {
  /**
   * @param {string} [binaryPath] - Path to the LightPanda executable. When
   *   omitted (or falsy) the result of `_findBinary()` is used.
   */
  constructor(binaryPath) {
    this.binaryPath = binaryPath || this._findBinary();
    this.process = null;
    this.host = '127.0.0.1';
    this.port = null;
    this.ready = false;
    // In-flight start() promise, shared so overlapping start() calls do not
    // spawn more than one child.
    this._starting = null;
  }

  /**
   * Start the server if it is not already running.
   *
   * @param {number} [port] - Port to listen on. When omitted (or falsy) a free
   *   port is picked via `_findAvailablePort()`.
   * @returns {Promise<string>} Resolves with the `ws://` endpoint once the
   *   server is considered ready.
   * @throws {Error} Rejects when the binary cannot be spawned, exits before
   *   becoming ready, or is not ready within 5 seconds.
   */
  async start(port) {
    if (this.process && this.ready) return this.getEndpoint();
    if (this._starting) return this._starting;

    const attempt = this._launch(port).finally(() => {
      if (this._starting === attempt) this._starting = null;
    });
    this._starting = attempt;
    return attempt;
  }

  /**
   * Spawn the child process and wait for it to become ready.
   * Readiness is detected from stderr output or, as a fallback, by polling
   * the HTTP `/json/version` route.
   *
   * @param {number} [port]
   * @returns {Promise<string>} The `ws://` endpoint.
   */
  async _launch(port) {
    this.port = port || await this._findAvailablePort();

    return new Promise((resolve, reject) => {
      const FIRST_POLL_DELAY_MS = 1500;
      const POLL_INTERVAL_MS = 500;
      const HARD_TIMEOUT_MS = 5000;

      // Captured up front: stop() resets this.port to null.
      const { host } = this;
      const listenPort = this.port;

      const args = [
        'serve',
        '--host', host,
        '--port', String(listenPort),
        '--cdp_max_connections', '16',
      ];

      const child = spawn(this.binaryPath, args, {
        stdio: ['ignore', 'pipe', 'pipe'],
      });
      this.process = child;

      let stderr = '';
      let settled = false;
      let pollTimer = null;
      let hardTimer = null;

      // Settle the promise exactly once and release both timers so they do
      // not keep the event loop alive after startup has been decided.
      const settle = (finish) => {
        if (settled) return;
        settled = true;
        clearTimeout(pollTimer);
        clearTimeout(hardTimer);
        finish();
      };

      const onReady = () => {
        // Ignore readiness signals for a child that was stopped or replaced
        // while it was still starting; its exit will reject this attempt.
        if (this.process !== child) return;
        settle(() => {
          this.ready = true;
          resolve(this.getEndpoint());
        });
      };

      const fail = (error) => settle(() => reject(error));

      // LightPanda logs server start to stderr.
      child.stderr.on('data', (chunk) => {
        // Keep draining the pipe after startup, but stop buffering: the text
        // is only needed for startup error messages.
        if (settled) return;
        stderr += chunk.toString();
        if (stderr.includes('Listening on') || stderr.includes('server started')) {
          onReady();
        }
      });

      // stdout is piped but unused; drain it so a chatty child can never
      // block on a full pipe.
      child.stdout.resume();

      child.on('error', (err) => {
        if (this.process === child) {
          this.ready = false;
          // No pid means the spawn itself failed, so there is nothing to keep.
          if (child.pid === undefined) this.process = null;
        }
        fail(new Error(`Failed to start LightPanda: ${err.message}`, { cause: err }));
      });

      child.on('exit', (code) => {
        // Only touch shared state if this child is still the current one;
        // a stale exit must not clobber a newer process.
        if (this.process === child) {
          this.ready = false;
          this.process = null;
        }
        // No-op when startup already succeeded (promise is settled).
        fail(new Error(`LightPanda exited with code ${code}: ${stderr}`));
      });

      // Fallback: if no stderr signal arrived, poll /json/version until the
      // server answers or the hard timeout fires.
      const poll = async () => {
        if (settled) return;
        try {
          const res = await fetch(`http://${host}:${listenPort}/json/version`, {
            signal: AbortSignal.timeout(1000),
          });
          if (res.ok) onReady();
          await res.body?.cancel();
        } catch {
          // Still starting up, give it more time
        }
        if (!settled) pollTimer = setTimeout(poll, POLL_INTERVAL_MS);
      };
      pollTimer = setTimeout(poll, FIRST_POLL_DELAY_MS);

      // Hard timeout
      hardTimer = setTimeout(() => {
        if (settled) return;
        if (this.process === child) {
          this.stop();
        } else {
          child.kill('SIGTERM');
        }
        fail(new Error(`LightPanda failed to start within 5s. stderr: ${stderr}`));
      }, HARD_TIMEOUT_MS);
    });
  }

  /**
   * @returns {string} `ws://<host>:<port>` built from the current host/port.
   */
  getEndpoint() {
    return `ws://${this.host}:${this.port}`;
  }

  /**
   * @returns {boolean} `true` when a child process exists and was marked ready.
   */
  isRunning() {
    return this.ready && this.process !== null;
  }

  /**
   * Send SIGTERM to the child (if any) and reset `process`, `ready` and `port`.
   * Safe to call when nothing is running.
   */
  stop() {
    if (this.process) {
      try {
        this.process.kill('SIGTERM');
      } catch {
        // already dead
      }
      this.process = null;
    }
    this.ready = false;
    this.port = null;
  }

  /**
   * Ask the OS for a free port by briefly binding a throwaway server to
   * port 0 on `this.host`.
   * The port is released before it is returned, so another process could
   * claim it before LightPanda binds it.
   *
   * @returns {Promise<number>}
   */
  async _findAvailablePort() {
    return new Promise((resolve, reject) => {
      const probe = createServer();
      probe.once('error', reject);
      probe.listen(0, this.host, () => {
        const { port } = probe.address();
        probe.close(() => resolve(port));
      });
    });
  }

  /**
   * Probe common install locations for the LightPanda binary.
   *
   * @returns {string} The first existing candidate, or the bare command name
   *   `lightpanda` so the OS resolves it through PATH.
   */
  _findBinary() {
    // URL pathnames are percent-encoded; decode so directories containing
    // spaces or non-ASCII characters are checked under their real names.
    const moduleDir = path.dirname(decodeURIComponent(new URL(import.meta.url).pathname));

    // Check common locations
    const candidates = [
      path.join(moduleDir, 'bin', 'lightpanda'),
      '/usr/local/bin/lightpanda',
      '/usr/bin/lightpanda',
    ];

    const found = candidates.find((candidate) => fs.existsSync(candidate));
    return found ?? 'lightpanda'; // hope it's on PATH
  }
}
|
|
129
|
+
|
|
130
|
+
// Singleton instance — shared across all sessions
|
|
131
|
+
let _instance = null;
|
|
132
|
+
|
|
133
|
+
/**
 * Return the shared LightPandaServer, creating it on first use.
 *
 * @param {string} [binaryPath] - Forwarded to the constructor only when the
 *   shared instance does not exist yet; ignored on later calls.
 * @returns {LightPandaServer}
 */
export function getLightPandaServer(binaryPath) {
  _instance ??= new LightPandaServer(binaryPath);
  return _instance;
}
|
|
139
|
+
|
|
140
|
+
/**
 * Stop the shared LightPandaServer, if one was created, and forget it so the
 * next getLightPandaServer() call builds a new instance.
 */
export function stopLightPandaServer() {
  if (!_instance) return;

  _instance.stop();
  _instance = null;
}
|
|
146
|
+
|
|
147
|
+
/**
 * Stop the shared server when the host process receives a termination signal.
 *
 * Installing a SIGINT/SIGTERM listener disables Node's default behavior of
 * exiting on that signal. When this module's listener is the only one, it is
 * removed and the signal is re-sent so the process still terminates. When the
 * host application has its own listeners, the decision is left to them.
 *
 * @param {string} signal - Signal name passed by Node to the listener.
 */
function handleTerminationSignal(signal) {
  stopLightPandaServer();
  if (process.listenerCount(signal) === 1) {
    process.removeListener(signal, handleTerminationSignal);
    process.kill(process.pid, signal);
  }
}

process.on('SIGTERM', handleTerminationSignal);
process.on('SIGINT', handleTerminationSignal);
process.on('beforeExit', stopLightPandaServer);
// 'beforeExit' is not emitted on explicit process.exit(); 'exit' covers that
// path so the child is not left orphaned. stop() is synchronous.
process.on('exit', stopLightPandaServer);
|
|
150
|
+
|
|
151
|
+
export default LightPandaServer;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@monostate/node-scraper",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "2.1.0",
|
|
4
4
|
"description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -15,12 +15,15 @@
|
|
|
15
15
|
"index.js",
|
|
16
16
|
"index.d.ts",
|
|
17
17
|
"browser-pool.js",
|
|
18
|
+
"browser-session.js",
|
|
19
|
+
"lightpanda-server.js",
|
|
18
20
|
"README.md",
|
|
19
21
|
"BULK_SCRAPING.md",
|
|
20
22
|
"package.json",
|
|
21
23
|
"scripts/"
|
|
22
24
|
],
|
|
23
25
|
"scripts": {
|
|
26
|
+
"test": "node --test test/",
|
|
24
27
|
"postinstall": "node scripts/install-lightpanda.js"
|
|
25
28
|
},
|
|
26
29
|
"keywords": [
|
|
@@ -33,6 +36,9 @@
|
|
|
33
36
|
"data-extraction",
|
|
34
37
|
"automation",
|
|
35
38
|
"browser",
|
|
39
|
+
"browser-use",
|
|
40
|
+
"cdp",
|
|
41
|
+
"ai-agent",
|
|
36
42
|
"ai-powered",
|
|
37
43
|
"question-answering",
|
|
38
44
|
"pdf-parsing",
|
|
@@ -47,11 +53,10 @@
|
|
|
47
53
|
"author": "BNCA Team",
|
|
48
54
|
"license": "MIT",
|
|
49
55
|
"dependencies": {
|
|
50
|
-
"
|
|
51
|
-
"pdf-parse": "^1.1.1"
|
|
56
|
+
"pdf-parse": "^2.4.5"
|
|
52
57
|
},
|
|
53
58
|
"peerDependencies": {
|
|
54
|
-
"puppeteer": "^24.
|
|
59
|
+
"puppeteer": "^24.38.0"
|
|
55
60
|
},
|
|
56
61
|
"peerDependenciesMeta": {
|
|
57
62
|
"puppeteer": {
|
|
@@ -59,7 +64,7 @@
|
|
|
59
64
|
}
|
|
60
65
|
},
|
|
61
66
|
"engines": {
|
|
62
|
-
"node": ">=
|
|
67
|
+
"node": ">=20.0.0"
|
|
63
68
|
},
|
|
64
69
|
"repository": {
|
|
65
70
|
"type": "git",
|
|
@@ -6,17 +6,30 @@ import path from 'path';
|
|
|
6
6
|
import { createWriteStream } from 'fs';
|
|
7
7
|
import { execSync } from 'child_process';
|
|
8
8
|
|
|
9
|
-
const LIGHTPANDA_VERSION = '
|
|
9
|
+
const LIGHTPANDA_VERSION = 'v0.2.5';
|
|
10
10
|
const BINARY_DIR = path.join(path.dirname(path.dirname(new URL(import.meta.url).pathname)), 'bin');
|
|
11
11
|
const BINARY_NAME = 'lightpanda';
|
|
12
12
|
const BINARY_PATH = path.join(BINARY_DIR, BINARY_NAME);
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
const
|
|
16
|
-
'
|
|
17
|
-
'
|
|
18
|
-
|
|
19
|
-
}
|
|
14
|
+
/**
 * Translate Node's `process.arch` into the architecture token used in the
 * download file names.
 *
 * @returns {string} `'aarch64'` for arm64/aarch64, `'x86_64'` for x64/x86_64,
 *   otherwise the unmodified `process.arch` value.
 */
function detectArch() {
  switch (process.arch) {
    case 'arm64':
    case 'aarch64':
      return 'aarch64';
    case 'x64':
    case 'x86_64':
      return 'x86_64';
    default:
      return process.arch;
  }
}
|
|
20
|
+
|
|
21
|
+
// Platform-specific download URLs (matching official Lightpanda releases)
/**
 * Build the release-asset URL for each supported platform key, for the pinned
 * LIGHTPANDA_VERSION and the architecture reported by detectArch().
 *
 * @returns {{darwin: string, linux: string, wsl: string}}
 */
function getDownloadUrls() {
  const arch = detectArch();
  const base = `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}`;
  const linuxUrl = `${base}/lightpanda-${arch}-linux`;
  return {
    'darwin': `${base}/lightpanda-${arch}-macos`,
    'linux': linuxUrl,
    // WSL runs Linux binaries for the host CPU, so it uses the same asset as
    // 'linux' instead of a hard-coded x86_64 build (which fails on ARM64).
    'wsl': linuxUrl
  };
}
|
|
31
|
+
|
|
32
|
+
const DOWNLOAD_URLS = getDownloadUrls();
|
|
20
33
|
|
|
21
34
|
function detectPlatform() {
|
|
22
35
|
const platform = process.platform;
|