@monostate/node-scraper 1.8.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -1,11 +1,10 @@
1
- import fetch from 'node-fetch';
2
1
  import { spawn, execSync } from 'child_process';
3
2
  import fs from 'fs/promises';
4
3
  import { existsSync, statSync } from 'fs';
5
4
  import path from 'path';
6
5
  import { fileURLToPath } from 'url';
7
6
  import { promises as fsPromises } from 'fs';
8
- import pdfParse from 'pdf-parse/lib/pdf-parse.js';
7
+ import { PDFParse } from 'pdf-parse';
9
8
  import browserPool from './browser-pool.js';
10
9
 
11
10
  let puppeteer = null;
@@ -604,27 +603,41 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
604
603
  }
605
604
 
606
605
  return new Promise((resolve) => {
607
- const args = ['fetch', '--dump', url];
606
+ const format = config.lightpandaFormat || 'html';
607
+ const args = [
608
+ 'fetch',
609
+ '--dump', format,
610
+ '--with_frames',
611
+ '--http_timeout', String(config.timeout),
612
+ url
613
+ ];
608
614
  const process = spawn(this.options.lightpandaPath, args, {
609
- timeout: config.timeout + 1000 // Add buffer for process timeout only
615
+ timeout: config.timeout + 2000 // Buffer above http_timeout
610
616
  });
611
-
617
+
612
618
  let output = '';
613
619
  let errorOutput = '';
614
-
620
+
615
621
  process.stdout.on('data', (data) => {
616
622
  output += data.toString();
617
623
  });
618
-
624
+
619
625
  process.stderr.on('data', (data) => {
620
626
  errorOutput += data.toString();
621
627
  });
622
-
628
+
623
629
  process.on('close', (code) => {
624
630
  if (code === 0 && output.length > 0) {
625
- const content = this.extractContentFromHTML(output);
631
+ // Markdown output is already clean text, no HTML extraction needed
632
+ const content = format === 'markdown'
633
+ ? JSON.stringify({
634
+ title: output.match(/^#\s+(.+)$/m)?.[1] || '',
635
+ content: output,
636
+ extractedAt: new Date().toISOString()
637
+ }, null, 2)
638
+ : this.extractContentFromHTML(output);
626
639
  this.stats.lightpanda.successes++;
627
-
640
+
628
641
  resolve({
629
642
  success: true,
630
643
  content,
@@ -642,7 +655,7 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
642
655
  });
643
656
  }
644
657
  });
645
-
658
+
646
659
  process.on('error', (error) => {
647
660
  resolve({
648
661
  success: false,
@@ -847,25 +860,30 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
847
860
  };
848
861
  }
849
862
 
850
- // Parse PDF
851
- const pdfData = await pdfParse(buffer);
852
-
863
+ // Parse PDF with pdf-parse v2 API
864
+ const parser = new PDFParse({ data: new Uint8Array(buffer) });
865
+ await parser.load();
866
+ const textResult = await parser.getText();
867
+ const infoResult = await parser.getInfo();
868
+ parser.destroy();
869
+
853
870
  // Extract structured content
871
+ const pdfInfo = infoResult.info || {};
854
872
  const content = {
855
- title: pdfData.info?.Title || 'Untitled PDF',
856
- author: pdfData.info?.Author || '',
857
- subject: pdfData.info?.Subject || '',
858
- keywords: pdfData.info?.Keywords || '',
859
- creator: pdfData.info?.Creator || '',
860
- producer: pdfData.info?.Producer || '',
861
- creationDate: pdfData.info?.CreationDate || '',
862
- modificationDate: pdfData.info?.ModificationDate || '',
863
- pages: pdfData.numpages || 0,
864
- text: pdfData.text || '',
865
- metadata: pdfData.metadata || null,
873
+ title: pdfInfo.Title || infoResult.outline?.[0]?.title || 'Untitled PDF',
874
+ author: pdfInfo.Author || '',
875
+ subject: pdfInfo.Subject || '',
876
+ keywords: pdfInfo.Keywords || '',
877
+ creator: pdfInfo.Creator || '',
878
+ producer: pdfInfo.Producer || '',
879
+ creationDate: pdfInfo.CreationDate || '',
880
+ modificationDate: pdfInfo.ModDate || '',
881
+ pages: textResult.total || 0,
882
+ text: textResult.text || '',
883
+ metadata: infoResult.metadata || null,
866
884
  url: url
867
885
  };
868
-
886
+
869
887
  this.stats.pdf.successes++;
870
888
 
871
889
  return {
@@ -1008,11 +1026,11 @@ ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h
1008
1026
  });
1009
1027
 
1010
1028
  // Extract window state data
1011
- const windowDataMatch = html.match(/window\.__(?:INITIAL_STATE__|INITIAL_DATA__|NEXT_DATA__)__\s*=\s*({[\s\S]*?});/);
1029
+ const windowDataMatch = html.match(/window\.__(INITIAL_STATE|INITIAL_DATA|NEXT_DATA)__\s*=\s*({[\s\S]*?});/);
1012
1030
  let windowData = null;
1013
1031
  if (windowDataMatch) {
1014
1032
  try {
1015
- windowData = JSON.parse(windowDataMatch[1]);
1033
+ windowData = JSON.parse(windowDataMatch[2]);
1016
1034
  } catch {
1017
1035
  windowData = 'Found but unparseable';
1018
1036
  }
@@ -1777,4 +1795,8 @@ export async function bulkScrapeStream(urls, options = {}) {
1777
1795
  }
1778
1796
  }
1779
1797
 
1798
+ // Browser session exports
1799
+ export { BrowserSession, createSession } from './browser-session.js';
1800
+ export { default as LightPandaServer, getLightPandaServer, stopLightPandaServer } from './lightpanda-server.js';
1801
+
1780
1802
  export default BNCASmartScraper;
@@ -0,0 +1,151 @@
1
+ import { spawn } from 'child_process';
2
+ import { createServer } from 'net';
3
+ import path from 'path';
4
+ import fs from 'fs';
5
+
6
/**
 * Manages one long-running `lightpanda serve` child process and exposes the
 * WebSocket endpoint it listens on.
 */
class LightPandaServer {
  /**
   * @param {string} [binaryPath] - Path to the lightpanda executable. When
   *   omitted, a few well-known locations are probed and finally the bare
   *   command name is used so the OS can resolve it through PATH.
   */
  constructor(binaryPath) {
    this.binaryPath = binaryPath || this._findBinary();
    this.process = null;
    this.host = '127.0.0.1';
    this.port = null;
    this.ready = false;
    // In-flight start() promise; shared so concurrent callers do not spawn
    // more than one child.
    this._starting = null;
  }

  /**
   * Starts the server if it is not already running.
   *
   * Calls made while a start is still in progress join that attempt (their
   * `port` argument is ignored) instead of spawning a second process.
   *
   * @param {number} [port] - Port to listen on; a free port is picked when omitted.
   * @returns {Promise<string>} The `ws://host:port` endpoint.
   * @throws {Error} If the binary cannot be spawned, exits before becoming
   *   ready, or does not become ready within 5 seconds.
   */
  async start(port) {
    if (this.process && this.ready) return this.getEndpoint();

    if (!this._starting) {
      const attempt = this._launch(port).finally(() => {
        if (this._starting === attempt) this._starting = null;
      });
      this._starting = attempt;
    }
    return this._starting;
  }

  /**
   * Spawns the child process and settles once it is ready or has failed.
   * Private helper for start().
   *
   * @param {number} [port]
   * @returns {Promise<string>}
   */
  async _launch(port) {
    const FIRST_POLL_DELAY_MS = 1500;
    const POLL_INTERVAL_MS = 500;
    const POLL_REQUEST_TIMEOUT_MS = 1000;
    const START_TIMEOUT_MS = 5000;

    // Captured locally because stop() resets this.port while we may still be polling.
    const launchPort = port || await this._findAvailablePort();
    this.port = launchPort;
    const { host } = this;

    return new Promise((resolve, reject) => {
      const args = [
        'serve',
        '--host', host,
        '--port', String(launchPort),
        '--cdp_max_connections', '16',
      ];

      const child = spawn(this.binaryPath, args, {
        stdio: ['ignore', 'pipe', 'pipe'],
      });
      this.process = child;

      // stdout is piped but nobody reads it; drain it so the child can never
      // block on a full pipe.
      child.stdout?.resume();

      let stderr = '';
      let settled = false;
      let pollTimer = null;
      let hardTimer = null;

      // Every handler below belongs to *this* child. They only touch shared
      // instance state while this child is still the current one, so a late
      // event from an old process cannot clobber a newer launch.
      const isCurrent = () => this.process === child;

      const clearTimers = () => {
        clearTimeout(pollTimer);
        clearTimeout(hardTimer);
      };

      const fail = (error) => {
        if (settled) return;
        settled = true;
        clearTimers();
        reject(error);
      };

      const onReady = () => {
        if (settled) return;
        if (!isCurrent()) {
          fail(new Error('LightPanda was stopped before it became ready'));
          return;
        }
        settled = true;
        clearTimers();
        this.ready = true;
        resolve(this.getEndpoint());
      };

      // LightPanda logs server start to stderr. The listener stays attached
      // after startup so the pipe keeps draining, but buffering stops once the
      // outcome is decided to keep memory bounded.
      child.stderr?.on('data', (data) => {
        if (settled) return;
        stderr += data.toString();
        if (stderr.includes('Listening on') || stderr.includes('server started')) {
          onReady();
        }
      });

      child.on('error', (err) => {
        if (isCurrent()) {
          this.ready = false;
          // No pid means the spawn itself failed, so there is nothing to keep.
          if (child.pid === undefined) this.process = null;
        }
        fail(new Error(`Failed to start LightPanda: ${err.message}`));
      });

      child.on('exit', (code) => {
        if (isCurrent()) {
          this.ready = false;
          this.process = null;
        }
        // No-op when startup already succeeded or failed.
        fail(new Error(`LightPanda exited with code ${code}: ${stderr}`));
      });

      // Fallback: poll /json/version in case no recognizable stderr line shows up.
      const poll = async () => {
        if (settled) return;
        try {
          const res = await fetch(`http://${host}:${launchPort}/json/version`, {
            signal: AbortSignal.timeout(POLL_REQUEST_TIMEOUT_MS),
          });
          if (res.ok) {
            onReady();
            return;
          }
        } catch {
          // Deliberately ignored: the server is not accepting connections yet.
        }
        if (!settled) pollTimer = setTimeout(poll, POLL_INTERVAL_MS);
      };
      pollTimer = setTimeout(poll, FIRST_POLL_DELAY_MS);

      // Hard timeout
      hardTimer = setTimeout(() => {
        if (settled) return;
        fail(new Error(`LightPanda failed to start within 5s. stderr: ${stderr}`));
        if (isCurrent()) this.stop();
      }, START_TIMEOUT_MS);
    });
  }

  /**
   * @returns {string} WebSocket endpoint built from the current host and port.
   */
  getEndpoint() {
    return `ws://${this.host}:${this.port}`;
  }

  /**
   * @returns {boolean} True when a child process exists and has signalled readiness.
   */
  isRunning() {
    return this.ready && this.process !== null;
  }

  /**
   * Sends SIGTERM to the child (if any) and resets the instance state.
   * Safe to call repeatedly.
   */
  stop() {
    if (this.process) {
      try {
        this.process.kill('SIGTERM');
      } catch {
        // already dead
      }
      this.process = null;
    }
    this.ready = false;
    this.port = null;
  }

  /**
   * Asks the OS for a free TCP port on 127.0.0.1 by binding port 0 and
   * closing the probe socket again.
   *
   * @returns {Promise<number>}
   */
  async _findAvailablePort() {
    return new Promise((resolve, reject) => {
      const server = createServer();
      // Attach the error handler before listen() so a bind failure rejects.
      server.on('error', reject);
      server.listen(0, '127.0.0.1', () => {
        const port = server.address().port;
        server.close(() => resolve(port));
      });
    });
  }

  /**
   * Looks for the lightpanda binary next to this module and in common
   * system locations.
   *
   * @returns {string} First existing candidate, or `'lightpanda'` as a last resort.
   */
  _findBinary() {
    // URL pathnames are percent-encoded; decode so a module directory that
    // contains spaces or non-ASCII characters still maps to a real path.
    const moduleDir = path.dirname(decodeURIComponent(new URL(import.meta.url).pathname));

    // Check common locations
    const candidates = [
      path.join(moduleDir, 'bin', 'lightpanda'),
      '/usr/local/bin/lightpanda',
      '/usr/bin/lightpanda',
    ];

    for (const p of candidates) {
      if (fs.existsSync(p)) return p;
    }

    return 'lightpanda'; // hope it's on PATH
  }
}
129
+
130
// Singleton instance — shared across all sessions
let _instance = null;

/**
 * Returns the shared LightPandaServer, creating it on first use.
 *
 * @param {string} [binaryPath] - Only used when the instance is created;
 *   ignored on later calls while an instance exists.
 * @returns {LightPandaServer}
 */
export function getLightPandaServer(binaryPath) {
  if (!_instance) {
    _instance = new LightPandaServer(binaryPath);
  }
  return _instance;
}

/**
 * Stops and discards the shared instance. Does nothing when none exists.
 */
export function stopLightPandaServer() {
  if (_instance) {
    _instance.stop();
    _instance = null;
  }
}

/**
 * Signal hook: shuts the shared server down, then lets the process terminate.
 *
 * Installing any SIGINT/SIGTERM listener disables Node's default
 * "exit on signal" behaviour. If this listener is the only one, the signal is
 * re-raised after cleanup so the host process still terminates as it would
 * have without this module. If the host application registered its own
 * listener, the decision is left to it.
 *
 * @param {string} signal - Signal name passed in by Node.
 */
function handleTerminationSignal(signal) {
  stopLightPandaServer();
  if (process.listenerCount(signal) > 1) return;
  process.removeListener(signal, handleTerminationSignal);
  process.kill(process.pid, signal);
}

process.on('SIGTERM', handleTerminationSignal);
process.on('SIGINT', handleTerminationSignal);
process.on('beforeExit', stopLightPandaServer);
150
+
151
+ export default LightPandaServer;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@monostate/node-scraper",
3
- "version": "1.8.1",
3
+ "version": "2.1.0",
4
4
  "description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -15,12 +15,15 @@
15
15
  "index.js",
16
16
  "index.d.ts",
17
17
  "browser-pool.js",
18
+ "browser-session.js",
19
+ "lightpanda-server.js",
18
20
  "README.md",
19
21
  "BULK_SCRAPING.md",
20
22
  "package.json",
21
23
  "scripts/"
22
24
  ],
23
25
  "scripts": {
26
+ "test": "node --test test/",
24
27
  "postinstall": "node scripts/install-lightpanda.js"
25
28
  },
26
29
  "keywords": [
@@ -33,6 +36,9 @@
33
36
  "data-extraction",
34
37
  "automation",
35
38
  "browser",
39
+ "browser-use",
40
+ "cdp",
41
+ "ai-agent",
36
42
  "ai-powered",
37
43
  "question-answering",
38
44
  "pdf-parsing",
@@ -47,11 +53,10 @@
47
53
  "author": "BNCA Team",
48
54
  "license": "MIT",
49
55
  "dependencies": {
50
- "node-fetch": "^3.3.2",
51
- "pdf-parse": "^1.1.1"
56
+ "pdf-parse": "^2.4.5"
52
57
  },
53
58
  "peerDependencies": {
54
- "puppeteer": "^24.11.2"
59
+ "puppeteer": "^24.38.0"
55
60
  },
56
61
  "peerDependenciesMeta": {
57
62
  "puppeteer": {
@@ -59,7 +64,7 @@
59
64
  }
60
65
  },
61
66
  "engines": {
62
- "node": ">=18.0.0"
67
+ "node": ">=20.0.0"
63
68
  },
64
69
  "repository": {
65
70
  "type": "git",
@@ -6,17 +6,30 @@ import path from 'path';
6
6
  import { createWriteStream } from 'fs';
7
7
  import { execSync } from 'child_process';
8
8
 
9
- const LIGHTPANDA_VERSION = 'nightly';
9
+ const LIGHTPANDA_VERSION = 'v0.2.5';
10
10
  const BINARY_DIR = path.join(path.dirname(path.dirname(new URL(import.meta.url).pathname)), 'bin');
11
11
  const BINARY_NAME = 'lightpanda';
12
12
  const BINARY_PATH = path.join(BINARY_DIR, BINARY_NAME);
13
13
 
14
- // Platform-specific download URLs (matching official Lightpanda instructions)
15
- const DOWNLOAD_URLS = {
16
- 'darwin': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-aarch64-macos`,
17
- 'linux': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-x86_64-linux`,
18
- 'wsl': `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}/lightpanda-x86_64-linux` // WSL uses Linux binary
19
- };
14
/**
 * Maps Node's `process.arch` onto the architecture names used in Lightpanda
 * release asset file names.
 *
 * @returns {string} `'aarch64'` or `'x86_64'` for the recognised values;
 *   any other architecture string is returned unchanged.
 */
function detectArch() {
  const releaseNames = new Map([
    ['arm64', 'aarch64'],
    ['aarch64', 'aarch64'],
    ['x64', 'x86_64'],
    ['x86_64', 'x86_64'],
  ]);
  const nodeArch = process.arch;
  return releaseNames.get(nodeArch) ?? nodeArch;
}
20
+
21
/**
 * Builds the platform-specific download URLs (matching official Lightpanda
 * releases) for the pinned LIGHTPANDA_VERSION and the detected CPU
 * architecture.
 *
 * WSL runs the regular Linux binary, so it shares the Linux URL, including
 * the detected architecture: an arm64 host must not be handed the x86_64 build.
 *
 * @returns {{darwin: string, linux: string, wsl: string}}
 */
function getDownloadUrls() {
  const arch = detectArch();
  const base = `https://github.com/lightpanda-io/browser/releases/download/${LIGHTPANDA_VERSION}`;
  const linuxUrl = `${base}/lightpanda-${arch}-linux`;
  return {
    darwin: `${base}/lightpanda-${arch}-macos`,
    linux: linuxUrl,
    wsl: linuxUrl,
  };
}
31
+
32
+ const DOWNLOAD_URLS = getDownloadUrls();
20
33
 
21
34
  function detectPlatform() {
22
35
  const platform = process.platform;