@monostate/node-scraper 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -19,7 +19,9 @@ yarn add @monostate/node-scraper
19
19
  pnpm add @monostate/node-scraper
20
20
  ```
21
21
 
22
- **🎉 New in v1.2.0**: Lightpanda binary is now automatically downloaded and configured during installation! No manual setup required.
22
+ **🎉 New in v1.3.0**: PDF parsing support added! Automatically extracts text, metadata, and page count from PDF documents.
23
+
24
+ **✨ Since v1.2.0**: Lightpanda binary is automatically downloaded and configured during installation! No manual setup required.
23
25
 
24
26
  ### Zero-Configuration Setup
25
27
 
@@ -45,6 +47,10 @@ console.log(screenshot.screenshot); // Base64 encoded image
45
47
  // Quick screenshot (optimized for speed)
46
48
  const quick = await quickShot('https://example.com');
47
49
  console.log(quick.screenshot); // Fast screenshot capture
50
+
51
+ // PDF parsing (automatic detection)
52
+ const pdfResult = await smartScrape('https://example.com/document.pdf');
53
+ console.log(pdfResult.content); // JSON string containing extracted text, metadata, and page count
48
54
  ```
49
55
 
50
56
  ### Advanced Usage
@@ -66,12 +72,13 @@ await scraper.cleanup(); // Clean up resources
66
72
 
67
73
  ## 🔧 How It Works
68
74
 
69
- BNCA uses a sophisticated 3-tier fallback system:
75
+ BNCA uses a sophisticated multi-tier system with intelligent detection:
70
76
 
71
77
  ### 1. 🔄 Direct Fetch (Fastest)
72
78
  - Pure HTTP requests with intelligent HTML parsing
73
79
  - **Performance**: Sub-second responses
74
80
  - **Success rate**: 75% of websites
81
+ - **PDF Detection**: Automatically detects PDFs by URL, content-type, or magic bytes
75
82
 
76
83
  ### 2. 🐼 Lightpanda Browser (Fast)
77
84
  - Lightweight browser engine (2-3x faster than Chromium)
@@ -83,6 +90,12 @@ BNCA uses a sophisticated 3-tier fallback system:
83
90
  - **Performance**: Complete JavaScript execution
84
91
  - **Fallback triggers**: Complex interactions needed
85
92
 
93
+ ### 📄 PDF Parser (Specialized)
94
+ - Automatic PDF detection and parsing
95
+ - **Features**: Text extraction, metadata, page count
96
+ - **Smart Detection**: Works even when PDFs are served with wrong content-types
97
+ - **Performance**: Typically 100-500ms for most PDFs
98
+
86
99
  ### 📸 Screenshot Methods
87
100
  - **Chrome CLI**: Direct Chrome screenshot capture
88
101
  - **Quickshot**: Optimized with retry logic and smart timeouts
@@ -186,6 +199,34 @@ Clean up resources (close browser instances).
186
199
  await scraper.cleanup();
187
200
  ```
188
201
 
202
+ ### 📄 PDF Support
203
+
204
+ BNCA automatically detects and parses PDF documents:
205
+
206
+ ```javascript
207
+ const pdfResult = await smartScrape('https://example.com/document.pdf');
208
+
209
+ // Parsed content includes:
210
+ const content = JSON.parse(pdfResult.content);
211
+ console.log(content.title); // PDF title
212
+ console.log(content.author); // Author name
213
+ console.log(content.pages); // Number of pages
214
+ console.log(content.text); // Full extracted text
215
+ console.log(content.creationDate); // Creation date
216
+ console.log(content.metadata); // Additional metadata
217
+ ```
218
+
219
+ **PDF Detection Methods:**
220
+ - URL ending with `.pdf`, or containing `.pdf?` or `/pdf/`
221
+ - Content-Type header `application/pdf`
222
+ - Binary content starting with `%PDF` (magic bytes)
223
+ - Works with PDFs served as `application/octet-stream` (e.g., GitHub raw files)
224
+
225
+ **Limitations:**
226
+ - Maximum file size: 20MB
227
+ - Text extraction only (no image OCR)
228
+ - Requires `pdf-parse` dependency (automatically installed)
229
+
189
230
  ## 📱 Next.js Integration
190
231
 
191
232
  ### API Route Example
@@ -354,7 +395,14 @@ const result: ScrapingResult = await scraper.scrape('https://example.com');
354
395
 
355
396
  ## 📋 Changelog
356
397
 
357
- ### v1.2.0 (Latest)
398
+ ### v1.3.0 (Latest)
399
+ - 📄 **PDF Support**: Full PDF parsing with text extraction, metadata, and page count
400
+ - 🔍 **Smart PDF Detection**: Detects PDFs by URL patterns, content-type, or magic bytes
401
+ - 🚀 **Robust Parsing**: Handles PDFs served with incorrect content-types (e.g., GitHub raw files)
402
+ - ⚡ **Fast Performance**: PDF parsing typically completes in 100-500ms
403
+ - 📊 **Comprehensive Extraction**: Title, author, creation date, page count, and full text
404
+
405
+ ### v1.2.0
358
406
  - 🎉 **Auto-Installation**: Lightpanda binary is now automatically downloaded during `npm install`
359
407
  - 🔧 **Cross-Platform Support**: Automatic detection and installation for macOS, Linux, and Windows/WSL
360
408
  - ⚡ **Improved Performance**: Enhanced binary detection and ES6 module compatibility
package/bin/lightpanda CHANGED
File without changes
package/index.js CHANGED
@@ -5,6 +5,7 @@ import { existsSync, statSync } from 'fs';
5
5
  import path from 'path';
6
6
  import { fileURLToPath } from 'url';
7
7
  import { promises as fsPromises } from 'fs';
8
+ import pdfParse from 'pdf-parse/lib/pdf-parse.js';
8
9
 
9
10
  let puppeteer = null;
10
11
  try {
@@ -42,7 +43,8 @@ export class BNCASmartScraper {
42
43
  this.stats = {
43
44
  directFetch: { attempts: 0, successes: 0 },
44
45
  lightpanda: { attempts: 0, successes: 0 },
45
- puppeteer: { attempts: 0, successes: 0 }
46
+ puppeteer: { attempts: 0, successes: 0 },
47
+ pdf: { attempts: 0, successes: 0 }
46
48
  };
47
49
  }
48
50
 
@@ -60,6 +62,35 @@ export class BNCASmartScraper {
60
62
  let lastError = null;
61
63
 
62
64
  try {
65
+ // Check if the URL looks like a PDF (by extension or path pattern only; no request is made here)
66
+ const isPdfUrl = url.toLowerCase().endsWith('.pdf') ||
67
+ url.toLowerCase().includes('.pdf?') ||
68
+ url.toLowerCase().includes('/pdf/');
69
+
70
+ if (isPdfUrl) {
71
+ this.log(' 📄 PDF detected, using PDF parser...');
72
+ result = await this.tryPDFParse(url, config);
73
+
74
+ if (result.success) {
75
+ method = 'pdf';
76
+ this.log(' ✅ PDF parsing successful');
77
+
78
+ const totalTime = Date.now() - startTime;
79
+ return {
80
+ ...result,
81
+ method,
82
+ performance: {
83
+ totalTime,
84
+ method
85
+ },
86
+ stats: this.getStats()
87
+ };
88
+ } else {
89
+ this.log(' ❌ PDF parsing failed');
90
+ lastError = result.error;
91
+ }
92
+ }
93
+
63
94
  // Step 1: Try direct fetch first (fastest)
64
95
  this.log(' 🔄 Attempting direct fetch...');
65
96
  result = await this.tryDirectFetch(url, config);
@@ -67,6 +98,29 @@ export class BNCASmartScraper {
67
98
  if (result.success && !result.needsBrowser) {
68
99
  method = 'direct-fetch';
69
100
  this.log(' ✅ Direct fetch successful');
101
+ } else if (result.isPdf) {
102
+ // Direct fetch detected a PDF, try PDF parser
103
+ this.log(' 📄 Direct fetch detected PDF content, using PDF parser...');
104
+ result = await this.tryPDFParse(url, config);
105
+
106
+ if (result.success) {
107
+ method = 'pdf';
108
+ this.log(' ✅ PDF parsing successful');
109
+
110
+ const totalTime = Date.now() - startTime;
111
+ return {
112
+ ...result,
113
+ method,
114
+ performance: {
115
+ totalTime,
116
+ method
117
+ },
118
+ stats: this.getStats()
119
+ };
120
+ } else {
121
+ this.log(' ❌ PDF parsing failed');
122
+ lastError = result.error;
123
+ }
70
124
  } else {
71
125
  this.log(result.needsBrowser ? ' ⚠️ Browser rendering required' : ' ❌ Direct fetch failed');
72
126
  lastError = result.error;
@@ -152,7 +206,32 @@ export class BNCASmartScraper {
152
206
  };
153
207
  }
154
208
 
155
- const html = await response.text();
209
+ // Check if the response is actually a PDF
210
+ const contentType = response.headers.get('content-type') || '';
211
+ if (contentType.includes('application/pdf')) {
212
+ return {
213
+ success: false,
214
+ error: 'Content is PDF, should use PDF parser',
215
+ isPdf: true
216
+ };
217
+ }
218
+
219
+ // Get response as array buffer to check magic bytes
220
+ const buffer = await response.arrayBuffer();
221
+ const firstBytes = new Uint8Array(buffer.slice(0, 5));
222
+ const signature = Array.from(firstBytes).map(b => String.fromCharCode(b)).join('');
223
+
224
+ // Check for PDF magic bytes
225
+ if (signature.startsWith('%PDF')) {
226
+ return {
227
+ success: false,
228
+ error: 'Content is PDF (detected by magic bytes), should use PDF parser',
229
+ isPdf: true
230
+ };
231
+ }
232
+
233
+ // Convert buffer back to text for HTML processing
234
+ const html = new TextDecoder().decode(buffer);
156
235
 
157
236
  // Intelligent browser detection
158
237
  const needsBrowser = this.detectBrowserRequirement(html, url);
@@ -390,6 +469,95 @@ export class BNCASmartScraper {
390
469
  }
391
470
  }
392
471
 
472
+ /**
473
+ * PDF parsing method - handles PDF documents
474
+ */
475
+ async tryPDFParse(url, config) {
476
+ this.stats.pdf.attempts++;
477
+
478
+ try {
479
+ // Download PDF with timeout
480
+ const controller = new AbortController();
481
+ const timeoutId = setTimeout(() => controller.abort(), config.timeout);
482
+
483
+ const response = await fetch(url, {
484
+ headers: {
485
+ 'User-Agent': config.userAgent,
486
+ 'Accept': 'application/pdf,*/*'
487
+ },
488
+ signal: controller.signal
489
+ });
490
+
491
+ clearTimeout(timeoutId);
492
+
493
+ if (!response.ok) {
494
+ return {
495
+ success: false,
496
+ error: `HTTP ${response.status}: ${response.statusText}`
497
+ };
498
+ }
499
+
500
+ // Check content type (be lenient - accept various content types)
501
+ const contentType = response.headers.get('content-type') || '';
502
+ const acceptableTypes = ['pdf', 'octet-stream', 'binary', 'download'];
503
+ const isAcceptableType = acceptableTypes.some(type => contentType.includes(type));
504
+
505
+ if (!isAcceptableType && !url.toLowerCase().includes('.pdf')) {
506
+ return {
507
+ success: false,
508
+ error: `Not a PDF document: ${contentType}`
509
+ };
510
+ }
511
+
512
+ // Get PDF buffer
513
+ const arrayBuffer = await response.arrayBuffer();
514
+ const buffer = Buffer.from(arrayBuffer);
515
+
516
+ // Check size limit (20MB)
517
+ if (buffer.length > 20 * 1024 * 1024) {
518
+ return {
519
+ success: false,
520
+ error: 'PDF too large (max 20MB)'
521
+ };
522
+ }
523
+
524
+ // Parse PDF
525
+ const pdfData = await pdfParse(buffer);
526
+
527
+ // Extract structured content
528
+ const content = {
529
+ title: pdfData.info?.Title || 'Untitled PDF',
530
+ author: pdfData.info?.Author || '',
531
+ subject: pdfData.info?.Subject || '',
532
+ keywords: pdfData.info?.Keywords || '',
533
+ creator: pdfData.info?.Creator || '',
534
+ producer: pdfData.info?.Producer || '',
535
+ creationDate: pdfData.info?.CreationDate || '',
536
+ modificationDate: pdfData.info?.ModificationDate || '',
537
+ pages: pdfData.numpages || 0,
538
+ text: pdfData.text || '',
539
+ metadata: pdfData.metadata || null,
540
+ url: url
541
+ };
542
+
543
+ this.stats.pdf.successes++;
544
+
545
+ return {
546
+ success: true,
547
+ content: JSON.stringify(content, null, 2),
548
+ size: buffer.length,
549
+ contentType: 'application/pdf',
550
+ pages: content.pages
551
+ };
552
+
553
+ } catch (error) {
554
+ return {
555
+ success: false,
556
+ error: `PDF parsing error: ${error.message}`
557
+ };
558
+ }
559
+ }
560
+
393
561
  /**
394
562
  * Intelligent detection of browser requirement
395
563
  */
@@ -633,7 +801,9 @@ export class BNCASmartScraper {
633
801
  lightpanda: this.stats.lightpanda.attempts > 0 ?
634
802
  (this.stats.lightpanda.successes / this.stats.lightpanda.attempts * 100).toFixed(1) + '%' : '0%',
635
803
  puppeteer: this.stats.puppeteer.attempts > 0 ?
636
- (this.stats.puppeteer.successes / this.stats.puppeteer.attempts * 100).toFixed(1) + '%' : '0%'
804
+ (this.stats.puppeteer.successes / this.stats.puppeteer.attempts * 100).toFixed(1) + '%' : '0%',
805
+ pdf: this.stats.pdf.attempts > 0 ?
806
+ (this.stats.pdf.successes / this.stats.pdf.attempts * 100).toFixed(1) + '%' : '0%'
637
807
  }
638
808
  };
639
809
  }
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@monostate/node-scraper",
3
- "version": "1.2.0",
4
- "description": "Intelligent web scraping with multi-level fallback system - 11.35x faster than Firecrawl",
3
+ "version": "1.3.0",
4
+ "description": "Intelligent web scraping with PDF support and multi-level fallback system - 11.35x faster than Firecrawl",
5
5
  "type": "module",
6
6
  "main": "index.js",
7
7
  "types": "index.d.ts",
@@ -19,6 +19,9 @@
19
19
  "scripts/",
20
20
  "bin/"
21
21
  ],
22
+ "scripts": {
23
+ "postinstall": "node scripts/install-lightpanda.js"
24
+ },
22
25
  "keywords": [
23
26
  "web-scraping",
24
27
  "crawling",
@@ -37,7 +40,8 @@
37
40
  "author": "BNCA Team",
38
41
  "license": "MIT",
39
42
  "dependencies": {
40
- "node-fetch": "^3.3.2"
43
+ "node-fetch": "^3.3.2",
44
+ "pdf-parse": "^1.1.1"
41
45
  },
42
46
  "peerDependencies": {
43
47
  "puppeteer": ">=20.0.0"
@@ -65,8 +69,5 @@
65
69
  },
66
70
  "publishConfig": {
67
71
  "access": "public"
68
- },
69
- "scripts": {
70
- "postinstall": "node scripts/install-lightpanda.js"
71
72
  }
72
73
  }
package/LICENSE DELETED
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2025 BNCA Team
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.