pdf-plus 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +426 -0
- package/dist/index.d.mts +694 -0
- package/dist/index.d.ts +694 -0
- package/dist/index.js +40 -0
- package/dist/index.mjs +40 -0
- package/package.json +80 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 PDF Extractor Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
# pdf-plus
|
|
2
|
+
|
|
3
|
+
A comprehensive PDF content extraction library with support for text, images, and structured data.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- 📝 **Text Extraction** - High-quality text extraction with positioning
|
|
8
|
+
- 🖼️ **Image Detection** - Detect and reference images in PDF content
|
|
9
|
+
- 💾 **Image File Extraction** - Extract actual image files from PDFs
|
|
10
|
+
- 🎨 **Flexible Formatting** - Customizable image reference formats
|
|
11
|
+
- ⚡ **Performance Options** - Text-only, images-only, or combined modes
|
|
12
|
+
- 🔧 **TypeScript Support** - Full TypeScript definitions included
|
|
13
|
+
- 🛡️ **Robust Validation** - Comprehensive input validation and error handling
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# Using pnpm (recommended)
|
|
19
|
+
pnpm add pdf-plus
|
|
20
|
+
|
|
21
|
+
# Using npm
|
|
22
|
+
npm install pdf-plus
|
|
23
|
+
|
|
24
|
+
# Using yarn
|
|
25
|
+
yarn add pdf-plus
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
```typescript
|
|
31
|
+
import { extractPdfContent } from "pdf-plus";
|
|
32
|
+
|
|
33
|
+
// Extract both text and images
|
|
34
|
+
const result = await extractPdfContent("document.pdf", {
|
|
35
|
+
extractText: true,
|
|
36
|
+
extractImages: true,
|
|
37
|
+
verbose: true,
|
|
38
|
+
});
|
|
39
|
+
|
|
40
|
+
console.log(
|
|
41
|
+
`Extracted ${result.images.length} images from ${result.document.pages} pages`
|
|
42
|
+
);
|
|
43
|
+
console.log(`Text content: ${result.cleanText.substring(0, 100)}...`);
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Usage Examples
|
|
47
|
+
|
|
48
|
+
### Text-Only Extraction (Fast)
|
|
49
|
+
|
|
50
|
+
```typescript
|
|
51
|
+
import { extractText } from "pdf-plus";
|
|
52
|
+
|
|
53
|
+
const text = await extractText("document.pdf");
|
|
54
|
+
console.log(`Extracted ${text.length} characters`);
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Images-Only Extraction
|
|
58
|
+
|
|
59
|
+
```typescript
|
|
60
|
+
import { extractImages } from "pdf-plus";
|
|
61
|
+
|
|
62
|
+
const images = await extractImages("document.pdf", {
|
|
63
|
+
extractImageFiles: true,
|
|
64
|
+
imageOutputDir: "./my-images",
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
console.log(`Found ${images.length} images`);
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Custom Image References
|
|
71
|
+
|
|
72
|
+
```typescript
|
|
73
|
+
import { extractPdfContent } from "pdf-plus";
|
|
74
|
+
|
|
75
|
+
const result = await extractPdfContent("document.pdf", {
|
|
76
|
+
imageRefFormat: "📷 Image {index} on page {page}",
|
|
77
|
+
extractImageFiles: true,
|
|
78
|
+
useImagePaths: true,
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
// Text will contain: "📷 Image 1 on page 1" instead of "[IMAGE:img_1]"
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Advanced Configuration
|
|
85
|
+
|
|
86
|
+
```typescript
|
|
87
|
+
import { PDFExtractor } from "pdf-plus";
|
|
88
|
+
|
|
89
|
+
const extractor = new PDFExtractor();
|
|
90
|
+
|
|
91
|
+
const result = await extractor.extract("large-document.pdf", {
|
|
92
|
+
extractText: true,
|
|
93
|
+
extractImages: true,
|
|
94
|
+
extractImageFiles: true,
|
|
95
|
+
imageOutputDir: "./extracted-images",
|
|
96
|
+
memoryLimit: "1GB",
|
|
97
|
+
batchSize: 10,
|
|
98
|
+
progressCallback: (progress) => {
|
|
99
|
+
console.log(
|
|
100
|
+
`Processing page ${progress.currentPage}/${progress.totalPages}`
|
|
101
|
+
);
|
|
102
|
+
},
|
|
103
|
+
});
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Real-World Examples
|
|
107
|
+
|
|
108
|
+
#### Extract and Save Images from Academic Papers
|
|
109
|
+
|
|
110
|
+
```typescript
|
|
111
|
+
import { extractPdfContent } from "pdf-plus";
|
|
112
|
+
import path from "path";
|
|
113
|
+
|
|
114
|
+
async function extractAcademicPaper(pdfPath: string) {
|
|
115
|
+
const result = await extractPdfContent(pdfPath, {
|
|
116
|
+
extractText: true,
|
|
117
|
+
extractImages: true,
|
|
118
|
+
extractImageFiles: true,
|
|
119
|
+
imageOutputDir: "./paper-images",
|
|
120
|
+
imageRefFormat: "Figure {index}: {name}",
|
|
121
|
+
verbose: true,
|
|
122
|
+
});
|
|
123
|
+
|
|
124
|
+
// Save text content
|
|
125
|
+
const fs = await import("fs");
|
|
126
|
+
fs.writeFileSync("./paper-text.txt", result.cleanText);
|
|
127
|
+
|
|
128
|
+
// Log extraction summary
|
|
129
|
+
console.log(`📄 Extracted from ${result.document.filename}:`);
|
|
130
|
+
console.log(` 📝 Text: ${result.document.textLength} characters`);
|
|
131
|
+
console.log(` 🖼️ Images: ${result.images.length} found`);
|
|
132
|
+
console.log(` 📊 Pages: ${result.document.pages}`);
|
|
133
|
+
|
|
134
|
+
return result;
|
|
135
|
+
}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
#### Batch Process Multiple PDFs
|
|
139
|
+
|
|
140
|
+
```typescript
|
|
141
|
+
import { PDFExtractor } from "pdf-plus";
|
|
142
|
+
import { glob } from "glob";
|
|
143
|
+
|
|
144
|
+
async function batchProcessPDFs(pattern: string) {
|
|
145
|
+
const extractor = new PDFExtractor("./cache"); // Enable caching
|
|
146
|
+
const pdfFiles = await glob(pattern);
|
|
147
|
+
|
|
148
|
+
const results = [];
|
|
149
|
+
|
|
150
|
+
for (const pdfFile of pdfFiles) {
|
|
151
|
+
console.log(`Processing: ${pdfFile}`);
|
|
152
|
+
|
|
153
|
+
try {
|
|
154
|
+
const result = await extractor.extract(pdfFile, {
|
|
155
|
+
extractText: true,
|
|
156
|
+
extractImages: true,
|
|
157
|
+
imageOutputDir: `./output/${path.basename(pdfFile, ".pdf")}`,
|
|
158
|
+
batchSize: 5, // Process 5 pages at a time
|
|
159
|
+
verbose: false,
|
|
160
|
+
});
|
|
161
|
+
|
|
162
|
+
results.push({
|
|
163
|
+
file: pdfFile,
|
|
164
|
+
success: true,
|
|
165
|
+
pages: result.document.pages,
|
|
166
|
+
images: result.images.length,
|
|
167
|
+
textLength: result.document.textLength,
|
|
168
|
+
});
|
|
169
|
+
} catch (error) {
|
|
170
|
+
console.error(`Failed to process ${pdfFile}:`, error);
|
|
171
|
+
results.push({
|
|
172
|
+
file: pdfFile,
|
|
173
|
+
success: false,
|
|
174
|
+
error: error.message,
|
|
175
|
+
});
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
return results;
|
|
180
|
+
}
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## API Reference
|
|
184
|
+
|
|
185
|
+
### Main Functions
|
|
186
|
+
|
|
187
|
+
#### `extractPdfContent(pdfPath, options)`
|
|
188
|
+
|
|
189
|
+
Extract complete content from a PDF file.
|
|
190
|
+
|
|
191
|
+
**Parameters:**
|
|
192
|
+
|
|
193
|
+
- `pdfPath` (string) - Path to the PDF file
|
|
194
|
+
- `options` (ExtractionOptions) - Extraction configuration
|
|
195
|
+
|
|
196
|
+
**Returns:** `Promise<ExtractionResult>`
|
|
197
|
+
|
|
198
|
+
#### `extractText(pdfPath, options)`
|
|
199
|
+
|
|
200
|
+
Extract only text content (optimized for speed).
|
|
201
|
+
|
|
202
|
+
**Returns:** `Promise<string>`
|
|
203
|
+
|
|
204
|
+
#### `extractImages(pdfPath, options)`
|
|
205
|
+
|
|
206
|
+
Extract only image references.
|
|
207
|
+
|
|
208
|
+
**Returns:** `Promise<ImageItem[]>`
|
|
209
|
+
|
|
210
|
+
#### `extractImageFiles(pdfPath, outputDir, options)`
|
|
211
|
+
|
|
212
|
+
Extract and save actual image files.
|
|
213
|
+
|
|
214
|
+
**Returns:** `Promise<string[]>` - Array of saved file paths
|
|
215
|
+
|
|
216
|
+
### Options
|
|
217
|
+
|
|
218
|
+
```typescript
|
|
219
|
+
interface ExtractionOptions {
|
|
220
|
+
extractText?: boolean; // Extract text content (default: true)
|
|
221
|
+
extractImages?: boolean; // Extract image references (default: true)
|
|
222
|
+
extractImageFiles?: boolean; // Save actual image files (default: false)
|
|
223
|
+
useImagePaths?: boolean; // Use file paths in references (default: false)
|
|
224
|
+
imageOutputDir?: string; // Directory for image files (default: './extracted-images')
|
|
225
|
+
imageRefFormat?: string; // Custom reference format (default: '[IMAGE:{id}]')
|
|
226
|
+
baseName?: string; // Base name for output files
|
|
227
|
+
verbose?: boolean; // Show detailed progress (default: false)
|
|
228
|
+
memoryLimit?: string; // Memory limit (e.g., '512MB', '1GB')
|
|
229
|
+
batchSize?: number; // Pages per batch (1-100)
|
|
230
|
+
progressCallback?: (progress: ProgressInfo) => void;
|
|
231
|
+
}
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### Format Placeholders
|
|
235
|
+
|
|
236
|
+
Use these placeholders in `imageRefFormat`:
|
|
237
|
+
|
|
238
|
+
- `{id}` - Unique image ID (e.g., `img_1`)
|
|
239
|
+
- `{name}` - Original image name from PDF
|
|
240
|
+
- `{page}` - Page number
|
|
241
|
+
- `{index}` - Global image index
|
|
242
|
+
- `{path}` - File path (when `extractImageFiles` is true)
|
|
243
|
+
|
|
244
|
+
**Examples:**
|
|
245
|
+
|
|
246
|
+
- `[IMAGE:{id}]` → `[IMAGE:img_1]`
|
|
247
|
+
- `📷 Image {index}` → `📷 Image 1`
|
|
248
|
+
- `{name} on page {page}` → `artwork_1 on page 5`
|
|
249
|
+
- `<img src="{path}">` → `<img src="./images/img_1.jpg">`
|
|
250
|
+
|
|
251
|
+
## Performance Modes
|
|
252
|
+
|
|
253
|
+
### Text-Only Mode (Fastest)
|
|
254
|
+
|
|
255
|
+
```typescript
|
|
256
|
+
const text = await extractText("document.pdf");
|
|
257
|
+
// ~40% faster than combined mode
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### Images-Only Mode
|
|
261
|
+
|
|
262
|
+
```typescript
|
|
263
|
+
const images = await extractImages("document.pdf");
|
|
264
|
+
// ~20% faster than combined mode
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
### Combined Mode (Default)
|
|
268
|
+
|
|
269
|
+
```typescript
|
|
270
|
+
const result = await extractPdfContent("document.pdf");
|
|
271
|
+
// Full extraction with text and image references
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
## Error Handling
|
|
275
|
+
|
|
276
|
+
```typescript
|
|
277
|
+
import { extractPdfContent } from "pdf-plus";
|
|
278
|
+
|
|
279
|
+
try {
|
|
280
|
+
const result = await extractPdfContent("document.pdf");
|
|
281
|
+
} catch (error) {
|
|
282
|
+
if (error.code === "VALIDATION_ERROR") {
|
|
283
|
+
console.error("Configuration error:", error.validationErrors);
|
|
284
|
+
} else if (error.code === "EXTRACTION_ERROR") {
|
|
285
|
+
console.error("Extraction failed:", error.message);
|
|
286
|
+
} else {
|
|
287
|
+
console.error("Unexpected error:", error);
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
## Development
|
|
293
|
+
|
|
294
|
+
```bash
|
|
295
|
+
# Install dependencies
|
|
296
|
+
pnpm install
|
|
297
|
+
|
|
298
|
+
# Build the library
|
|
299
|
+
pnpm run build
|
|
300
|
+
|
|
301
|
+
# Lint and format
|
|
302
|
+
pnpm run lint:fix
|
|
303
|
+
pnpm run format
|
|
304
|
+
|
|
305
|
+
# Type checking
|
|
306
|
+
pnpm run check
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
## Requirements
|
|
310
|
+
|
|
311
|
+
- Node.js >= 18.0.0
|
|
312
|
+
- TypeScript >= 5.0 (for development)
|
|
313
|
+
|
|
314
|
+
## License
|
|
315
|
+
|
|
316
|
+
MIT
|
|
317
|
+
|
|
318
|
+
## Contributing
|
|
319
|
+
|
|
320
|
+
Contributions are welcome! Please read our contributing guidelines and submit pull requests to our repository.
|
|
321
|
+
|
|
322
|
+
## Troubleshooting
|
|
323
|
+
|
|
324
|
+
### Common Issues
|
|
325
|
+
|
|
326
|
+
#### "Cannot find module" errors
|
|
327
|
+
|
|
328
|
+
Make sure you're using the correct import syntax for your environment:
|
|
329
|
+
|
|
330
|
+
```typescript
|
|
331
|
+
// ESM (recommended)
|
|
332
|
+
import { extractPdfContent } from "pdf-plus";
|
|
333
|
+
|
|
334
|
+
// CommonJS
|
|
335
|
+
const { extractPdfContent } = require("pdf-plus");
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
#### Memory issues with large PDFs
|
|
339
|
+
|
|
340
|
+
For large documents, cap memory usage and process pages in smaller batches (a dedicated streaming API is planned but not yet available):
|
|
341
|
+
|
|
342
|
+
```typescript
|
|
343
|
+
const result = await extractPdfContent("large-document.pdf", {
|
|
344
|
+
memoryLimit: "512MB",
|
|
345
|
+
batchSize: 5,
|
|
346
|
+
useCache: true,
|
|
347
|
+
});
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
#### Image extraction not working
|
|
351
|
+
|
|
352
|
+
Try a different image extraction engine:
|
|
353
|
+
|
|
354
|
+
```typescript
|
|
355
|
+
const result = await extractPdfContent("document.pdf", {
|
|
356
|
+
imageEngine: "poppler", // or 'pdf-lib', 'auto'
|
|
357
|
+
extractImageFiles: true,
|
|
358
|
+
});
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
#### Text extraction issues
|
|
362
|
+
|
|
363
|
+
Some PDFs may have text-encoding issues. Try an alternative text engine and enable verbose logging to see what is happening:
|
|
364
|
+
|
|
365
|
+
```typescript
|
|
366
|
+
const result = await extractPdfContent("document.pdf", {
|
|
367
|
+
extractText: true,
|
|
368
|
+
textEngine: "pdfjs", // Alternative engine
|
|
369
|
+
verbose: true, // See detailed logs
|
|
370
|
+
});
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
### Performance Tips
|
|
374
|
+
|
|
375
|
+
1. **Use specific extraction modes** for better performance:
|
|
376
|
+
|
|
377
|
+
```typescript
|
|
378
|
+
// Text only (fastest)
|
|
379
|
+
const text = await extractText("document.pdf");
|
|
380
|
+
|
|
381
|
+
// Images only
|
|
382
|
+
const images = await extractImages("document.pdf");
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
2. **Enable caching** for repeated operations:
|
|
386
|
+
|
|
387
|
+
```typescript
|
|
388
|
+
const extractor = new PDFExtractor("./cache");
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
3. **Process pages in batches** for large documents:
|
|
392
|
+
```typescript
|
|
393
|
+
const result = await extractPdfContent("large.pdf", {
|
|
394
|
+
batchSize: 10,
|
|
395
|
+
memoryLimit: "1GB",
|
|
396
|
+
});
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
### Getting Help
|
|
400
|
+
|
|
401
|
+
- Check the [Issues](https://github.com/kauandotnet/pdfnode/issues) page
|
|
402
|
+
- Review [examples](./examples/) for common use cases
|
|
403
|
+
- Enable verbose logging for debugging: `{ verbose: true }`
|
|
404
|
+
|
|
405
|
+
## Roadmap
|
|
406
|
+
|
|
407
|
+
### Planned Features
|
|
408
|
+
|
|
409
|
+
- **OCR Support**: Text extraction from image-based PDFs
|
|
410
|
+
- **Advanced Text Analysis**: Font detection, text classification
|
|
411
|
+
- **Streaming API**: Process large documents efficiently
|
|
412
|
+
- **Cloud Integration**: Direct integration with cloud storage
|
|
413
|
+
- **CLI Tool**: Command-line interface for batch processing
|
|
414
|
+
- **Web Worker Support**: Browser-based extraction
|
|
415
|
+
- **Plugin System**: Extensible architecture for custom extractors
|
|
416
|
+
|
|
417
|
+
### Version 1.x Roadmap
|
|
418
|
+
|
|
419
|
+
- [ ] OCR integration with Tesseract.js
|
|
420
|
+
- [ ] Advanced image processing options
|
|
421
|
+
- [ ] Streaming extraction API
|
|
422
|
+
- [ ] Performance optimizations
|
|
423
|
+
- [ ] Browser compatibility layer
|
|
424
|
+
- [ ] CLI tool development
|
|
425
|
+
|
|
426
|
+
See [CHANGELOG.md](./CHANGELOG.md) for detailed version history.
|