pdf-plus 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 PDF Extractor Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,426 @@
1
+ # pdf-plus
2
+
3
+ A comprehensive PDF content extraction library with support for text, images, and structured data.
4
+
5
+ ## Features
6
+
7
+ - 📝 **Text Extraction** - High-quality text extraction with positioning
8
+ - 🖼️ **Image Detection** - Detect and reference images in PDF content
9
+ - 💾 **Image File Extraction** - Extract actual image files from PDFs
10
+ - 🎨 **Flexible Formatting** - Customizable image reference formats
11
+ - ⚡ **Performance Options** - Text-only, images-only, or combined modes
12
+ - 🔧 **TypeScript Support** - Full TypeScript definitions included
13
+ - 🛡️ **Robust Validation** - Comprehensive input validation and error handling
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ # Using pnpm (recommended)
19
+ pnpm add pdf-plus
20
+
21
+ # Using npm
22
+ npm install pdf-plus
23
+
24
+ # Using yarn
25
+ yarn add pdf-plus
26
+ ```
27
+
28
+ ## Quick Start
29
+
30
+ ```typescript
31
+ import { extractPdfContent } from "pdf-plus";
32
+
33
+ // Extract both text and images
34
+ const result = await extractPdfContent("document.pdf", {
35
+ extractText: true,
36
+ extractImages: true,
37
+ verbose: true,
38
+ });
39
+
40
+ console.log(
41
+ `Extracted ${result.images.length} images from ${result.document.pages} pages`
42
+ );
43
+ console.log(`Text content: ${result.cleanText.substring(0, 100)}...`);
44
+ ```
45
+
46
+ ## Usage Examples
47
+
48
+ ### Text-Only Extraction (Fast)
49
+
50
+ ```typescript
51
+ import { extractText } from "pdf-plus";
52
+
53
+ const text = await extractText("document.pdf");
54
+ console.log(`Extracted ${text.length} characters`);
55
+ ```
56
+
57
+ ### Images-Only Extraction
58
+
59
+ ```typescript
60
+ import { extractImages } from "pdf-plus";
61
+
62
+ const images = await extractImages("document.pdf", {
63
+ extractImageFiles: true,
64
+ imageOutputDir: "./my-images",
65
+ });
66
+
67
+ console.log(`Found ${images.length} images`);
68
+ ```
69
+
70
+ ### Custom Image References
71
+
72
+ ```typescript
73
+ import { extractPdfContent } from "pdf-plus";
74
+
75
+ const result = await extractPdfContent("document.pdf", {
76
+ imageRefFormat: "📷 Image {index} on page {page}",
77
+ extractImageFiles: true,
78
+ useImagePaths: true,
79
+ });
80
+
81
+ // Text will contain: "📷 Image 1 on page 1" instead of "[IMAGE:img_1]"
82
+ ```
83
+
84
+ ### Advanced Configuration
85
+
86
+ ```typescript
87
+ import { PDFExtractor } from "pdf-plus";
88
+
89
+ const extractor = new PDFExtractor();
90
+
91
+ const result = await extractor.extract("large-document.pdf", {
92
+ extractText: true,
93
+ extractImages: true,
94
+ extractImageFiles: true,
95
+ imageOutputDir: "./extracted-images",
96
+ memoryLimit: "1GB",
97
+ batchSize: 10,
98
+ progressCallback: (progress) => {
99
+ console.log(
100
+ `Processing page ${progress.currentPage}/${progress.totalPages}`
101
+ );
102
+ },
103
+ });
104
+ ```
105
+
106
+ ### Real-World Examples
107
+
108
+ #### Extract and Save Images from Academic Papers
109
+
110
+ ```typescript
111
+ import { extractPdfContent } from "pdf-plus";
112
+ import path from "path";
113
+
114
+ async function extractAcademicPaper(pdfPath: string) {
115
+ const result = await extractPdfContent(pdfPath, {
116
+ extractText: true,
117
+ extractImages: true,
118
+ extractImageFiles: true,
119
+ imageOutputDir: "./paper-images",
120
+ imageRefFormat: "Figure {index}: {name}",
121
+ verbose: true,
122
+ });
123
+
124
+ // Save text content
125
+ const fs = await import("fs");
126
+ fs.writeFileSync("./paper-text.txt", result.cleanText);
127
+
128
+ // Log extraction summary
129
+ console.log(`📄 Extracted from ${result.document.filename}:`);
130
+ console.log(` 📝 Text: ${result.document.textLength} characters`);
131
+ console.log(` 🖼️ Images: ${result.images.length} found`);
132
+ console.log(` 📊 Pages: ${result.document.pages}`);
133
+
134
+ return result;
135
+ }
136
+ ```
137
+
138
+ #### Batch Process Multiple PDFs
139
+
140
+ ```typescript
141
+ import { PDFExtractor } from "pdf-plus";
142
+ import { glob } from "glob";
143
+
144
+ async function batchProcessPDFs(pattern: string) {
145
+ const extractor = new PDFExtractor("./cache"); // Enable caching
146
+ const pdfFiles = await glob(pattern);
147
+
148
+ const results = [];
149
+
150
+ for (const pdfFile of pdfFiles) {
151
+ console.log(`Processing: ${pdfFile}`);
152
+
153
+ try {
154
+ const result = await extractor.extract(pdfFile, {
155
+ extractText: true,
156
+ extractImages: true,
157
+ imageOutputDir: `./output/${path.basename(pdfFile, ".pdf")}`,
158
+ batchSize: 5, // Process 5 pages at a time
159
+ verbose: false,
160
+ });
161
+
162
+ results.push({
163
+ file: pdfFile,
164
+ success: true,
165
+ pages: result.document.pages,
166
+ images: result.images.length,
167
+ textLength: result.document.textLength,
168
+ });
169
+ } catch (error) {
170
+ console.error(`Failed to process ${pdfFile}:`, error);
171
+ results.push({
172
+ file: pdfFile,
173
+ success: false,
174
+ error: error.message,
175
+ });
176
+ }
177
+ }
178
+
179
+ return results;
180
+ }
181
+ ```
182
+
183
+ ## API Reference
184
+
185
+ ### Main Functions
186
+
187
+ #### `extractPdfContent(pdfPath, options)`
188
+
189
+ Extract complete content from a PDF file.
190
+
191
+ **Parameters:**
192
+
193
+ - `pdfPath` (string) - Path to the PDF file
194
+ - `options` (ExtractionOptions, optional) - Extraction configuration; when omitted, the defaults listed under [Options](#options) apply
195
+
196
+ **Returns:** `Promise<ExtractionResult>`
197
+
198
+ #### `extractText(pdfPath, options)`
199
+
200
+ Extract only text content (optimized for speed).
201
+
202
+ **Returns:** `Promise<string>`
203
+
204
+ #### `extractImages(pdfPath, options)`
205
+
206
+ Extract only image references.
207
+
208
+ **Returns:** `Promise<ImageItem[]>`
209
+
210
+ #### `extractImageFiles(pdfPath, outputDir, options)`
211
+
212
+ Extract and save actual image files.
213
+
214
+ **Returns:** `Promise<string[]>` - Array of saved file paths
215
+
216
+ ### Options
217
+
218
+ ```typescript
219
+ interface ExtractionOptions {
220
+ extractText?: boolean; // Extract text content (default: true)
221
+ extractImages?: boolean; // Extract image references (default: true)
222
+ extractImageFiles?: boolean; // Save actual image files (default: false)
223
+ useImagePaths?: boolean; // Use file paths in references (default: false)
224
+ imageOutputDir?: string; // Directory for image files (default: './extracted-images')
225
+ imageRefFormat?: string; // Custom reference format (default: '[IMAGE:{id}]')
226
+ baseName?: string; // Base name for output files
227
+ verbose?: boolean; // Show detailed progress (default: false)
228
+ memoryLimit?: string; // Memory limit (e.g., '512MB', '1GB')
229
+ batchSize?: number; // Pages per batch (1-100)
230
+ progressCallback?: (progress: ProgressInfo) => void;
231
+ }
232
+ ```
233
+
234
+ ### Format Placeholders
235
+
236
+ Use these placeholders in `imageRefFormat`:
237
+
238
+ - `{id}` - Unique image ID (e.g., `img_1`)
239
+ - `{name}` - Original image name from PDF
240
+ - `{page}` - Page number
241
+ - `{index}` - Global image index
242
+ - `{path}` - File path (when `extractImageFiles` is true)
243
+
244
+ **Examples:**
245
+
246
+ - `[IMAGE:{id}]` → `[IMAGE:img_1]`
247
+ - `📷 Image {index}` → `📷 Image 1`
248
+ - `{name} on page {page}` → `artwork_1 on page 5`
249
+ - `<img src="{path}">` → `<img src="./images/img_1.jpg">`
250
+
251
+ ## Performance Modes
252
+
253
+ ### Text-Only Mode (Fastest)
254
+
255
+ ```typescript
256
+ const text = await extractText("document.pdf");
257
+ // ~40% faster than combined mode
258
+ ```
259
+
260
+ ### Images-Only Mode
261
+
262
+ ```typescript
263
+ const images = await extractImages("document.pdf");
264
+ // ~20% faster than combined mode
265
+ ```
266
+
267
+ ### Combined Mode (Default)
268
+
269
+ ```typescript
270
+ const result = await extractPdfContent("document.pdf");
271
+ // Full extraction with text and image references
272
+ ```
273
+
274
+ ## Error Handling
275
+
276
+ ```typescript
277
+ import { extractPdfContent } from "pdf-plus";
278
+
279
+ try {
280
+ const result = await extractPdfContent("document.pdf");
281
+ } catch (error) {
282
+ if (error.code === "VALIDATION_ERROR") {
283
+ console.error("Configuration error:", error.validationErrors);
284
+ } else if (error.code === "EXTRACTION_ERROR") {
285
+ console.error("Extraction failed:", error.message);
286
+ } else {
287
+ console.error("Unexpected error:", error);
288
+ }
289
+ }
290
+ ```
291
+
292
+ ## Development
293
+
294
+ ```bash
295
+ # Install dependencies
296
+ pnpm install
297
+
298
+ # Build the library
299
+ pnpm run build
300
+
301
+ # Lint and format
302
+ pnpm run lint:fix
303
+ pnpm run format
304
+
305
+ # Type checking
306
+ pnpm run check
307
+ ```
308
+
309
+ ## Requirements
310
+
311
+ - Node.js >= 18.0.0
312
+ - TypeScript >= 5.0 (for development)
313
+
314
+ ## License
315
+
316
+ MIT
317
+
318
+ ## Contributing
319
+
320
+ Contributions are welcome! Please read our contributing guidelines and submit pull requests to our repository.
321
+
322
+ ## Troubleshooting
323
+
324
+ ### Common Issues
325
+
326
+ #### "Cannot find module" errors
327
+
328
+ Make sure you're using the correct import syntax for your environment:
329
+
330
+ ```typescript
331
+ // ESM (recommended)
332
+ import { extractPdfContent } from "pdf-plus";
333
+
334
+ // CommonJS
335
+ const { extractPdfContent } = require("pdf-plus");
336
+ ```
337
+
338
+ #### Memory issues with large PDFs
339
+
340
+ For large documents, cap memory usage and process pages in smaller batches (a dedicated streaming API is planned; see the Roadmap):
341
+
342
+ ```typescript
343
+ const result = await extractPdfContent("large-document.pdf", {
344
+ memoryLimit: "512MB",
345
+ batchSize: 5,
346
+ useCache: true,
347
+ });
348
+ ```
349
+
350
+ #### Image extraction not working
351
+
352
+ If images are missing or fail to extract, try a different image extraction engine via the `imageEngine` option:
353
+
354
+ ```typescript
355
+ const result = await extractPdfContent("document.pdf", {
356
+ imageEngine: "poppler", // or 'pdf-lib', 'auto'
357
+ extractImageFiles: true,
358
+ });
359
+ ```
360
+
361
+ #### Text extraction issues
362
+
363
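+ Some PDFs have text-encoding issues that produce garbled or missing text. Try an alternative text engine via the `textEngine` option, and enable `verbose` to see detailed logs: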
364
+
365
+ ```typescript
366
+ const result = await extractPdfContent("document.pdf", {
367
+ extractText: true,
368
+ textEngine: "pdfjs", // Alternative engine
369
+ verbose: true, // See detailed logs
370
+ });
371
+ ```
372
+
373
+ ### Performance Tips
374
+
375
+ 1. **Use specific extraction modes** for better performance:
376
+
377
+ ```typescript
378
+ // Text only (fastest)
379
+ const text = await extractText("document.pdf");
380
+
381
+ // Images only
382
+ const images = await extractImages("document.pdf");
383
+ ```
384
+
385
+ 2. **Enable caching** for repeated operations:
386
+
387
+ ```typescript
388
+ const extractor = new PDFExtractor("./cache");
389
+ ```
390
+
391
+ 3. **Process pages in batches** for large documents:
392
+ ```typescript
393
+ const result = await extractPdfContent("large.pdf", {
394
+ batchSize: 10,
395
+ memoryLimit: "1GB",
396
+ });
397
+ ```
398
+
399
+ ### Getting Help
400
+
401
+ - Check the [Issues](https://github.com/kauandotnet/pdfnode/issues) page
402
+ - Review [examples](./examples/) for common use cases
403
+ - Enable verbose logging for debugging: `{ verbose: true }`
404
+
405
+ ## Roadmap
406
+
407
+ ### Planned Features
408
+
409
+ - **OCR Support**: Text extraction from image-based PDFs
410
+ - **Advanced Text Analysis**: Font detection, text classification
411
+ - **Streaming API**: Process large documents efficiently
412
+ - **Cloud Integration**: Direct integration with cloud storage
413
+ - **CLI Tool**: Command-line interface for batch processing
414
+ - **Web Worker Support**: Browser-based extraction
415
+ - **Plugin System**: Extensible architecture for custom extractors
416
+
417
+ ### Version 1.x Roadmap
418
+
419
+ - [ ] OCR integration with Tesseract.js
420
+ - [ ] Advanced image processing options
421
+ - [ ] Streaming extraction API
422
+ - [ ] Performance optimizations
423
+ - [ ] Browser compatibility layer
424
+ - [ ] CLI tool development
425
+
426
+ See [CHANGELOG.md](./CHANGELOG.md) for detailed version history.