pdf-plus 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +96 -25
- package/dist/index.d.mts +51 -1
- package/dist/index.d.ts +51 -1
- package/dist/index.js +25 -25
- package/dist/index.mjs +25 -25
- package/package.json +2 -3
package/README.md
CHANGED
|
@@ -82,33 +82,55 @@ for await (const event of stream) {
|
|
|
82
82
|
|
|
83
83
|
See [PHASE4-STREAMING.md](./PHASE4-STREAMING.md) for complete streaming API documentation.
|
|
84
84
|
|
|
85
|
-
###
|
|
85
|
+
### Generate Page Images (NEW! - Phase 5)
|
|
86
86
|
|
|
87
|
-
|
|
87
|
+
Render PDF pages to high-quality images with a simple function call:
|
|
88
88
|
|
|
89
89
|
```typescript
|
|
90
|
-
import {
|
|
90
|
+
import { generatePageImages } from "pdf-plus";
|
|
91
91
|
|
|
92
|
-
|
|
92
|
+
// Simple - render all pages to JPG images
|
|
93
|
+
const imagePaths = await generatePageImages(
|
|
94
|
+
"document.pdf", // PDF file path
|
|
95
|
+
"./page-images" // Output directory where images will be saved
|
|
96
|
+
);
|
|
93
97
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
verbose: true,
|
|
100
|
-
});
|
|
98
|
+
console.log(`Generated ${imagePaths.length} page images`);
|
|
99
|
+
// Returns: ['/path/to/page-images/jpg/page-001.jpg', '/path/to/page-images/jpg/page-002.jpg', ...]
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**With Options:**
|
|
101
103
|
|
|
102
|
-
|
|
104
|
+
```typescript
|
|
105
|
+
const imagePaths = await generatePageImages("document.pdf", "./page-images", {
|
|
106
|
+
pageImageFormat: "jpg", // 'jpg', 'png', or 'webp'
|
|
107
|
+
pageImageDpi: 150, // DPI quality (72, 150, 300, 600)
|
|
108
|
+
pageRenderEngine: "poppler", // 'poppler' (recommended) or 'pdfjs'
|
|
109
|
+
specificPages: [1, 2, 3], // Optional: only render specific pages
|
|
110
|
+
parallelProcessing: true, // Parallel rendering (default: true)
|
|
111
|
+
maxConcurrentPages: 10, // Max parallel pages (default: 10)
|
|
112
|
+
verbose: true, // Show progress
|
|
113
|
+
});
|
|
103
114
|
```
|
|
104
115
|
|
|
105
116
|
**Features:**
|
|
106
117
|
|
|
107
|
-
- 🎨 **Multiple formats** -
|
|
108
|
-
- 📐 **Quality control** - Adjustable DPI (72, 150, 300, 600)
|
|
109
|
-
- 📄 **Page selection** -
|
|
110
|
-
-
|
|
111
|
-
-
|
|
118
|
+
- 🎨 **Multiple formats** - JPG, PNG, WebP
|
|
119
|
+
- 📐 **Quality control** - Adjustable DPI (72, 150, 300, 600)
|
|
120
|
+
- 📄 **Page selection** - Render specific pages or all pages
|
|
121
|
+
- 🚀 **Parallel rendering** - Fast multi-page processing
|
|
122
|
+
- 📁 **Returns file paths** - Array of absolute paths to generated images
|
|
123
|
+
- 🔧 **Two engines** - Poppler (best quality) or PDF.js
|
|
124
|
+
|
|
125
|
+
**Output Structure:**
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
page-images/
|
|
129
|
+
└── jpg/
|
|
130
|
+
├── page-001.jpg
|
|
131
|
+
├── page-002.jpg
|
|
132
|
+
└── page-003.jpg
|
|
133
|
+
```
|
|
112
134
|
|
|
113
135
|
See [PAGE-TO-IMAGE-FEATURE.md](./PAGE-TO-IMAGE-FEATURE.md) for complete page-to-image documentation.
|
|
114
136
|
|
|
@@ -123,17 +145,33 @@ const text = await extractText("document.pdf");
|
|
|
123
145
|
console.log(`Extracted ${text.length} characters`);
|
|
124
146
|
```
|
|
125
147
|
|
|
126
|
-
### Images
|
|
148
|
+
### Extract Embedded Images
|
|
127
149
|
|
|
128
150
|
```typescript
|
|
129
|
-
import {
|
|
151
|
+
import { extractImageFiles } from "pdf-plus";
|
|
130
152
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
153
|
+
// Extract and save embedded images from PDF
|
|
154
|
+
const imagePaths = await extractImageFiles(
|
|
155
|
+
"document.pdf",
|
|
156
|
+
"./extracted-images" // Output directory for embedded images
|
|
157
|
+
);
|
|
135
158
|
|
|
136
|
-
console.log(`
|
|
159
|
+
console.log(`Extracted ${imagePaths.length} embedded images`);
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Generate Page Images (Render Pages)
|
|
163
|
+
|
|
164
|
+
```typescript
|
|
165
|
+
import { generatePageImages } from "pdf-plus";
|
|
166
|
+
|
|
167
|
+
// Render PDF pages to image files
|
|
168
|
+
const imagePaths = await generatePageImages(
|
|
169
|
+
"document.pdf",
|
|
170
|
+
"./page-images" // Output directory for page images
|
|
171
|
+
);
|
|
172
|
+
|
|
173
|
+
console.log(`Generated ${imagePaths.length} page images`);
|
|
174
|
+
// Each page becomes an image: page-001.jpg, page-002.jpg, etc.
|
|
137
175
|
```
|
|
138
176
|
|
|
139
177
|
### Image Extraction with Optimization
|
|
@@ -364,10 +402,43 @@ Extract only image references.
|
|
|
364
402
|
|
|
365
403
|
#### `extractImageFiles(pdfPath, outputDir, options)`
|
|
366
404
|
|
|
367
|
-
Extract and save
|
|
405
|
+
Extract and save embedded image files from a PDF.
|
|
406
|
+
|
|
407
|
+
**Parameters:**
|
|
408
|
+
|
|
409
|
+
- `pdfPath` - Path to the PDF file
|
|
410
|
+
- `outputDir` - Optional output directory path where embedded images will be saved
|
|
411
|
+
- `options` - Optional extraction options
|
|
368
412
|
|
|
369
413
|
**Returns:** `Promise<string[]>` - Array of saved file paths
|
|
370
414
|
|
|
415
|
+
#### `generatePageImages(pdfPath, outputDir, options)`
|
|
416
|
+
|
|
417
|
+
Render PDF pages to image files (page-to-image conversion).
|
|
418
|
+
|
|
419
|
+
**Parameters:**
|
|
420
|
+
|
|
421
|
+
- `pdfPath` - Path to the PDF file
|
|
422
|
+
- `outputDir` - Optional output directory path where page images will be saved
|
|
423
|
+
- `options` - Optional rendering options (pageImageFormat, pageImageDpi, pageRenderEngine, etc.)
|
|
424
|
+
|
|
425
|
+
**Returns:** `Promise<string[]>` - Array of absolute paths to generated page images
|
|
426
|
+
|
|
427
|
+
**Example:**
|
|
428
|
+
|
|
429
|
+
```typescript
|
|
430
|
+
import { generatePageImages } from "pdf-plus";
|
|
431
|
+
|
|
432
|
+
const imagePaths = await generatePageImages("document.pdf", "./page-images", {
|
|
433
|
+
pageImageFormat: "jpg",
|
|
434
|
+
pageImageDpi: 150,
|
|
435
|
+
pageRenderEngine: "poppler",
|
|
436
|
+
});
|
|
437
|
+
|
|
438
|
+
console.log(`Generated ${imagePaths.length} page images`);
|
|
439
|
+
// Returns: ['/absolute/path/to/page-images/jpg/page-001.jpg', ...]
|
|
440
|
+
```
|
|
441
|
+
|
|
371
442
|
### Options
|
|
372
443
|
|
|
373
444
|
```typescript
|
package/dist/index.d.mts
CHANGED
|
@@ -660,6 +660,29 @@ declare class PDFExtractor {
|
|
|
660
660
|
* Extract and save image files
|
|
661
661
|
*/
|
|
662
662
|
extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
663
|
+
/**
|
|
664
|
+
* Generate page images (render PDF pages to image files)
|
|
665
|
+
*
|
|
666
|
+
* This is a simplified method to only render PDF pages to images
|
|
667
|
+
* without extracting embedded images or text.
|
|
668
|
+
*
|
|
669
|
+
* @param pdfPath - Path to the PDF file
|
|
670
|
+
* @param outputDir - Directory to save page images
|
|
671
|
+
* @param options - Optional configuration (pageImageFormat, pageImageDpi, pageRenderEngine, etc.)
|
|
672
|
+
* @returns Promise resolving to array of generated image file paths
|
|
673
|
+
*
|
|
674
|
+
* @example
|
|
675
|
+
* ```typescript
|
|
676
|
+
* const extractor = new PDFExtractor();
|
|
677
|
+
* const imagePaths = await extractor.generatePageImages('document.pdf', './page-images', {
|
|
678
|
+
* pageImageFormat: 'jpg',
|
|
679
|
+
* pageImageDpi: 150,
|
|
680
|
+
* pageRenderEngine: 'poppler'
|
|
681
|
+
* });
|
|
682
|
+
* console.log(`Generated ${imagePaths.length} page images`);
|
|
683
|
+
* ```
|
|
684
|
+
*/
|
|
685
|
+
generatePageImages(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
663
686
|
private validateConfiguration;
|
|
664
687
|
private processResults;
|
|
665
688
|
/**
|
|
@@ -1658,6 +1681,32 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
|
|
|
1658
1681
|
* ```
|
|
1659
1682
|
*/
|
|
1660
1683
|
declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
1684
|
+
/**
|
|
1685
|
+
* Generate page images from a PDF (render pages to image files)
|
|
1686
|
+
*
|
|
1687
|
+
* This is a convenience function to render PDF pages to images without
|
|
1688
|
+
* extracting embedded images or text. Perfect for creating page previews
|
|
1689
|
+
* or thumbnails.
|
|
1690
|
+
*
|
|
1691
|
+
* @param pdfPath - Path to the PDF file
|
|
1692
|
+
* @param outputDir - Directory to save page images
|
|
1693
|
+
* @param options - Page rendering options
|
|
1694
|
+
* @returns Promise resolving to array of generated image file paths
|
|
1695
|
+
*
|
|
1696
|
+
* @example
|
|
1697
|
+
* ```typescript
|
|
1698
|
+
* import { generatePageImages } from 'pdf-plus';
|
|
1699
|
+
*
|
|
1700
|
+
* const imagePaths = await generatePageImages('document.pdf', './page-images', {
|
|
1701
|
+
* pageImageFormat: 'jpg',
|
|
1702
|
+
* pageImageDpi: 150,
|
|
1703
|
+
* pageRenderEngine: 'poppler'
|
|
1704
|
+
* });
|
|
1705
|
+
*
|
|
1706
|
+
* console.log(`Generated ${imagePaths.length} page images`);
|
|
1707
|
+
* ```
|
|
1708
|
+
*/
|
|
1709
|
+
declare function generatePageImages(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
1661
1710
|
/**
|
|
1662
1711
|
* Extract PDF content in streaming mode (Phase 4 - NEW!)
|
|
1663
1712
|
*
|
|
@@ -1717,6 +1766,7 @@ declare const _default: {
|
|
|
1717
1766
|
extractText: typeof extractText;
|
|
1718
1767
|
extractImages: typeof extractImages;
|
|
1719
1768
|
extractImageFiles: typeof extractImageFiles;
|
|
1769
|
+
generatePageImages: typeof generatePageImages;
|
|
1720
1770
|
extractPdfStream: typeof extractPdfStream;
|
|
1721
1771
|
validateConfig: typeof validateConfig;
|
|
1722
1772
|
validateImageRefFormat: typeof validateImageRefFormat;
|
|
@@ -1724,4 +1774,4 @@ declare const _default: {
|
|
|
1724
1774
|
version: string;
|
|
1725
1775
|
};
|
|
1726
1776
|
|
|
1727
|
-
export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, PopplerConverter, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
|
|
1777
|
+
export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, PopplerConverter, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, generatePageImages, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
|
package/dist/index.d.ts
CHANGED
|
@@ -660,6 +660,29 @@ declare class PDFExtractor {
|
|
|
660
660
|
* Extract and save image files
|
|
661
661
|
*/
|
|
662
662
|
extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
663
|
+
/**
|
|
664
|
+
* Generate page images (render PDF pages to image files)
|
|
665
|
+
*
|
|
666
|
+
* This is a simplified method to only render PDF pages to images
|
|
667
|
+
* without extracting embedded images or text.
|
|
668
|
+
*
|
|
669
|
+
* @param pdfPath - Path to the PDF file
|
|
670
|
+
* @param outputDir - Directory to save page images
|
|
671
|
+
* @param options - Optional configuration (pageImageFormat, pageImageDpi, pageRenderEngine, etc.)
|
|
672
|
+
* @returns Promise resolving to array of generated image file paths
|
|
673
|
+
*
|
|
674
|
+
* @example
|
|
675
|
+
* ```typescript
|
|
676
|
+
* const extractor = new PDFExtractor();
|
|
677
|
+
* const imagePaths = await extractor.generatePageImages('document.pdf', './page-images', {
|
|
678
|
+
* pageImageFormat: 'jpg',
|
|
679
|
+
* pageImageDpi: 150,
|
|
680
|
+
* pageRenderEngine: 'poppler'
|
|
681
|
+
* });
|
|
682
|
+
* console.log(`Generated ${imagePaths.length} page images`);
|
|
683
|
+
* ```
|
|
684
|
+
*/
|
|
685
|
+
generatePageImages(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
663
686
|
private validateConfiguration;
|
|
664
687
|
private processResults;
|
|
665
688
|
/**
|
|
@@ -1658,6 +1681,32 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
|
|
|
1658
1681
|
* ```
|
|
1659
1682
|
*/
|
|
1660
1683
|
declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
1684
|
+
/**
|
|
1685
|
+
* Generate page images from a PDF (render pages to image files)
|
|
1686
|
+
*
|
|
1687
|
+
* This is a convenience function to render PDF pages to images without
|
|
1688
|
+
* extracting embedded images or text. Perfect for creating page previews
|
|
1689
|
+
* or thumbnails.
|
|
1690
|
+
*
|
|
1691
|
+
* @param pdfPath - Path to the PDF file
|
|
1692
|
+
* @param outputDir - Directory to save page images
|
|
1693
|
+
* @param options - Page rendering options
|
|
1694
|
+
* @returns Promise resolving to array of generated image file paths
|
|
1695
|
+
*
|
|
1696
|
+
* @example
|
|
1697
|
+
* ```typescript
|
|
1698
|
+
* import { generatePageImages } from 'pdf-plus';
|
|
1699
|
+
*
|
|
1700
|
+
* const imagePaths = await generatePageImages('document.pdf', './page-images', {
|
|
1701
|
+
* pageImageFormat: 'jpg',
|
|
1702
|
+
* pageImageDpi: 150,
|
|
1703
|
+
* pageRenderEngine: 'poppler'
|
|
1704
|
+
* });
|
|
1705
|
+
*
|
|
1706
|
+
* console.log(`Generated ${imagePaths.length} page images`);
|
|
1707
|
+
* ```
|
|
1708
|
+
*/
|
|
1709
|
+
declare function generatePageImages(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
1661
1710
|
/**
|
|
1662
1711
|
* Extract PDF content in streaming mode (Phase 4 - NEW!)
|
|
1663
1712
|
*
|
|
@@ -1717,6 +1766,7 @@ declare const _default: {
|
|
|
1717
1766
|
extractText: typeof extractText;
|
|
1718
1767
|
extractImages: typeof extractImages;
|
|
1719
1768
|
extractImageFiles: typeof extractImageFiles;
|
|
1769
|
+
generatePageImages: typeof generatePageImages;
|
|
1720
1770
|
extractPdfStream: typeof extractPdfStream;
|
|
1721
1771
|
validateConfig: typeof validateConfig;
|
|
1722
1772
|
validateImageRefFormat: typeof validateImageRefFormat;
|
|
@@ -1724,4 +1774,4 @@ declare const _default: {
|
|
|
1724
1774
|
version: string;
|
|
1725
1775
|
};
|
|
1726
1776
|
|
|
1727
|
-
export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, PopplerConverter, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
|
|
1777
|
+
export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, PopplerConverter, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, generatePageImages, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
|