pdf-plus 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -82,33 +82,55 @@ for await (const event of stream) {
82
82
 
83
83
  See [PHASE4-STREAMING.md](./PHASE4-STREAMING.md) for complete streaming API documentation.
84
84
 
85
- ### Convert PDF Pages to Images (NEW! - Phase 5)
85
+ ### Generate Page Images (NEW! - Phase 5)
86
86
 
87
- Convert PDF pages to high-quality images (PNG, JPG, WebP):
87
+ Render PDF pages to high-quality images with a simple function call:
88
88
 
89
89
  ```typescript
90
- import { PageToImageConverter } from "pdf-plus";
90
+ import { generatePageImages } from "pdf-plus";
91
91
 
92
- const converter = new PageToImageConverter();
92
+ // Simple - render all pages to JPG images
93
+ const imagePaths = await generatePageImages(
94
+ "document.pdf", // PDF file path
95
+ "./page-images" // Output directory where images will be saved
96
+ );
93
97
 
94
- // Convert all pages to PNG
95
- const result = await converter.convertToImages("document.pdf", {
96
- outputDir: "./page-images",
97
- format: "png",
98
- dpi: 150,
99
- verbose: true,
100
- });
98
+ console.log(`Generated ${imagePaths.length} page images`);
99
+ // Returns: ['/path/to/page-images/jpg/page-001.jpg', '/path/to/page-images/jpg/page-002.jpg', ...]
100
+ ```
101
+
102
+ **With Options:**
101
103
 
102
- console.log(`Converted ${result.totalPages} pages`);
104
+ ```typescript
105
+ const imagePaths = await generatePageImages("document.pdf", "./page-images", {
106
+ pageImageFormat: "jpg", // 'jpg', 'png', or 'webp'
107
+ pageImageDpi: 150, // DPI quality (72, 150, 300, 600)
108
+ pageRenderEngine: "poppler", // 'poppler' (recommended) or 'pdfjs'
109
+ specificPages: [1, 2, 3], // Optional: only render specific pages
110
+ parallelProcessing: true, // Parallel rendering (default: true)
111
+ maxConcurrentPages: 10, // Max parallel pages (default: 10)
112
+ verbose: true, // Show progress
113
+ });
103
114
  ```
104
115
 
105
116
  **Features:**
106
117
 
107
- - 🎨 **Multiple formats** - PNG, JPG, WebP
108
- - 📐 **Quality control** - Adjustable DPI (72, 150, 300, 600) and quality
109
- - 📄 **Page selection** - Convert specific pages or ranges
110
- - 🖼️ **Thumbnails** - Generate low-res previews
111
- - 💾 **Buffer/Base64** - In-memory conversion for web apps
118
+ - 🎨 **Multiple formats** - JPG, PNG, WebP
119
+ - 📐 **Quality control** - Adjustable DPI (72, 150, 300, 600)
120
+ - 📄 **Page selection** - Render specific pages or all pages
121
+ - 🚀 **Parallel rendering** - Fast multi-page processing
122
+ - 📁 **Returns file paths** - Array of absolute paths to generated images
123
+ - 🔧 **Two engines** - Poppler (best quality) or PDF.js
124
+
125
+ **Output Structure:**
126
+
127
+ ```
128
+ page-images/
129
+ └── jpg/
130
+ ├── page-001.jpg
131
+ ├── page-002.jpg
132
+ └── page-003.jpg
133
+ ```
112
134
 
113
135
  See [PAGE-TO-IMAGE-FEATURE.md](./PAGE-TO-IMAGE-FEATURE.md) for complete page-to-image documentation.
114
136
 
@@ -123,17 +145,33 @@ const text = await extractText("document.pdf");
123
145
  console.log(`Extracted ${text.length} characters`);
124
146
  ```
125
147
 
126
- ### Images-Only Extraction
148
+ ### Extract Embedded Images
127
149
 
128
150
  ```typescript
129
- import { extractImages } from "pdf-plus";
151
+ import { extractImageFiles } from "pdf-plus";
130
152
 
131
- const images = await extractImages("document.pdf", {
132
- extractImageFiles: true,
133
- imageOutputDir: "./my-images",
134
- });
153
+ // Extract and save embedded images from PDF
154
+ const imagePaths = await extractImageFiles(
155
+ "document.pdf",
156
+ "./extracted-images" // Output directory for embedded images
157
+ );
135
158
 
136
- console.log(`Found ${images.length} images`);
159
+ console.log(`Extracted ${imagePaths.length} embedded images`);
160
+ ```
161
+
162
+ ### Generate Page Images (Render Pages)
163
+
164
+ ```typescript
165
+ import { generatePageImages } from "pdf-plus";
166
+
167
+ // Render PDF pages to image files
168
+ const imagePaths = await generatePageImages(
169
+ "document.pdf",
170
+ "./page-images" // Output directory for page images
171
+ );
172
+
173
+ console.log(`Generated ${imagePaths.length} page images`);
174
+ // Each page becomes an image: page-001.jpg, page-002.jpg, etc.
137
175
  ```
138
176
 
139
177
  ### Image Extraction with Optimization
@@ -364,10 +402,43 @@ Extract only image references.
364
402
 
365
403
  #### `extractImageFiles(pdfPath, outputDir, options)`
366
404
 
367
- Extract and save actual image files.
405
+ Extract and save embedded image files from a PDF.
406
+
407
+ **Parameters:**
408
+
409
+ - `pdfPath` - Path to the PDF file
410
+ - `outputDir` - Output directory path where embedded images will be saved
411
+ - `options` - Optional extraction options
368
412
 
369
413
  **Returns:** `Promise<string[]>` - Array of saved file paths
370
414
 
415
+ #### `generatePageImages(pdfPath, outputDir, options)`
416
+
417
+ Render PDF pages to image files (page-to-image conversion).
418
+
419
+ **Parameters:**
420
+
421
+ - `pdfPath` - Path to the PDF file
422
+ - `outputDir` - Output directory path where page images will be saved
423
+ - `options` - Optional rendering options (pageImageFormat, pageImageDpi, pageRenderEngine, etc.)
424
+
425
+ **Returns:** `Promise<string[]>` - Array of absolute paths to generated page images
426
+
427
+ **Example:**
428
+
429
+ ```typescript
430
+ import { generatePageImages } from "pdf-plus";
431
+
432
+ const imagePaths = await generatePageImages("document.pdf", "./page-images", {
433
+ pageImageFormat: "jpg",
434
+ pageImageDpi: 150,
435
+ pageRenderEngine: "poppler",
436
+ });
437
+
438
+ console.log(`Generated ${imagePaths.length} page images`);
439
+ // Returns: ['/absolute/path/to/page-images/jpg/page-001.jpg', ...]
440
+ ```
441
+
371
442
  ### Options
372
443
 
373
444
  ```typescript
package/dist/index.d.mts CHANGED
@@ -660,6 +660,29 @@ declare class PDFExtractor {
660
660
  * Extract and save image files
661
661
  */
662
662
  extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
663
+ /**
664
+ * Generate page images (render PDF pages to image files)
665
+ *
666
+ * This is a simplified method that only renders PDF pages to images
667
+ * without extracting embedded images or text.
668
+ *
669
+ * @param pdfPath - Path to the PDF file
670
+ * @param outputDir - Directory to save page images
671
+ * @param options - Optional configuration (pageImageFormat, pageImageDpi, pageRenderEngine, etc.)
672
+ * @returns Promise resolving to an array of generated image file paths
673
+ *
674
+ * @example
675
+ * ```typescript
676
+ * const extractor = new PDFExtractor();
677
+ * const imagePaths = await extractor.generatePageImages('document.pdf', './page-images', {
678
+ * pageImageFormat: 'jpg',
679
+ * pageImageDpi: 150,
680
+ * pageRenderEngine: 'poppler'
681
+ * });
682
+ * console.log(`Generated ${imagePaths.length} page images`);
683
+ * ```
684
+ */
685
+ generatePageImages(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
663
686
  private validateConfiguration;
664
687
  private processResults;
665
688
  /**
@@ -1658,6 +1681,32 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
1658
1681
  * ```
1659
1682
  */
1660
1683
  declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
1684
+ /**
1685
+ * Generate page images from a PDF (render pages to image files)
1686
+ *
1687
+ * This is a convenience function to render PDF pages to images without
1688
+ * extracting embedded images or text. Perfect for creating page previews
1689
+ * or thumbnails.
1690
+ *
1691
+ * @param pdfPath - Path to the PDF file
1692
+ * @param outputDir - Directory to save page images
1693
+ * @param options - Page rendering options
1694
+ * @returns Promise resolving to an array of generated image file paths
1695
+ *
1696
+ * @example
1697
+ * ```typescript
1698
+ * import { generatePageImages } from 'pdf-plus';
1699
+ *
1700
+ * const imagePaths = await generatePageImages('document.pdf', './page-images', {
1701
+ * pageImageFormat: 'jpg',
1702
+ * pageImageDpi: 150,
1703
+ * pageRenderEngine: 'poppler'
1704
+ * });
1705
+ *
1706
+ * console.log(`Generated ${imagePaths.length} page images`);
1707
+ * ```
1708
+ */
1709
+ declare function generatePageImages(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
1661
1710
  /**
1662
1711
  * Extract PDF content in streaming mode (Phase 4 - NEW!)
1663
1712
  *
@@ -1717,6 +1766,7 @@ declare const _default: {
1717
1766
  extractText: typeof extractText;
1718
1767
  extractImages: typeof extractImages;
1719
1768
  extractImageFiles: typeof extractImageFiles;
1769
+ generatePageImages: typeof generatePageImages;
1720
1770
  extractPdfStream: typeof extractPdfStream;
1721
1771
  validateConfig: typeof validateConfig;
1722
1772
  validateImageRefFormat: typeof validateImageRefFormat;
@@ -1724,4 +1774,4 @@ declare const _default: {
1724
1774
  version: string;
1725
1775
  };
1726
1776
 
1727
- export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, PopplerConverter, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
1777
+ export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, PopplerConverter, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, generatePageImages, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
package/dist/index.d.ts CHANGED
@@ -660,6 +660,29 @@ declare class PDFExtractor {
660
660
  * Extract and save image files
661
661
  */
662
662
  extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
663
+ /**
664
+ * Generate page images (render PDF pages to image files)
665
+ *
666
+ * This is a simplified method that only renders PDF pages to images
667
+ * without extracting embedded images or text.
668
+ *
669
+ * @param pdfPath - Path to the PDF file
670
+ * @param outputDir - Directory to save page images
671
+ * @param options - Optional configuration (pageImageFormat, pageImageDpi, pageRenderEngine, etc.)
672
+ * @returns Promise resolving to an array of generated image file paths
673
+ *
674
+ * @example
675
+ * ```typescript
676
+ * const extractor = new PDFExtractor();
677
+ * const imagePaths = await extractor.generatePageImages('document.pdf', './page-images', {
678
+ * pageImageFormat: 'jpg',
679
+ * pageImageDpi: 150,
680
+ * pageRenderEngine: 'poppler'
681
+ * });
682
+ * console.log(`Generated ${imagePaths.length} page images`);
683
+ * ```
684
+ */
685
+ generatePageImages(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
663
686
  private validateConfiguration;
664
687
  private processResults;
665
688
  /**
@@ -1658,6 +1681,32 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
1658
1681
  * ```
1659
1682
  */
1660
1683
  declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
1684
+ /**
1685
+ * Generate page images from a PDF (render pages to image files)
1686
+ *
1687
+ * This is a convenience function to render PDF pages to images without
1688
+ * extracting embedded images or text. Perfect for creating page previews
1689
+ * or thumbnails.
1690
+ *
1691
+ * @param pdfPath - Path to the PDF file
1692
+ * @param outputDir - Directory to save page images
1693
+ * @param options - Page rendering options
1694
+ * @returns Promise resolving to an array of generated image file paths
1695
+ *
1696
+ * @example
1697
+ * ```typescript
1698
+ * import { generatePageImages } from 'pdf-plus';
1699
+ *
1700
+ * const imagePaths = await generatePageImages('document.pdf', './page-images', {
1701
+ * pageImageFormat: 'jpg',
1702
+ * pageImageDpi: 150,
1703
+ * pageRenderEngine: 'poppler'
1704
+ * });
1705
+ *
1706
+ * console.log(`Generated ${imagePaths.length} page images`);
1707
+ * ```
1708
+ */
1709
+ declare function generatePageImages(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
1661
1710
  /**
1662
1711
  * Extract PDF content in streaming mode (Phase 4 - NEW!)
1663
1712
  *
@@ -1717,6 +1766,7 @@ declare const _default: {
1717
1766
  extractText: typeof extractText;
1718
1767
  extractImages: typeof extractImages;
1719
1768
  extractImageFiles: typeof extractImageFiles;
1769
+ generatePageImages: typeof generatePageImages;
1720
1770
  extractPdfStream: typeof extractPdfStream;
1721
1771
  validateConfig: typeof validateConfig;
1722
1772
  validateImageRefFormat: typeof validateImageRefFormat;
@@ -1724,4 +1774,4 @@ declare const _default: {
1724
1774
  version: string;
1725
1775
  };
1726
1776
 
1727
- export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, PopplerConverter, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
1777
+ export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, PopplerConverter, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, generatePageImages, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };