pdf-plus 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,3 +1,202 @@
1
+ /**
2
+ * Types for streaming PDF extraction
3
+ */
4
+
5
+ /**
6
+ * Event types emitted during streaming extraction
7
+ */
8
+ type StreamEventType = "start" | "page" | "image" | "progress" | "complete" | "error";
9
+ /**
10
+ * Base event structure
11
+ */
12
+ interface StreamEvent {
13
+ type: StreamEventType;
14
+ timestamp: number;
15
+ }
16
+ /**
17
+ * Start event - emitted when extraction begins
18
+ */
19
+ interface StartEvent extends StreamEvent {
20
+ type: "start";
21
+ totalPages: number;
22
+ pdfPath: string;
23
+ }
24
+ /**
25
+ * Page event - emitted when a page is processed
26
+ */
27
+ interface PageEvent extends StreamEvent {
28
+ type: "page";
29
+ pageNumber: number;
30
+ totalPages: number;
31
+ textLength: number;
32
+ imageCount: number;
33
+ pageInfo?: PageInfo;
34
+ }
35
+ /**
36
+ * Image event - emitted when an image is extracted
37
+ */
38
+ interface ImageEvent extends StreamEvent {
39
+ type: "image";
40
+ image: ImageItem;
41
+ pageNumber: number;
42
+ imageIndex: number;
43
+ totalImages: number;
44
+ }
45
+ /**
46
+ * Progress event - emitted periodically during extraction
47
+ */
48
+ interface ProgressEvent extends StreamEvent {
49
+ type: "progress";
50
+ pagesProcessed: number;
51
+ totalPages: number;
52
+ imagesExtracted: number;
53
+ percentComplete: number;
54
+ estimatedTimeRemaining?: number;
55
+ }
56
+ /**
57
+ * Complete event - emitted when extraction finishes
58
+ */
59
+ interface CompleteEvent extends StreamEvent {
60
+ type: "complete";
61
+ totalPages: number;
62
+ totalImages: number;
63
+ totalTextLength: number;
64
+ duration: number;
65
+ }
66
+ /**
67
+ * Error event - emitted when an error occurs
68
+ */
69
+ interface ErrorEvent extends StreamEvent {
70
+ type: "error";
71
+ error: Error;
72
+ pageNumber?: number;
73
+ recoverable: boolean;
74
+ }
75
+ /**
76
+ * Union type of all stream events
77
+ */
78
+ type StreamEventUnion = StartEvent | PageEvent | ImageEvent | ProgressEvent | CompleteEvent | ErrorEvent;
79
+ /**
80
+ * Streaming extraction options
81
+ */
82
+ interface StreamingOptions$1 {
83
+ /**
84
+ * Enable streaming mode
85
+ * @default false
86
+ */
87
+ streamMode?: boolean;
88
+ /**
89
+ * Automatically enable streaming for PDFs with more than this many pages
90
+ * @default 100
91
+ */
92
+ autoStreamThreshold?: number;
93
+ /**
94
+ * Enable backpressure handling (pause extraction if consumer is slow)
95
+ * @default true
96
+ */
97
+ enableBackpressure?: boolean;
98
+ /**
99
+ * Maximum number of pages to buffer before pausing (backpressure)
100
+ * @default 10
101
+ */
102
+ maxBufferedPages?: number;
103
+ /**
104
+ * Emit progress events every N pages
105
+ * @default 5
106
+ */
107
+ progressInterval?: number;
108
+ /**
109
+ * Enable event callbacks (in addition to async iterator)
110
+ * @default false
111
+ */
112
+ enableEventCallbacks?: boolean;
113
+ }
114
+ /**
115
+ * Event callback function type
116
+ */
117
+ type StreamEventCallback = (event: StreamEventUnion) => void | Promise<void>;
118
+ /**
119
+ * Event callbacks map
120
+ */
121
+ interface StreamEventCallbacks {
122
+ onStart?: (event: StartEvent) => void | Promise<void>;
123
+ onPage?: (event: PageEvent) => void | Promise<void>;
124
+ onImage?: (event: ImageEvent) => void | Promise<void>;
125
+ onProgress?: (event: ProgressEvent) => void | Promise<void>;
126
+ onComplete?: (event: CompleteEvent) => void | Promise<void>;
127
+ onError?: (event: ErrorEvent) => void | Promise<void>;
128
+ onAny?: StreamEventCallback;
129
+ }
130
+ /**
131
+ * Streaming extraction result (async iterator)
132
+ */
133
+ interface StreamingExtractionResult {
134
+ /**
135
+ * Async iterator for streaming events
136
+ */
137
+ [Symbol.asyncIterator](): AsyncIterator<StreamEventUnion>;
138
+ /**
139
+ * Register event callbacks
140
+ */
141
+ on(event: "start", callback: (event: StartEvent) => void | Promise<void>): this;
142
+ on(event: "page", callback: (event: PageEvent) => void | Promise<void>): this;
143
+ on(event: "image", callback: (event: ImageEvent) => void | Promise<void>): this;
144
+ on(event: "progress", callback: (event: ProgressEvent) => void | Promise<void>): this;
145
+ on(event: "complete", callback: (event: CompleteEvent) => void | Promise<void>): this;
146
+ on(event: "error", callback: (event: ErrorEvent) => void | Promise<void>): this;
147
+ on(event: "any", callback: StreamEventCallback): this;
148
+ /**
149
+ * Cancel the streaming extraction
150
+ */
151
+ cancel(): Promise<void>;
152
+ /**
153
+ * Pause the streaming extraction (backpressure)
154
+ */
155
+ pause(): void;
156
+ /**
157
+ * Resume the streaming extraction
158
+ */
159
+ resume(): void;
160
+ /**
161
+ * Get current streaming statistics
162
+ */
163
+ getStats(): StreamingStats;
164
+ }
165
+ /**
166
+ * Streaming statistics
167
+ */
168
+ interface StreamingStats {
169
+ pagesProcessed: number;
170
+ totalPages: number;
171
+ imagesExtracted: number;
172
+ bytesProcessed: number;
173
+ startTime: number;
174
+ elapsedTime: number;
175
+ isPaused: boolean;
176
+ isCancelled: boolean;
177
+ isComplete: boolean;
178
+ averagePageTime: number;
179
+ estimatedTimeRemaining: number;
180
+ }
181
+ /**
182
+ * Internal streaming state
183
+ */
184
+ interface StreamingState {
185
+ totalPages: number;
186
+ pagesProcessed: number;
187
+ imagesExtracted: number;
188
+ totalTextLength: number;
189
+ bytesProcessed: number;
190
+ startTime: number;
191
+ lastProgressTime: number;
192
+ isPaused: boolean;
193
+ isCancelled: boolean;
194
+ isComplete: boolean;
195
+ bufferedPages: number;
196
+ eventQueue: StreamEventUnion[];
197
+ callbacks: StreamEventCallbacks;
198
+ }
199
+
1
200
  /**
2
201
  * Core types for PDF content extraction
3
202
  */
@@ -86,9 +285,9 @@ interface StructuredPageData {
86
285
  totalImages: number;
87
286
  extractionOptions: ExtractionOptions;
88
287
  };
89
- pages: PageData[];
288
+ pages: PageData$1[];
90
289
  }
91
- interface PageData {
290
+ interface PageData$1 {
92
291
  pageNumber: number;
93
292
  text: {
94
293
  content: string;
@@ -98,6 +297,32 @@ interface PageData {
98
297
  };
99
298
  images: PageImageData[];
100
299
  imageCount: number;
300
+ pageImage?: {
301
+ path: string;
302
+ format: string;
303
+ width: number;
304
+ height: number;
305
+ size: number;
306
+ dpi?: number;
307
+ quality?: number;
308
+ };
309
+ thumbnail?: {
310
+ path: string;
311
+ format: string;
312
+ width: number;
313
+ height: number;
314
+ size: number;
315
+ quality?: number;
316
+ };
317
+ pageImageVariants?: Array<{
318
+ path: string;
319
+ format: string;
320
+ width: number;
321
+ height: number;
322
+ size: number;
323
+ quality: number;
324
+ dpi?: number;
325
+ }>;
101
326
  }
102
327
  interface PageImageData {
103
328
  id: string;
@@ -112,8 +337,10 @@ interface PageImageData {
112
337
  };
113
338
  format: string;
114
339
  size?: number;
340
+ width?: number;
341
+ height?: number;
342
+ mimeType?: string;
115
343
  }
116
- type ImageExtractionEngine = "pdf-lib" | "poppler" | "auto";
117
344
  interface ExtractionOptions {
118
345
  extractText?: boolean;
119
346
  extractImages?: boolean;
@@ -132,14 +359,116 @@ interface ExtractionOptions {
132
359
  extractTextItems?: boolean;
133
360
  specificPages?: number[];
134
361
  useCache?: boolean;
135
- /** Image extraction engine to use */
136
- imageEngine?: ImageExtractionEngine;
362
+ /** Enable image optimization after extraction (uses Jimp - pure JavaScript, default: false) */
363
+ optimizeImages?: boolean;
364
+ /** Image quality for optimization (0-100, default: 80) */
365
+ imageQuality?: number;
366
+ /**
367
+ * Convert JPEG 2000 images to JPG format for better compatibility.
368
+ * (default: true - convert JP2 to JPG)
369
+ */
370
+ convertJp2ToJpg?: boolean;
371
+ /**
372
+ * Preserve JPEG 2000 images in their original format.
373
+ * By default (false), JPEG 2000 images (jp2, jpx, j2c, jpm) are converted to JPG for better compatibility.
374
+ * Set to true to keep JPEG 2000 files in their original format.
375
+ *
376
+ * Note: JP2 images from PDFs are automatically decoded by PDF.js during extraction.
377
+ * This option only affects standalone JP2 files.
378
+ * (default: false - convert to JPG)
379
+ */
380
+ preserveJp2?: boolean;
381
+ /**
382
+ * Use Sharp library for ALL image processing operations (better quality & performance).
383
+ *
384
+ * When enabled, Sharp is used as the global image processing engine for:
385
+ * - JP2 to JPG conversion
386
+ * - Image optimization
387
+ * - Image resizing
388
+ * - Format conversions
389
+ *
390
+ * Sharp is an OPTIONAL dependency. Install it for better performance:
391
+ * ```bash
392
+ * npm install sharp
393
+ * ```
394
+ *
395
+ * If Sharp is not installed, the library will automatically fall back to pure JavaScript (Jimp).
396
+ *
397
+ * (default: false - use pure JS Jimp)
398
+ */
399
+ useSharp?: boolean;
400
+ /** Enable parallel processing for better performance (default: true) */
401
+ parallelProcessing?: boolean;
402
+ /** Maximum number of pages to process in parallel (default: 10) */
403
+ maxConcurrentPages?: number;
404
+ /** Maximum number of images per page to extract in parallel (default: 20) */
405
+ maxConcurrentImages?: number;
406
+ /** Maximum number of JP2 to JPG conversions in parallel (default: 5) */
407
+ maxConcurrentConversions?: number;
408
+ /** Maximum number of image optimizations in parallel (default: 5) */
409
+ maxConcurrentOptimizations?: number;
410
+ /** Enable worker threads for CPU-intensive operations (default: false) */
411
+ useWorkerThreads?: boolean;
412
+ /** Auto-scale workers based on system resources (default: true) */
413
+ autoScaleWorkers?: boolean;
414
+ /** Maximum number of worker threads (default: CPU cores - 1) */
415
+ maxWorkerThreads?: number;
416
+ /** Minimum number of worker threads to keep alive (default: 1) */
417
+ minWorkerThreads?: number;
418
+ /** Memory threshold (0-1) for scaling down workers (default: 0.8) */
419
+ memoryThreshold?: number;
420
+ /** CPU threshold (0-1) for scaling up workers (default: 0.9) */
421
+ cpuThreshold?: number;
422
+ /** Worker task timeout in milliseconds (default: 30000) */
423
+ workerTaskTimeout?: number;
424
+ /** Worker idle timeout in milliseconds (default: 60000) */
425
+ workerIdleTimeout?: number;
426
+ /** Memory limit per worker in MB (default: 512) */
427
+ workerMemoryLimit?: number;
428
+ /** Use workers for JP2 conversion (default: true) */
429
+ enableWorkerForConversion?: boolean;
430
+ /** Use workers for image optimization (default: true) */
431
+ enableWorkerForOptimization?: boolean;
432
+ /** Use workers for image decoding (default: true) */
433
+ enableWorkerForDecoding?: boolean;
434
+ /** Enable streaming mode for large PDFs (default: false) */
435
+ streamMode?: boolean;
436
+ /** Automatically enable streaming for PDFs with more than this many pages (default: 100) */
437
+ autoStreamThreshold?: number;
438
+ /** Enable backpressure handling (pause extraction if consumer is slow) (default: true) */
439
+ enableBackpressure?: boolean;
440
+ /** Maximum number of pages to buffer before pausing (default: 10) */
441
+ maxBufferedPages?: number;
442
+ /** Emit progress events every N pages (default: 5) */
443
+ progressInterval?: number;
444
+ /** Enable event callbacks in addition to async iterator (default: false) */
445
+ enableEventCallbacks?: boolean;
137
446
  cacheDir?: string;
138
447
  baseName?: string;
139
448
  verbose?: boolean;
140
449
  memoryLimit?: string;
141
450
  batchSize?: number;
142
451
  progressCallback?: (progress: ProgressInfo) => void;
452
+ /** Generate page images (default: false) */
453
+ generatePageImages?: boolean;
454
+ /** Generate thumbnails for pages (default: false) */
455
+ generateThumbnails?: boolean;
456
+ /** Include page images in structured output (default: false) */
457
+ includePageImagesInStructuredData?: boolean;
458
+ /** Page numbers to generate images for (default: all pages) */
459
+ pageNumbers?: number[];
460
+ /** Generate multiple quality variants of page images */
461
+ pageImageQualities?: number[];
462
+ /** DPI for page images (default: 150) */
463
+ pageImageDpi?: number;
464
+ /** Format for page images: 'png' | 'jpg' (default: 'png') */
465
+ pageImageFormat?: "png" | "jpg";
466
+ /** Quality for JPG page images (default: 90) */
467
+ pageImageQuality?: number;
468
+ /** Thumbnail width (default: 200) */
469
+ thumbnailWidth?: number;
470
+ /** Thumbnail quality for JPG (default: 80) */
471
+ thumbnailQuality?: number;
143
472
  }
144
473
  interface ProgressInfo {
145
474
  currentPage: number;
@@ -243,6 +572,7 @@ interface TemplateOptions {
243
572
  declare class PDFExtractor {
244
573
  private textExtractor;
245
574
  private imageExtractor;
575
+ private pageToImageConverter;
246
576
  private formatProcessor;
247
577
  private structuredDataGenerator;
248
578
  private cacheManager;
@@ -356,6 +686,14 @@ declare class PDFExtractor {
356
686
  totalCacheSize: number;
357
687
  cacheDir: string;
358
688
  };
689
+ /**
690
+ * Generate page images with multiple quality variants
691
+ */
692
+ private generatePageImagesWithVariants;
693
+ /**
694
+ * Generate thumbnails for pages
695
+ */
696
+ private generatePageThumbnails;
359
697
  private reportProgress;
360
698
  private createValidationError;
361
699
  private createExtractionError;
@@ -363,10 +701,70 @@ declare class PDFExtractor {
363
701
  declare const pdfExtractor: PDFExtractor;
364
702
 
365
703
  /**
366
- * Text extraction from PDF files
704
+ * Streaming PDF extractor for large PDFs
705
+ * Provides async iterator and event-based APIs
706
+ */
707
+
708
+ /**
709
+ * Streaming PDF extractor implementation
710
+ */
711
+ declare class StreamingPDFExtractor implements StreamingExtractionResult {
712
+ private state;
713
+ private options;
714
+ private pdfPath;
715
+ private extractor;
716
+ private eventQueue;
717
+ private resolveNext;
718
+ private extractionPromise;
719
+ constructor(pdfPath: string, options?: ExtractionOptions & StreamingOptions$1);
720
+ /**
721
+ * Async iterator implementation
722
+ */
723
+ [Symbol.asyncIterator](): AsyncIterator<StreamEventUnion>;
724
+ /**
725
+ * Register event callbacks
726
+ */
727
+ on(event: "start", callback: (event: StartEvent) => void): this;
728
+ on(event: "page", callback: (event: PageEvent) => void): this;
729
+ on(event: "image", callback: (event: ImageEvent) => void): this;
730
+ on(event: "progress", callback: (event: ProgressEvent) => void): this;
731
+ on(event: "complete", callback: (event: CompleteEvent) => void): this;
732
+ on(event: "error", callback: (event: ErrorEvent) => void): this;
733
+ on(event: "any", callback: (event: StreamEventUnion) => void): this;
734
+ /**
735
+ * Cancel extraction
736
+ */
737
+ cancel(): Promise<void>;
738
+ /**
739
+ * Pause extraction (backpressure)
740
+ */
741
+ pause(): void;
742
+ /**
743
+ * Resume extraction
744
+ */
745
+ resume(): void;
746
+ /**
747
+ * Get streaming statistics
748
+ */
749
+ getStats(): StreamingStats;
750
+ /**
751
+ * Emit an event
752
+ */
753
+ private emitEvent;
754
+ /**
755
+ * Start the extraction process
756
+ */
757
+ private startExtraction;
758
+ }
759
+
760
+ /**
761
+ * Text extraction from PDF files using pdf.js
367
762
  *
368
- * Handles text extraction using pdf-parse library with support for
369
- * page-by-page extraction and metadata retrieval.
763
+ * Direct pdf.js-based text extraction with support for:
764
+ * - Page-by-page extraction with accurate boundaries
765
+ * - Text positioning and font information
766
+ * - Metadata retrieval
767
+ * - No external dependencies (uses pdf.js directly)
370
768
  *
371
769
  * @example
372
770
  * ```typescript
@@ -376,6 +774,19 @@ declare const pdfExtractor: PDFExtractor;
376
774
  * ```
377
775
  */
378
776
  declare class TextExtractor {
777
+ constructor();
778
+ /**
779
+ * Initialize pdf.js worker
780
+ */
781
+ private initializePdfjs;
782
+ /**
783
+ * Load PDF document
784
+ */
785
+ private loadDocument;
786
+ /**
787
+ * Extract text from a single page
788
+ */
789
+ private getPageText;
379
790
  /**
380
791
  * Extract text content from PDF
381
792
  *
@@ -384,6 +795,17 @@ declare class TextExtractor {
384
795
  * @throws {Error} When PDF extraction fails
385
796
  */
386
797
  extract(pdfPath: string): Promise<any>;
798
+ /**
799
+ * Extract text with metadata
800
+ *
801
+ * @param pdfPath - Path to the PDF file
802
+ * @returns Promise resolving to extraction result with text and metadata
803
+ * @throws {Error} When PDF extraction fails
804
+ */
805
+ extractWithMetadata(pdfPath: string): Promise<{
806
+ text: string;
807
+ metadata: any;
808
+ }>;
387
809
  /**
388
810
  * Extract text with page information
389
811
  *
@@ -393,11 +815,7 @@ declare class TextExtractor {
393
815
  */
394
816
  extractWithPages(pdfPath: string): Promise<any>;
395
817
  /**
396
- * Split text into approximate pages
397
- */
398
- private splitTextIntoPages;
399
- /**
400
- * Extract text items with position and metadata
818
+ * Extract text items with position and metadata using pdf.js
401
819
  */
402
820
  extractTextItems(pdfPath: string, options?: ExtractionOptions): Promise<TextItem[]>;
403
821
  /**
@@ -439,21 +857,112 @@ declare class TextExtractor {
439
857
  pageOffset?: number;
440
858
  includeImageRefs?: boolean;
441
859
  imageRefFormat?: string;
442
- imageEngine?: ImageExtractionEngine;
443
860
  }): Promise<{
444
861
  text: string;
445
- pages: PageData[];
862
+ pages: PageData$1[];
446
863
  }>;
447
864
  /**
448
865
  * Extract text with accurate page boundaries using pdf-lib + pdf-parse
449
866
  */
450
867
  extractWithAccuratePages(pdfPath: string): Promise<{
451
868
  fullText: string;
452
- pages: PageData[];
869
+ pages: PageData$1[];
453
870
  totalPages: number;
454
871
  }>;
455
872
  }
456
873
 
874
+ /**
875
+ * Structured text extractor using both pdf-lib and pdf.js for accurate page-by-page extraction
876
+ *
877
+ * Extracts text with rich metadata including page dimensions, rotation, word counts, and character counts.
878
+ * Uses pdf-lib for accurate page structure and pdf.js for text content.
879
+ */
880
+ interface PageData {
881
+ pageNumber: number;
882
+ text: string;
883
+ width: number;
884
+ height: number;
885
+ rotation: number;
886
+ mediaBox: number[];
887
+ textItems?: any[];
888
+ wordCount: number;
889
+ characterCount: number;
890
+ }
891
+ declare class StructuredTextExtractor {
892
+ private pdfLibDoc;
893
+ private pdfLibPages;
894
+ private textData;
895
+ constructor();
896
+ /**
897
+ * Initialize pdf.js worker
898
+ */
899
+ private initializePdfjs;
900
+ /**
901
+ * Process PDF with accurate page-by-page extraction
902
+ */
903
+ processPDF(pdfPath: string): Promise<{
904
+ totalPages: number;
905
+ pages: PageData[];
906
+ fullText: string;
907
+ }>;
908
+ /**
909
+ * Process with pdf-lib to get accurate page structure
910
+ */
911
+ private processPDFLib;
912
+ /**
913
+ * Process with pdf.js to extract text page by page
914
+ */
915
+ private processPDFjs;
916
+ /**
917
+ * Combine results from both libraries
918
+ */
919
+ private combineResults;
920
+ /**
921
+ * Extract text with page markers using accurate page boundaries
922
+ */
923
+ extractWithPageMarkers(pdfPath: string, pageMarkerFormat?: string, options?: {
924
+ includeImageRefs?: boolean;
925
+ imageRefFormat?: string;
926
+ }): Promise<{
927
+ text: string;
928
+ cleanText: string;
929
+ numPages: number;
930
+ pages: PageData[];
931
+ }>;
932
+ /**
933
+ * Get specific page data
934
+ */
935
+ getPage(pageNumber: number): PageData | null;
936
+ /**
937
+ * Get detailed page information including text positioning
938
+ */
939
+ getDetailedPageInfo(pdfPath: string, pageNumber: number): Promise<{
940
+ pageNumber: number;
941
+ text: string;
942
+ textItems: Array<{
943
+ text: string;
944
+ x: number;
945
+ y: number;
946
+ width: number;
947
+ height: number;
948
+ fontName?: string;
949
+ fontSize?: number;
950
+ }>;
951
+ dimensions: {
952
+ width: number;
953
+ height: number;
954
+ };
955
+ } | null>;
956
+ /**
957
+ * Count words in text
958
+ */
959
+ private countWords;
960
+ /**
961
+ * Process single page (for streaming/batch processing)
962
+ */
963
+ processSinglePage(pdfPath: string, pageNumber: number): Promise<PageData | null>;
964
+ }
965
+
457
966
  /**
458
967
  * Image extraction from PDF files using pdf-lib (clean implementation based on NestJS)
459
968
  *
@@ -530,6 +1039,381 @@ declare class ImageExtractor {
530
1039
  private createPngFromPdfMetadata;
531
1040
  }
532
1041
 
1042
+ /**
1043
+ * Types for PDF page to image conversion
1044
+ */
1045
+ /**
1046
+ * Image format for page conversion
1047
+ */
1048
+ type PageImageFormat = "png" | "jpg" | "jpeg" | "webp";
1049
+ /**
1050
+ * Options for converting PDF pages to images
1051
+ */
1052
+ interface PageToImageOptions {
1053
+ /**
1054
+ * Output directory for image files
1055
+ * @default './page-images'
1056
+ */
1057
+ outputDir?: string;
1058
+ /**
1059
+ * Image format
1060
+ * @default 'png'
1061
+ */
1062
+ format?: PageImageFormat;
1063
+ /**
1064
+ * JPEG quality (1-100, only for JPG format)
1065
+ * @default 90
1066
+ */
1067
+ quality?: number;
1068
+ /**
1069
+ * DPI (dots per inch) for rendering
1070
+ * Higher DPI = better quality but larger files
1071
+ * @default 72
1072
+ */
1073
+ dpi?: number;
1074
+ /**
1075
+ * Scale factor (multiplier for dimensions)
1076
+ * @default 1
1077
+ */
1078
+ scale?: number;
1079
+ /**
1080
+ * Specific pages to convert (1-based)
1081
+ * If not provided, converts all pages
1082
+ * @example [1, 3, 5]
1083
+ */
1084
+ pages?: number[];
1085
+ /**
1086
+ * Page range to convert (e.g., "1-5", "1,3,5-10")
1087
+ * If not provided, converts all pages
1088
+ * @example "1-5"
1089
+ */
1090
+ pageRange?: string;
1091
+ /**
1092
+ * Filename pattern for output files
1093
+ * Available placeholders: {page}, {total}, {name}
1094
+ * @default 'page-{page}.{ext}'
1095
+ */
1096
+ filenamePattern?: string;
1097
+ /**
1098
+ * Background color for transparent PDFs
1099
+ * @default '#FFFFFF'
1100
+ */
1101
+ backgroundColor?: string;
1102
+ /**
1103
+ * Enable transparent background (PNG only)
1104
+ * @default false
1105
+ */
1106
+ transparent?: boolean;
1107
+ /**
1108
+ * Crop to content (remove white margins)
1109
+ * @default false
1110
+ */
1111
+ cropToContent?: boolean;
1112
+ /**
1113
+ * Progress callback
1114
+ */
1115
+ onProgress?: (current: number, total: number, percentage: number) => void;
1116
+ /**
1117
+ * Callback when a page is converted
1118
+ */
1119
+ onPageComplete?: (pageNumber: number, filepath: string) => void;
1120
+ /**
1121
+ * Verbose logging
1122
+ * @default false
1123
+ */
1124
+ verbose?: boolean;
1125
+ }
1126
+ /**
1127
+ * Result of page to image conversion
1128
+ */
1129
+ interface PageImageResult {
1130
+ /**
1131
+ * Page number (1-based)
1132
+ */
1133
+ page: number;
1134
+ /**
1135
+ * Output file path
1136
+ */
1137
+ filepath: string;
1138
+ /**
1139
+ * Image width in pixels
1140
+ */
1141
+ width: number;
1142
+ /**
1143
+ * Image height in pixels
1144
+ */
1145
+ height: number;
1146
+ /**
1147
+ * File size in bytes
1148
+ */
1149
+ fileSize: number;
1150
+ /**
1151
+ * Image format
1152
+ */
1153
+ format: PageImageFormat;
1154
+ }
1155
+ /**
1156
+ * Result of converting all pages
1157
+ */
1158
+ interface PageToImageResult {
1159
+ /**
1160
+ * Array of converted page images
1161
+ */
1162
+ images: PageImageResult[];
1163
+ /**
1164
+ * Total number of pages converted
1165
+ */
1166
+ totalPages: number;
1167
+ /**
1168
+ * Output directory
1169
+ */
1170
+ outputDir: string;
1171
+ /**
1172
+ * Total size of all images in bytes
1173
+ */
1174
+ totalSize: number;
1175
+ }
1176
+ /**
1177
+ * Options for converting a single page
1178
+ */
1179
+ interface SinglePageOptions {
1180
+ /**
1181
+ * Image format
1182
+ * @default 'png'
1183
+ */
1184
+ format?: PageImageFormat;
1185
+ /**
1186
+ * JPEG quality (1-100)
1187
+ * @default 90
1188
+ */
1189
+ quality?: number;
1190
+ /**
1191
+ * DPI for rendering
1192
+ * @default 72
1193
+ */
1194
+ dpi?: number;
1195
+ /**
1196
+ * Scale factor
1197
+ * @default 1
1198
+ */
1199
+ scale?: number;
1200
+ /**
1201
+ * Background color
1202
+ * @default '#FFFFFF'
1203
+ */
1204
+ backgroundColor?: string;
1205
+ /**
1206
+ * Transparent background (PNG only)
1207
+ * @default false
1208
+ */
1209
+ transparent?: boolean;
1210
+ }
1211
+ /**
1212
+ * Thumbnail generation options
1213
+ */
1214
+ interface ThumbnailOptions extends SinglePageOptions {
1215
+ /**
1216
+ * Maximum width in pixels
1217
+ * @default 200
1218
+ */
1219
+ maxWidth?: number;
1220
+ /**
1221
+ * Maximum height in pixels
1222
+ * @default 200
1223
+ */
1224
+ maxHeight?: number;
1225
+ /**
1226
+ * Maintain aspect ratio
1227
+ * @default true
1228
+ */
1229
+ maintainAspectRatio?: boolean;
1230
+ }
1231
+
1232
+ /**
1233
+ * PDF Page to Image Converter using pdf.js
1234
+ *
1235
+ * Converts PDF pages to image files (PNG, JPG, WebP) with customizable options.
1236
+ * Uses Mozilla's pdf.js for high-quality rendering without external dependencies.
1237
+ */
1238
+
1239
+ /**
1240
+ * Page to Image Converter
1241
+ *
1242
+ * @example
1243
+ * ```typescript
1244
+ * const converter = new PageToImageConverter();
1245
+ * const result = await converter.convertToImages('document.pdf', {
1246
+ * outputDir: './pages',
1247
+ * format: 'png',
1248
+ * dpi: 150
1249
+ * });
1250
+ * ```
1251
+ */
1252
+ declare class PageToImageConverter {
1253
+ private pdfjs;
1254
+ /**
1255
+ * Get or load pdf.js module with proper worker configuration
1256
+ * Based on pdf-to-img library approach
1257
+ */
1258
+ private getPdfjs;
1259
+ /**
1260
+ * Convert all pages of a PDF to images
1261
+ *
1262
+ * @param pdfPath - Path to PDF file
1263
+ * @param options - Conversion options
1264
+ * @returns Conversion result with image paths
1265
+ */
1266
+ convertToImages(pdfPath: string, options?: PageToImageOptions): Promise<PageToImageResult>;
1267
+ /**
1268
+ * Convert a single page to an image file
1269
+ *
1270
+ * @param pdfPath - Path to PDF file
1271
+ * @param pageNumber - Page number (1-based)
1272
+ * @param outputPath - Output file path
1273
+ * @param options - Conversion options
1274
+ */
1275
+ convertPage(pdfPath: string, pageNumber: number, outputPath: string, options?: SinglePageOptions): Promise<PageImageResult>;
1276
+ /**
1277
+ * Convert a page to a buffer (no file write)
1278
+ *
1279
+ * @param pdfPath - Path to PDF file
1280
+ * @param pageNumber - Page number (1-based)
1281
+ * @param options - Conversion options
1282
+ * @returns Image buffer
1283
+ */
1284
+ convertPageToBuffer(pdfPath: string, pageNumber: number, options?: SinglePageOptions): Promise<Buffer>;
1285
+ /**
1286
+ * Convert a page to base64 string
1287
+ *
1288
+ * @param pdfPath - Path to PDF file
1289
+ * @param pageNumber - Page number (1-based)
1290
+ * @param options - Conversion options
1291
+ * @returns Base64 encoded image
1292
+ */
1293
+ convertPageToBase64(pdfPath: string, pageNumber: number, options?: SinglePageOptions): Promise<string>;
1294
+ /**
1295
+ * Generate thumbnails for all pages
1296
+ *
1297
+ * @param pdfPath - Path to PDF file
1298
+ * @param options - Thumbnail options
1299
+ * @returns Conversion result
1300
+ */
1301
+ generateThumbnails(pdfPath: string, options?: ThumbnailOptions & {
1302
+ outputDir?: string;
1303
+ }): Promise<PageToImageResult>;
1304
+ /**
1305
+ * Render a PDF page to image buffer
1306
+ *
1307
+ * Based on pdf-to-img library approach - let pdf.js handle canvas creation
1308
+ * @see https://github.com/k-yle/pdf-to-img
1309
+ */
1310
+ private renderPageToBuffer;
1311
+ /**
1312
+ * Convert canvas to image buffer
1313
+ */
1314
+ private canvasToBuffer;
1315
+ /**
1316
+ * Get page numbers to convert based on options
1317
+ */
1318
+ private getPageNumbers;
1319
+ /**
1320
+ * Parse page range string (e.g., "1-5", "1,3,5-10")
1321
+ */
1322
+ private parsePageRange;
1323
+ /**
1324
+ * Generate filename from pattern
1325
+ */
1326
+ private generateFilename;
1327
+ /**
1328
+ * Format bytes to human-readable string
1329
+ */
1330
+ private formatBytes;
1331
+ }
1332
+
1333
+ /**
1334
+ * Result of image optimization
1335
+ */
1336
+ interface OptimizationResult {
1337
+ success: boolean;
1338
+ originalSize: number;
1339
+ optimizedSize: number;
1340
+ savedBytes: number;
1341
+ savedPercent: number;
1342
+ engine: "jimp" | "sharp" | "none";
1343
+ error?: string;
1344
+ }
1345
+ /**
1346
+ * Options for image optimization
1347
+ */
1348
+ interface OptimizationOptions {
1349
+ quality?: number;
1350
+ verbose?: boolean;
1351
+ useSharp?: boolean;
1352
+ }
1353
+ /**
1354
+ * Image optimizer using Jimp (pure JavaScript) by default, with optional Sharp support
1355
+ *
1356
+ * This class provides image optimization capabilities using Jimp, a pure JavaScript
1357
+ * image processing library with no native dependencies. It supports JPEG and PNG
1358
+ * optimization with quality control.
1359
+ *
1360
+ * @example
1361
+ * ```typescript
1362
+ * const result = await ImageOptimizer.optimizeFile('image.jpg', {
1363
+ * useSharp: true,
1364
+ * quality: 80
1365
+ * });
1366
+ *
1367
+ * console.log(`Saved ${result.savedPercent.toFixed(1)}% using ${result.engine}`);
1368
+ * ```
1369
+ */
1370
+ declare class ImageOptimizer {
1371
+ /**
1372
+ * Optimize an image file in-place
1373
+ *
1374
+ * The original file will be replaced with the optimized version.
1375
+ * If optimization fails, the original file remains unchanged.
1376
+ *
1377
+ * @param filePath - Path to the image file to optimize
1378
+ * @param options - Optimization options
1379
+ * @returns Promise resolving to optimization result
1380
+ */
1381
+ static optimizeFile(filePath: string, options?: OptimizationOptions): Promise<OptimizationResult>;
1382
+ /**
1383
+ * Optimize using Sharp (optional dependency)
1384
+ */
1385
+ private static optimizeWithSharp;
1386
+ /**
1387
+ * Optimize using Jimp (pure JavaScript)
1388
+ */
1389
+ private static optimizeWithJimp;
1390
+ /**
1391
+ * Convert JPEG 2000 formats (jp2, jpx, j2c, jpm) to JPG
1392
+ *
1393
+ * JPEG 2000 files are not widely supported by browsers and image tools.
1394
+ * This method converts them to standard JPG format for better compatibility.
1395
+ *
1396
+ * Supports two conversion engines:
1397
+ * - Jimp (default): Pure JavaScript, works everywhere
1398
+ * - Sharp (optional): Better color preservation, requires native compilation
1399
+ *
1400
+ * @param jp2Path - Path to the JPEG 2000 file (jp2, jpx, j2c, or jpm)
1401
+ * @param options - Conversion options
1402
+ * @returns Promise resolving to conversion result with new file path
1403
+ */
1404
+ static convertJp2ToJpg(jp2Path: string, options?: {
1405
+ quality?: number;
1406
+ verbose?: boolean;
1407
+ useSharp?: boolean;
1408
+ }): Promise<{
1409
+ success: boolean;
1410
+ newPath?: string;
1411
+ originalSize?: number;
1412
+ newSize?: number;
1413
+ error?: string;
1414
+ }>;
1415
+ }
1416
+
533
1417
  /**
534
1418
  * Handles formatting of image references and text processing
535
1419
  */
@@ -597,12 +1481,23 @@ declare function validateImageRefFormat(format: string): ValidationError[];
597
1481
  */
598
1482
  declare function validateFilePath(filePath: string, extensions?: string[]): ValidationError[];
599
1483
 
1484
+ /**
1485
+ * pdf-plus - A comprehensive PDF content extraction library
1486
+ *
1487
+ * Main entry point for the PDF content extraction library.
1488
+ * Provides both high-level convenience functions and low-level access to extractors.
1489
+ *
1490
+ * @packageDocumentation
1491
+ */
1492
+
600
1493
  /**
601
1494
  * Extract content from a PDF file (convenience function)
602
1495
  *
1496
+ * Automatically switches to streaming mode for large PDFs if `autoStreamThreshold` is set.
1497
+ *
603
1498
  * @param pdfPath - Path to the PDF file
604
1499
  * @param options - Extraction options
605
- * @returns Promise resolving to extraction result
1500
+ * @returns Promise resolving to an `ExtractionResult`, or to a `StreamingExtractionResult` when auto-streaming is triggered via `autoStreamThreshold`
606
1501
  *
607
1502
  * @example
608
1503
  * ```typescript
@@ -616,8 +1511,17 @@ declare function validateFilePath(filePath: string, extensions?: string[]): Vali
616
1511
  *
617
1512
  * console.log(`Extracted ${result.images.length} images from ${result.document.pages} pages`);
618
1513
  * ```
1514
+ *
1515
+ * @example
1516
+ * ```typescript
1517
+ * // Auto-streaming for large PDFs
1518
+ * const result = await extractPdfContent('large-document.pdf', {
1519
+ * extractImageFiles: true,
1520
+ * autoStreamThreshold: 100, // Auto-stream if > 100 pages
1521
+ * });
1522
+ * ```
619
1523
  */
620
- declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult>;
1524
+ declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult | StreamingExtractionResult>;
621
1525
  /**
622
1526
  * Extract only text content from a PDF (convenience function)
623
1527
  *
@@ -674,22 +1578,70 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
674
1578
  * ```
675
1579
  */
676
1580
  declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
677
- declare const version = "1.0.0";
678
-
1581
+ /**
1582
+ * Extract PDF content in streaming mode
1583
+ *
1584
+ * For large PDFs, this provides a streaming API that processes pages one at a time,
1585
+ * reducing memory usage and providing real-time progress updates.
1586
+ *
1587
+ * @param pdfPath - Path to the PDF file
1588
+ * @param options - Extraction and streaming options
1589
+ * @returns StreamingExtractionResult with async iterator and event callbacks
1590
+ *
1591
+ * @example
1592
+ * ```typescript
1593
+ * // Using async iterator
1594
+ * const stream = extractPdfStream('large-document.pdf', {
1595
+ * extractImageFiles: true,
1596
+ * imageOutputDir: './images',
1597
+ * streamMode: true
1598
+ * });
1599
+ *
1600
+ * for await (const event of stream) {
1601
+ * if (event.type === 'page') {
1602
+ * console.log(`Processed page ${event.pageNumber}/${event.totalPages}`);
1603
+ * } else if (event.type === 'progress') {
1604
+ * console.log(`Progress: ${event.percentComplete.toFixed(1)}%`);
1605
+ * }
1606
+ * }
1607
+ *
1608
+ * // Alternative (separate snippet, use instead of the iterator example above): using event callbacks
1609
+ * const stream = extractPdfStream('large-document.pdf', { streamMode: true })
1610
+ * .on('page', (event) => console.log(`Page ${event.pageNumber} done`))
1611
+ * .on('progress', (event) => console.log(`${event.percentComplete}% complete`))
1612
+ * .on('complete', (event) => console.log(`Done! ${event.totalImages} images`));
1613
+ *
1614
+ * for await (const event of stream) {
1615
+ * // Events are also available via iterator
1616
+ * }
1617
+ * ```
1618
+ */
1619
+ declare function extractPdfStream(pdfPath: string, options?: Partial<ExtractionOptions>): StreamingExtractionResult;
1620
+ /**
1621
+ * Library version
1622
+ */
1623
+ declare const version = "1.0.3";
1624
+ /**
1625
+ * Default export containing all public APIs
1626
+ * Useful for CommonJS: `const pdfPlus = require('pdf-plus');`
1627
+ */
679
1628
  declare const _default: {
680
1629
  PDFExtractor: typeof PDFExtractor;
681
1630
  pdfExtractor: PDFExtractor;
1631
+ StreamingPDFExtractor: typeof StreamingPDFExtractor;
682
1632
  TextExtractor: typeof TextExtractor;
683
1633
  ImageExtractor: typeof ImageExtractor;
1634
+ ImageOptimizer: typeof ImageOptimizer;
684
1635
  FormatProcessor: typeof FormatProcessor;
685
1636
  extractPdfContent: typeof extractPdfContent;
686
1637
  extractText: typeof extractText;
687
1638
  extractImages: typeof extractImages;
688
1639
  extractImageFiles: typeof extractImageFiles;
1640
+ extractPdfStream: typeof extractPdfStream;
689
1641
  validateConfig: typeof validateConfig;
690
1642
  validateImageRefFormat: typeof validateImageRefFormat;
691
1643
  validateFilePath: typeof validateFilePath;
692
1644
  version: string;
693
1645
  };
694
1646
 
695
- export { type AnalyticsData, type DocumentMetadata, type DocumentSummary, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageExtractionEngine, ImageExtractor, type ImageItem, type MemoryUsage, type OCROptions, PDFExtractor, type PageInfo, type Position, type ProcessingPhase, type ProgressInfo, type StreamingOptions, type TemplateOptions, TextExtractor, type TextItem, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
1647
+ export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };