pdf-plus 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,3 +1,202 @@
1
+ /**
2
+ * Types for streaming PDF extraction
3
+ */
4
+
5
+ /**
6
+ * Event types emitted during streaming extraction
7
+ */
8
+ type StreamEventType = "start" | "page" | "image" | "progress" | "complete" | "error";
9
+ /**
10
+ * Base event structure
11
+ */
12
+ interface StreamEvent {
13
+ type: StreamEventType;
14
+ timestamp: number;
15
+ }
16
+ /**
17
+ * Start event - emitted when extraction begins
18
+ */
19
+ interface StartEvent extends StreamEvent {
20
+ type: "start";
21
+ totalPages: number;
22
+ pdfPath: string;
23
+ }
24
+ /**
25
+ * Page event - emitted when a page is processed
26
+ */
27
+ interface PageEvent extends StreamEvent {
28
+ type: "page";
29
+ pageNumber: number;
30
+ totalPages: number;
31
+ textLength: number;
32
+ imageCount: number;
33
+ pageInfo?: PageInfo;
34
+ }
35
+ /**
36
+ * Image event - emitted when an image is extracted
37
+ */
38
+ interface ImageEvent extends StreamEvent {
39
+ type: "image";
40
+ image: ImageItem;
41
+ pageNumber: number;
42
+ imageIndex: number;
43
+ totalImages: number;
44
+ }
45
+ /**
46
+ * Progress event - emitted periodically during extraction
47
+ */
48
+ interface ProgressEvent extends StreamEvent {
49
+ type: "progress";
50
+ pagesProcessed: number;
51
+ totalPages: number;
52
+ imagesExtracted: number;
53
+ percentComplete: number;
54
+ estimatedTimeRemaining?: number;
55
+ }
56
+ /**
57
+ * Complete event - emitted when extraction finishes
58
+ */
59
+ interface CompleteEvent extends StreamEvent {
60
+ type: "complete";
61
+ totalPages: number;
62
+ totalImages: number;
63
+ totalTextLength: number;
64
+ duration: number;
65
+ }
66
+ /**
67
+ * Error event - emitted when an error occurs
68
+ */
69
+ interface ErrorEvent extends StreamEvent {
70
+ type: "error";
71
+ error: Error;
72
+ pageNumber?: number;
73
+ recoverable: boolean;
74
+ }
75
+ /**
76
+ * Union type of all stream events
77
+ */
78
+ type StreamEventUnion = StartEvent | PageEvent | ImageEvent | ProgressEvent | CompleteEvent | ErrorEvent;
79
+ /**
80
+ * Streaming extraction options
81
+ */
82
+ interface StreamingOptions$1 {
83
+ /**
84
+ * Enable streaming mode
85
+ * @default false
86
+ */
87
+ streamMode?: boolean;
88
+ /**
89
+ * Automatically enable streaming for PDFs with more than this many pages
90
+ * @default 100
91
+ */
92
+ autoStreamThreshold?: number;
93
+ /**
94
+ * Enable backpressure handling (pause extraction if consumer is slow)
95
+ * @default true
96
+ */
97
+ enableBackpressure?: boolean;
98
+ /**
99
+ * Maximum number of pages to buffer before pausing (backpressure)
100
+ * @default 10
101
+ */
102
+ maxBufferedPages?: number;
103
+ /**
104
+ * Emit progress events every N pages
105
+ * @default 5
106
+ */
107
+ progressInterval?: number;
108
+ /**
109
+ * Enable event callbacks (in addition to async iterator)
110
+ * @default false
111
+ */
112
+ enableEventCallbacks?: boolean;
113
+ }
114
+ /**
115
+ * Event callback function type
116
+ */
117
+ type StreamEventCallback = (event: StreamEventUnion) => void | Promise<void>;
118
+ /**
119
+ * Event callbacks map
120
+ */
121
+ interface StreamEventCallbacks {
122
+ onStart?: (event: StartEvent) => void | Promise<void>;
123
+ onPage?: (event: PageEvent) => void | Promise<void>;
124
+ onImage?: (event: ImageEvent) => void | Promise<void>;
125
+ onProgress?: (event: ProgressEvent) => void | Promise<void>;
126
+ onComplete?: (event: CompleteEvent) => void | Promise<void>;
127
+ onError?: (event: ErrorEvent) => void | Promise<void>;
128
+ onAny?: StreamEventCallback;
129
+ }
130
+ /**
131
+ * Streaming extraction result (async iterator)
132
+ */
133
+ interface StreamingExtractionResult {
134
+ /**
135
+ * Async iterator for streaming events
136
+ */
137
+ [Symbol.asyncIterator](): AsyncIterator<StreamEventUnion>;
138
+ /**
139
+ * Register event callbacks
140
+ */
141
+ on(event: "start", callback: (event: StartEvent) => void | Promise<void>): this;
142
+ on(event: "page", callback: (event: PageEvent) => void | Promise<void>): this;
143
+ on(event: "image", callback: (event: ImageEvent) => void | Promise<void>): this;
144
+ on(event: "progress", callback: (event: ProgressEvent) => void | Promise<void>): this;
145
+ on(event: "complete", callback: (event: CompleteEvent) => void | Promise<void>): this;
146
+ on(event: "error", callback: (event: ErrorEvent) => void | Promise<void>): this;
147
+ on(event: "any", callback: StreamEventCallback): this;
148
+ /**
149
+ * Cancel the streaming extraction
150
+ */
151
+ cancel(): Promise<void>;
152
+ /**
153
+ * Pause the streaming extraction (backpressure)
154
+ */
155
+ pause(): void;
156
+ /**
157
+ * Resume the streaming extraction
158
+ */
159
+ resume(): void;
160
+ /**
161
+ * Get current streaming statistics
162
+ */
163
+ getStats(): StreamingStats;
164
+ }
165
+ /**
166
+ * Streaming statistics
167
+ */
168
+ interface StreamingStats {
169
+ pagesProcessed: number;
170
+ totalPages: number;
171
+ imagesExtracted: number;
172
+ bytesProcessed: number;
173
+ startTime: number;
174
+ elapsedTime: number;
175
+ isPaused: boolean;
176
+ isCancelled: boolean;
177
+ isComplete: boolean;
178
+ averagePageTime: number;
179
+ estimatedTimeRemaining: number;
180
+ }
181
+ /**
182
+ * Internal streaming state
183
+ */
184
+ interface StreamingState {
185
+ totalPages: number;
186
+ pagesProcessed: number;
187
+ imagesExtracted: number;
188
+ totalTextLength: number;
189
+ bytesProcessed: number;
190
+ startTime: number;
191
+ lastProgressTime: number;
192
+ isPaused: boolean;
193
+ isCancelled: boolean;
194
+ isComplete: boolean;
195
+ bufferedPages: number;
196
+ eventQueue: StreamEventUnion[];
197
+ callbacks: StreamEventCallbacks;
198
+ }
199
+
1
200
  /**
2
201
  * Core types for PDF content extraction
3
202
  */
@@ -63,6 +262,7 @@ interface ExtractionResult {
63
262
  pages: PageInfo[];
64
263
  images: ImageItem[];
65
264
  textItems: TextItem[];
265
+ text: string;
66
266
  textWithRefs: string;
67
267
  cleanText: string;
68
268
  summary?: DocumentSummary;
@@ -85,9 +285,9 @@ interface StructuredPageData {
85
285
  totalImages: number;
86
286
  extractionOptions: ExtractionOptions;
87
287
  };
88
- pages: PageData[];
288
+ pages: PageData$1[];
89
289
  }
90
- interface PageData {
290
+ interface PageData$1 {
91
291
  pageNumber: number;
92
292
  text: {
93
293
  content: string;
@@ -97,6 +297,32 @@ interface PageData {
97
297
  };
98
298
  images: PageImageData[];
99
299
  imageCount: number;
300
+ pageImage?: {
301
+ path: string;
302
+ format: string;
303
+ width: number;
304
+ height: number;
305
+ size: number;
306
+ dpi?: number;
307
+ quality?: number;
308
+ };
309
+ thumbnail?: {
310
+ path: string;
311
+ format: string;
312
+ width: number;
313
+ height: number;
314
+ size: number;
315
+ quality?: number;
316
+ };
317
+ pageImageVariants?: Array<{
318
+ path: string;
319
+ format: string;
320
+ width: number;
321
+ height: number;
322
+ size: number;
323
+ quality: number;
324
+ dpi?: number;
325
+ }>;
100
326
  }
101
327
  interface PageImageData {
102
328
  id: string;
@@ -111,8 +337,10 @@ interface PageImageData {
111
337
  };
112
338
  format: string;
113
339
  size?: number;
340
+ width?: number;
341
+ height?: number;
342
+ mimeType?: string;
114
343
  }
115
- type ImageExtractionEngine = "pdf-lib" | "poppler" | "auto";
116
344
  interface ExtractionOptions {
117
345
  extractText?: boolean;
118
346
  extractImages?: boolean;
@@ -131,14 +359,116 @@ interface ExtractionOptions {
131
359
  extractTextItems?: boolean;
132
360
  specificPages?: number[];
133
361
  useCache?: boolean;
134
- /** Image extraction engine to use */
135
- imageEngine?: ImageExtractionEngine;
362
+ /** Enable image optimization after extraction (uses Jimp - pure JavaScript, default: false) */
363
+ optimizeImages?: boolean;
364
+ /** Image quality for optimization (0-100, default: 80) */
365
+ imageQuality?: number;
366
+ /**
367
+ * Convert JPEG 2000 images to JPG format for better compatibility.
368
+ * (default: true - convert JP2 to JPG)
369
+ */
370
+ convertJp2ToJpg?: boolean;
371
+ /**
372
+ * Preserve JPEG 2000 images in their original format.
373
+ * By default (false), JPEG 2000 images (jp2, jpx, j2c, jpm) are converted to JPG for better compatibility.
374
+ * Set to true to keep JPEG 2000 files in their original format.
375
+ *
376
+ * Note: JP2 images from PDFs are automatically decoded by PDF.js during extraction.
377
+ * This option only affects standalone JP2 files.
378
+ * (default: false - convert to JPG)
379
+ */
380
+ preserveJp2?: boolean;
381
+ /**
382
+ * Use Sharp library for ALL image processing operations (better quality & performance).
383
+ *
384
+ * When enabled, Sharp is used as the global image processing engine for:
385
+ * - JP2 to JPG conversion
386
+ * - Image optimization
387
+ * - Image resizing
388
+ * - Format conversions
389
+ *
390
+ * Sharp is an OPTIONAL dependency. Install it for better performance:
391
+ * ```bash
392
+ * npm install sharp
393
+ * ```
394
+ *
395
+ * If Sharp is not installed, the library will automatically fall back to pure JavaScript (Jimp).
396
+ *
397
+ * (default: false - use pure JS Jimp)
398
+ */
399
+ useSharp?: boolean;
400
+ /** Enable parallel processing for better performance (default: true) */
401
+ parallelProcessing?: boolean;
402
+ /** Maximum number of pages to process in parallel (default: 10) */
403
+ maxConcurrentPages?: number;
404
+ /** Maximum number of images per page to extract in parallel (default: 20) */
405
+ maxConcurrentImages?: number;
406
+ /** Maximum number of JP2 to JPG conversions in parallel (default: 5) */
407
+ maxConcurrentConversions?: number;
408
+ /** Maximum number of image optimizations in parallel (default: 5) */
409
+ maxConcurrentOptimizations?: number;
410
+ /** Enable worker threads for CPU-intensive operations (default: false) */
411
+ useWorkerThreads?: boolean;
412
+ /** Auto-scale workers based on system resources (default: true) */
413
+ autoScaleWorkers?: boolean;
414
+ /** Maximum number of worker threads (default: CPU cores - 1) */
415
+ maxWorkerThreads?: number;
416
+ /** Minimum number of worker threads to keep alive (default: 1) */
417
+ minWorkerThreads?: number;
418
+ /** Memory threshold (0-1) for scaling down workers (default: 0.8) */
419
+ memoryThreshold?: number;
420
+ /** CPU threshold (0-1) for scaling up workers (default: 0.9) */
421
+ cpuThreshold?: number;
422
+ /** Worker task timeout in milliseconds (default: 30000) */
423
+ workerTaskTimeout?: number;
424
+ /** Worker idle timeout in milliseconds (default: 60000) */
425
+ workerIdleTimeout?: number;
426
+ /** Memory limit per worker in MB (default: 512) */
427
+ workerMemoryLimit?: number;
428
+ /** Use workers for JP2 conversion (default: true) */
429
+ enableWorkerForConversion?: boolean;
430
+ /** Use workers for image optimization (default: true) */
431
+ enableWorkerForOptimization?: boolean;
432
+ /** Use workers for image decoding (default: true) */
433
+ enableWorkerForDecoding?: boolean;
434
+ /** Enable streaming mode for large PDFs (default: false) */
435
+ streamMode?: boolean;
436
+ /** Automatically enable streaming for PDFs with more than this many pages (default: 100) */
437
+ autoStreamThreshold?: number;
438
+ /** Enable backpressure handling (pause extraction if consumer is slow) (default: true) */
439
+ enableBackpressure?: boolean;
440
+ /** Maximum number of pages to buffer before pausing (default: 10) */
441
+ maxBufferedPages?: number;
442
+ /** Emit progress events every N pages (default: 5) */
443
+ progressInterval?: number;
444
+ /** Enable event callbacks in addition to async iterator (default: false) */
445
+ enableEventCallbacks?: boolean;
136
446
  cacheDir?: string;
137
447
  baseName?: string;
138
448
  verbose?: boolean;
139
449
  memoryLimit?: string;
140
450
  batchSize?: number;
141
451
  progressCallback?: (progress: ProgressInfo) => void;
452
+ /** Generate page images (default: false) */
453
+ generatePageImages?: boolean;
454
+ /** Generate thumbnails for pages (default: false) */
455
+ generateThumbnails?: boolean;
456
+ /** Include page images in structured output (default: false) */
457
+ includePageImagesInStructuredData?: boolean;
458
+ /** Page numbers to generate images for (default: all pages) */
459
+ pageNumbers?: number[];
460
+ /** Generate multiple quality variants of page images */
461
+ pageImageQualities?: number[];
462
+ /** DPI for page images (default: 150) */
463
+ pageImageDpi?: number;
464
+ /** Format for page images: 'png' | 'jpg' (default: 'png') */
465
+ pageImageFormat?: "png" | "jpg";
466
+ /** Quality for JPG page images (default: 90) */
467
+ pageImageQuality?: number;
468
+ /** Thumbnail width (default: 200) */
469
+ thumbnailWidth?: number;
470
+ /** Thumbnail quality for JPG (default: 80) */
471
+ thumbnailQuality?: number;
142
472
  }
143
473
  interface ProgressInfo {
144
474
  currentPage: number;
@@ -242,6 +572,7 @@ interface TemplateOptions {
242
572
  declare class PDFExtractor {
243
573
  private textExtractor;
244
574
  private imageExtractor;
575
+ private pageToImageConverter;
245
576
  private formatProcessor;
246
577
  private structuredDataGenerator;
247
578
  private cacheManager;
@@ -355,6 +686,14 @@ declare class PDFExtractor {
355
686
  totalCacheSize: number;
356
687
  cacheDir: string;
357
688
  };
689
+ /**
690
+ * Generate page images with multiple quality variants
691
+ */
692
+ private generatePageImagesWithVariants;
693
+ /**
694
+ * Generate thumbnails for pages
695
+ */
696
+ private generatePageThumbnails;
358
697
  private reportProgress;
359
698
  private createValidationError;
360
699
  private createExtractionError;
@@ -362,10 +701,70 @@ declare class PDFExtractor {
362
701
  declare const pdfExtractor: PDFExtractor;
363
702
 
364
703
  /**
365
- * Text extraction from PDF files
704
+ * Streaming PDF extractor for large PDFs
705
+ * Provides async iterator and event-based APIs
706
+ */
707
+
708
+ /**
709
+ * Streaming PDF extractor implementation
710
+ */
711
+ declare class StreamingPDFExtractor implements StreamingExtractionResult {
712
+ private state;
713
+ private options;
714
+ private pdfPath;
715
+ private extractor;
716
+ private eventQueue;
717
+ private resolveNext;
718
+ private extractionPromise;
719
+ constructor(pdfPath: string, options?: ExtractionOptions & StreamingOptions$1);
720
+ /**
721
+ * Async iterator implementation
722
+ */
723
+ [Symbol.asyncIterator](): AsyncIterator<StreamEventUnion>;
724
+ /**
725
+ * Register event callbacks
726
+ */
727
+ on(event: "start", callback: (event: StartEvent) => void): this;
728
+ on(event: "page", callback: (event: PageEvent) => void): this;
729
+ on(event: "image", callback: (event: ImageEvent) => void): this;
730
+ on(event: "progress", callback: (event: ProgressEvent) => void): this;
731
+ on(event: "complete", callback: (event: CompleteEvent) => void): this;
732
+ on(event: "error", callback: (event: ErrorEvent) => void): this;
733
+ on(event: "any", callback: (event: StreamEventUnion) => void): this;
734
+ /**
735
+ * Cancel extraction
736
+ */
737
+ cancel(): Promise<void>;
738
+ /**
739
+ * Pause extraction (backpressure)
740
+ */
741
+ pause(): void;
742
+ /**
743
+ * Resume extraction
744
+ */
745
+ resume(): void;
746
+ /**
747
+ * Get streaming statistics
748
+ */
749
+ getStats(): StreamingStats;
750
+ /**
751
+ * Emit an event
752
+ */
753
+ private emitEvent;
754
+ /**
755
+ * Start the extraction process
756
+ */
757
+ private startExtraction;
758
+ }
759
+
760
+ /**
761
+ * Text extraction from PDF files using pdf.js
366
762
  *
367
- * Handles text extraction using pdf-parse library with support for
368
- * page-by-page extraction and metadata retrieval.
763
+ * Direct pdf.js-based text extraction with support for:
764
+ * - Page-by-page extraction with accurate boundaries
765
+ * - Text positioning and font information
766
+ * - Metadata retrieval
767
+ * - No external dependencies (uses pdf.js directly)
369
768
  *
370
769
  * @example
371
770
  * ```typescript
@@ -375,6 +774,19 @@ declare const pdfExtractor: PDFExtractor;
375
774
  * ```
376
775
  */
377
776
  declare class TextExtractor {
777
+ constructor();
778
+ /**
779
+ * Initialize pdf.js worker
780
+ */
781
+ private initializePdfjs;
782
+ /**
783
+ * Load PDF document
784
+ */
785
+ private loadDocument;
786
+ /**
787
+ * Extract text from a single page
788
+ */
789
+ private getPageText;
378
790
  /**
379
791
  * Extract text content from PDF
380
792
  *
@@ -383,6 +795,17 @@ declare class TextExtractor {
383
795
  * @throws {Error} When PDF extraction fails
384
796
  */
385
797
  extract(pdfPath: string): Promise<any>;
798
+ /**
799
+ * Extract text with metadata
800
+ *
801
+ * @param pdfPath - Path to the PDF file
802
+ * @returns Promise resolving to extraction result with text and metadata
803
+ * @throws {Error} When PDF extraction fails
804
+ */
805
+ extractWithMetadata(pdfPath: string): Promise<{
806
+ text: string;
807
+ metadata: any;
808
+ }>;
386
809
  /**
387
810
  * Extract text with page information
388
811
  *
@@ -392,11 +815,7 @@ declare class TextExtractor {
392
815
  */
393
816
  extractWithPages(pdfPath: string): Promise<any>;
394
817
  /**
395
- * Split text into approximate pages
396
- */
397
- private splitTextIntoPages;
398
- /**
399
- * Extract text items with position and metadata
818
+ * Extract text items with position and metadata using pdf.js
400
819
  */
401
820
  extractTextItems(pdfPath: string, options?: ExtractionOptions): Promise<TextItem[]>;
402
821
  /**
@@ -438,21 +857,112 @@ declare class TextExtractor {
438
857
  pageOffset?: number;
439
858
  includeImageRefs?: boolean;
440
859
  imageRefFormat?: string;
441
- imageEngine?: ImageExtractionEngine;
442
860
  }): Promise<{
443
861
  text: string;
444
- pages: PageData[];
862
+ pages: PageData$1[];
445
863
  }>;
446
864
  /**
447
865
  * Extract text with accurate page boundaries using pdf-lib + pdf-parse
448
866
  */
449
867
  extractWithAccuratePages(pdfPath: string): Promise<{
450
868
  fullText: string;
451
- pages: PageData[];
869
+ pages: PageData$1[];
452
870
  totalPages: number;
453
871
  }>;
454
872
  }
455
873
 
874
+ /**
875
+ * Structured text extractor using both pdf-lib and pdf.js for accurate page-by-page extraction
876
+ *
877
+ * Extracts text with rich metadata including page dimensions, rotation, word counts, and character counts.
878
+ * Uses pdf-lib for accurate page structure and pdf.js for text content.
879
+ */
880
+ interface PageData {
881
+ pageNumber: number;
882
+ text: string;
883
+ width: number;
884
+ height: number;
885
+ rotation: number;
886
+ mediaBox: number[];
887
+ textItems?: any[];
888
+ wordCount: number;
889
+ characterCount: number;
890
+ }
891
+ declare class StructuredTextExtractor {
892
+ private pdfLibDoc;
893
+ private pdfLibPages;
894
+ private textData;
895
+ constructor();
896
+ /**
897
+ * Initialize pdf.js worker
898
+ */
899
+ private initializePdfjs;
900
+ /**
901
+ * Process PDF with accurate page-by-page extraction
902
+ */
903
+ processPDF(pdfPath: string): Promise<{
904
+ totalPages: number;
905
+ pages: PageData[];
906
+ fullText: string;
907
+ }>;
908
+ /**
909
+ * Process with pdf-lib to get accurate page structure
910
+ */
911
+ private processPDFLib;
912
+ /**
913
+ * Process with pdf.js to extract text page by page
914
+ */
915
+ private processPDFjs;
916
+ /**
917
+ * Combine results from both libraries
918
+ */
919
+ private combineResults;
920
+ /**
921
+ * Extract text with page markers using accurate page boundaries
922
+ */
923
+ extractWithPageMarkers(pdfPath: string, pageMarkerFormat?: string, options?: {
924
+ includeImageRefs?: boolean;
925
+ imageRefFormat?: string;
926
+ }): Promise<{
927
+ text: string;
928
+ cleanText: string;
929
+ numPages: number;
930
+ pages: PageData[];
931
+ }>;
932
+ /**
933
+ * Get specific page data
934
+ */
935
+ getPage(pageNumber: number): PageData | null;
936
+ /**
937
+ * Get detailed page information including text positioning
938
+ */
939
+ getDetailedPageInfo(pdfPath: string, pageNumber: number): Promise<{
940
+ pageNumber: number;
941
+ text: string;
942
+ textItems: Array<{
943
+ text: string;
944
+ x: number;
945
+ y: number;
946
+ width: number;
947
+ height: number;
948
+ fontName?: string;
949
+ fontSize?: number;
950
+ }>;
951
+ dimensions: {
952
+ width: number;
953
+ height: number;
954
+ };
955
+ } | null>;
956
+ /**
957
+ * Count words in text
958
+ */
959
+ private countWords;
960
+ /**
961
+ * Process single page (for streaming/batch processing)
962
+ */
963
+ processSinglePage(pdfPath: string, pageNumber: number): Promise<PageData | null>;
964
+ }
965
+
456
966
  /**
457
967
  * Image extraction from PDF files using pdf-lib (clean implementation based on NestJS)
458
968
  *
@@ -529,6 +1039,381 @@ declare class ImageExtractor {
529
1039
  private createPngFromPdfMetadata;
530
1040
  }
531
1041
 
1042
+ /**
1043
+ * Types for PDF page to image conversion
1044
+ */
1045
+ /**
1046
+ * Image format for page conversion
1047
+ */
1048
+ type PageImageFormat = "png" | "jpg" | "jpeg" | "webp";
1049
+ /**
1050
+ * Options for converting PDF pages to images
1051
+ */
1052
+ interface PageToImageOptions {
1053
+ /**
1054
+ * Output directory for image files
1055
+ * @default './page-images'
1056
+ */
1057
+ outputDir?: string;
1058
+ /**
1059
+ * Image format
1060
+ * @default 'png'
1061
+ */
1062
+ format?: PageImageFormat;
1063
+ /**
1064
+ * JPEG quality (1-100, only for JPG format)
1065
+ * @default 90
1066
+ */
1067
+ quality?: number;
1068
+ /**
1069
+ * DPI (dots per inch) for rendering
1070
+ * Higher DPI = better quality but larger files
1071
+ * @default 72
1072
+ */
1073
+ dpi?: number;
1074
+ /**
1075
+ * Scale factor (multiplier for dimensions)
1076
+ * @default 1
1077
+ */
1078
+ scale?: number;
1079
+ /**
1080
+ * Specific pages to convert (1-based)
1081
+ * If not provided, converts all pages
1082
+ * @example [1, 3, 5]
1083
+ */
1084
+ pages?: number[];
1085
+ /**
1086
+ * Page range to convert (e.g., "1-5", "1,3,5-10")
1087
+ * If not provided, converts all pages
1088
+ * @example "1-5"
1089
+ */
1090
+ pageRange?: string;
1091
+ /**
1092
+ * Filename pattern for output files
1093
+ * Available placeholders: {page}, {total}, {name}
1094
+ * @default 'page-{page}.{ext}'
1095
+ */
1096
+ filenamePattern?: string;
1097
+ /**
1098
+ * Background color for transparent PDFs
1099
+ * @default '#FFFFFF'
1100
+ */
1101
+ backgroundColor?: string;
1102
+ /**
1103
+ * Enable transparent background (PNG only)
1104
+ * @default false
1105
+ */
1106
+ transparent?: boolean;
1107
+ /**
1108
+ * Crop to content (remove white margins)
1109
+ * @default false
1110
+ */
1111
+ cropToContent?: boolean;
1112
+ /**
1113
+ * Progress callback
1114
+ */
1115
+ onProgress?: (current: number, total: number, percentage: number) => void;
1116
+ /**
1117
+ * Callback when a page is converted
1118
+ */
1119
+ onPageComplete?: (pageNumber: number, filepath: string) => void;
1120
+ /**
1121
+ * Verbose logging
1122
+ * @default false
1123
+ */
1124
+ verbose?: boolean;
1125
+ }
1126
+ /**
1127
+ * Result of page to image conversion
1128
+ */
1129
+ interface PageImageResult {
1130
+ /**
1131
+ * Page number (1-based)
1132
+ */
1133
+ page: number;
1134
+ /**
1135
+ * Output file path
1136
+ */
1137
+ filepath: string;
1138
+ /**
1139
+ * Image width in pixels
1140
+ */
1141
+ width: number;
1142
+ /**
1143
+ * Image height in pixels
1144
+ */
1145
+ height: number;
1146
+ /**
1147
+ * File size in bytes
1148
+ */
1149
+ fileSize: number;
1150
+ /**
1151
+ * Image format
1152
+ */
1153
+ format: PageImageFormat;
1154
+ }
1155
+ /**
1156
+ * Result of converting all pages
1157
+ */
1158
+ interface PageToImageResult {
1159
+ /**
1160
+ * Array of converted page images
1161
+ */
1162
+ images: PageImageResult[];
1163
+ /**
1164
+ * Total number of pages converted
1165
+ */
1166
+ totalPages: number;
1167
+ /**
1168
+ * Output directory
1169
+ */
1170
+ outputDir: string;
1171
+ /**
1172
+ * Total size of all images in bytes
1173
+ */
1174
+ totalSize: number;
1175
+ }
1176
+ /**
1177
+ * Options for converting a single page
1178
+ */
1179
+ interface SinglePageOptions {
1180
+ /**
1181
+ * Image format
1182
+ * @default 'png'
1183
+ */
1184
+ format?: PageImageFormat;
1185
+ /**
1186
+ * JPEG quality (1-100)
1187
+ * @default 90
1188
+ */
1189
+ quality?: number;
1190
+ /**
1191
+ * DPI for rendering
1192
+ * @default 72
1193
+ */
1194
+ dpi?: number;
1195
+ /**
1196
+ * Scale factor
1197
+ * @default 1
1198
+ */
1199
+ scale?: number;
1200
+ /**
1201
+ * Background color
1202
+ * @default '#FFFFFF'
1203
+ */
1204
+ backgroundColor?: string;
1205
+ /**
1206
+ * Transparent background (PNG only)
1207
+ * @default false
1208
+ */
1209
+ transparent?: boolean;
1210
+ }
1211
+ /**
1212
+ * Thumbnail generation options
1213
+ */
1214
+ interface ThumbnailOptions extends SinglePageOptions {
1215
+ /**
1216
+ * Maximum width in pixels
1217
+ * @default 200
1218
+ */
1219
+ maxWidth?: number;
1220
+ /**
1221
+ * Maximum height in pixels
1222
+ * @default 200
1223
+ */
1224
+ maxHeight?: number;
1225
+ /**
1226
+ * Maintain aspect ratio
1227
+ * @default true
1228
+ */
1229
+ maintainAspectRatio?: boolean;
1230
+ }
1231
+
1232
+ /**
1233
+ * PDF Page to Image Converter using pdf.js
1234
+ *
1235
+ * Converts PDF pages to image files (PNG, JPG, WebP) with customizable options.
1236
+ * Uses Mozilla's pdf.js for high-quality rendering without external dependencies.
1237
+ */
1238
+
1239
+ /**
1240
+ * Page to Image Converter
1241
+ *
1242
+ * @example
1243
+ * ```typescript
1244
+ * const converter = new PageToImageConverter();
1245
+ * const result = await converter.convertToImages('document.pdf', {
1246
+ * outputDir: './pages',
1247
+ * format: 'png',
1248
+ * dpi: 150
1249
+ * });
1250
+ * ```
1251
+ */
1252
+ declare class PageToImageConverter {
1253
+ private pdfjs;
1254
+ /**
1255
+ * Get or load pdf.js module with proper worker configuration
1256
+ * Based on pdf-to-img library approach
1257
+ */
1258
+ private getPdfjs;
1259
+ /**
1260
+ * Convert all pages of a PDF to images
1261
+ *
1262
+ * @param pdfPath - Path to PDF file
1263
+ * @param options - Conversion options
1264
+ * @returns Conversion result with image paths
1265
+ */
1266
+ convertToImages(pdfPath: string, options?: PageToImageOptions): Promise<PageToImageResult>;
1267
+ /**
1268
+ * Convert a single page to an image file
1269
+ *
1270
+ * @param pdfPath - Path to PDF file
1271
+ * @param pageNumber - Page number (1-based)
1272
+ * @param outputPath - Output file path
1273
+ * @param options - Conversion options
1274
+ */
1275
+ convertPage(pdfPath: string, pageNumber: number, outputPath: string, options?: SinglePageOptions): Promise<PageImageResult>;
1276
+ /**
1277
+ * Convert a page to a buffer (no file write)
1278
+ *
1279
+ * @param pdfPath - Path to PDF file
1280
+ * @param pageNumber - Page number (1-based)
1281
+ * @param options - Conversion options
1282
+ * @returns Image buffer
1283
+ */
1284
+ convertPageToBuffer(pdfPath: string, pageNumber: number, options?: SinglePageOptions): Promise<Buffer>;
1285
+ /**
1286
+ * Convert a page to base64 string
1287
+ *
1288
+ * @param pdfPath - Path to PDF file
1289
+ * @param pageNumber - Page number (1-based)
1290
+ * @param options - Conversion options
1291
+ * @returns Base64 encoded image
1292
+ */
1293
+ convertPageToBase64(pdfPath: string, pageNumber: number, options?: SinglePageOptions): Promise<string>;
1294
+ /**
1295
+ * Generate thumbnails for all pages
1296
+ *
1297
+ * @param pdfPath - Path to PDF file
1298
+ * @param options - Thumbnail options
1299
+ * @returns Conversion result
1300
+ */
1301
+ generateThumbnails(pdfPath: string, options?: ThumbnailOptions & {
1302
+ outputDir?: string;
1303
+ }): Promise<PageToImageResult>;
1304
+ /**
1305
+ * Render a PDF page to image buffer
1306
+ *
1307
+ * Based on pdf-to-img library approach - let pdf.js handle canvas creation
1308
+ * @see https://github.com/k-yle/pdf-to-img
1309
+ */
1310
+ private renderPageToBuffer;
1311
+ /**
1312
+ * Convert canvas to image buffer
1313
+ */
1314
+ private canvasToBuffer;
1315
+ /**
1316
+ * Get page numbers to convert based on options
1317
+ */
1318
+ private getPageNumbers;
1319
+ /**
1320
+ * Parse page range string (e.g., "1-5", "1,3,5-10")
1321
+ */
1322
+ private parsePageRange;
1323
+ /**
1324
+ * Generate filename from pattern
1325
+ */
1326
+ private generateFilename;
1327
+ /**
1328
+ * Format bytes to human-readable string
1329
+ */
1330
+ private formatBytes;
1331
+ }
1332
+
1333
+ /**
1334
+ * Result of image optimization
1335
+ */
1336
+ interface OptimizationResult {
1337
+ success: boolean;
1338
+ originalSize: number;
1339
+ optimizedSize: number;
1340
+ savedBytes: number;
1341
+ savedPercent: number;
1342
+ engine: "jimp" | "sharp" | "none";
1343
+ error?: string;
1344
+ }
1345
+ /**
1346
+ * Options for image optimization
1347
+ */
1348
+ interface OptimizationOptions {
1349
+ quality?: number;
1350
+ verbose?: boolean;
1351
+ useSharp?: boolean;
1352
+ }
1353
+ /**
1354
+ * Image optimizer using Jimp (pure JavaScript), with an optional Sharp path
1355
+ *
1356
+ * This class provides image optimization capabilities using Jimp, a pure JavaScript
1357
+ * image processing library with no native dependencies. It supports JPEG and PNG
1358
+ * optimization with quality control. A Sharp-based path is also declared (see the `useSharp` option).
1359
+ *
1360
+ * @example
1361
+ * ```typescript
1362
+ * const result = await ImageOptimizer.optimizeFile('image.jpg', {
1363
+ * useSharp: true,
1364
+ * quality: 80
1365
+ * });
1366
+ *
1367
+ * console.log(`Saved ${result.savedPercent.toFixed(1)}% using ${result.engine}`);
1368
+ * ```
1369
+ */
1370
+ declare class ImageOptimizer {
1371
+ /**
1372
+ * Optimize an image file in-place
1373
+ *
1374
+ * The original file will be replaced with the optimized version.
1375
+ * If optimization fails, the original file remains unchanged.
1376
+ *
1377
+ * @param filePath - Path to the image file to optimize
1378
+ * @param options - Optimization options (`quality`, `verbose`, `useSharp`)
1379
+ * @returns Promise resolving to optimization result (sizes, savings, and the engine reported as "jimp", "sharp" or "none")
1380
+ */
1381
+ static optimizeFile(filePath: string, options?: OptimizationOptions): Promise<OptimizationResult>;
1382
+ /**
1383
+ * Optimize using Sharp (optional dependency)
1384
+ */
1385
+ private static optimizeWithSharp;
1386
+ /**
1387
+ * Optimize using Jimp (pure JavaScript)
1388
+ */
1389
+ private static optimizeWithJimp;
1390
+ /**
1391
+ * Convert JPEG 2000 formats (jp2, jpx, j2c, jpm) to JPG
1392
+ *
1393
+ * JPEG 2000 files are not widely supported by browsers and image tools.
1394
+ * This method converts them to standard JPG format for better compatibility.
1395
+ *
1396
+ * Supports two conversion engines:
1397
+ * - Jimp (default): Pure JavaScript, works everywhere
1398
+ * - Sharp (optional): Better color preservation, requires native compilation
1399
+ *
1400
+ * @param jp2Path - Path to the JPEG 2000 file (jp2, jpx, j2c, or jpm)
1401
+ * @param options - Conversion options (`quality`, `verbose`, `useSharp`)
1402
+ * @returns Promise resolving to conversion result; `newPath`, `originalSize`, `newSize` and `error` are optional fields
1403
+ */
1404
+ static convertJp2ToJpg(jp2Path: string, options?: {
1405
+ quality?: number;
1406
+ verbose?: boolean;
1407
+ useSharp?: boolean;
1408
+ }): Promise<{
1409
+ success: boolean;
1410
+ newPath?: string;
1411
+ originalSize?: number;
1412
+ newSize?: number;
1413
+ error?: string;
1414
+ }>;
1415
+ }
1416
+
532
1417
  /**
533
1418
  * Handles formatting of image references and text processing
534
1419
  */
@@ -596,12 +1481,23 @@ declare function validateImageRefFormat(format: string): ValidationError[];
596
1481
  */
597
1482
  declare function validateFilePath(filePath: string, extensions?: string[]): ValidationError[];
598
1483
 
1484
+ /**
1485
+ * pdf-plus - A comprehensive PDF content extraction library
1486
+ *
1487
+ * Main entry point for the PDF content extraction library.
1488
+ * Provides both high-level convenience functions and low-level access to extractors.
1489
+ *
1490
+ * @packageDocumentation
1491
+ */
1492
+
599
1493
  /**
600
1494
  * Extract content from a PDF file (convenience function)
601
1495
  *
1496
+ * Automatically switches to streaming mode for large PDFs if `autoStreamThreshold` is set.
1497
+ *
602
1498
  * @param pdfPath - Path to the PDF file
603
1499
  * @param options - Extraction options
604
- * @returns Promise resolving to extraction result
1500
+ * @returns Promise resolving to an `ExtractionResult`, or to a `StreamingExtractionResult` when auto-streaming is triggered
605
1501
  *
606
1502
  * @example
607
1503
  * ```typescript
@@ -615,8 +1511,17 @@ declare function validateFilePath(filePath: string, extensions?: string[]): Vali
615
1511
  *
616
1512
  * console.log(`Extracted ${result.images.length} images from ${result.document.pages} pages`);
617
1513
  * ```
1514
+ *
1515
+ * @example
1516
+ * ```typescript
1517
+ * // Auto-streaming for large PDFs
1518
+ * const result = await extractPdfContent('large-document.pdf', {
1519
+ * extractImageFiles: true,
1520
+ * autoStreamThreshold: 100, // Auto-stream if > 100 pages
1521
+ * });
1522
+ * ```
618
1523
  */
619
- declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult>;
1524
+ declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult | StreamingExtractionResult>;
620
1525
  /**
621
1526
  * Extract only text content from a PDF (convenience function)
622
1527
  *
@@ -673,22 +1578,70 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
673
1578
  * ```
674
1579
  */
675
1580
  declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
676
- declare const version = "1.0.0";
677
-
1581
+ /**
1582
+ * Extract PDF content in streaming mode
1583
+ *
1584
+ * For large PDFs, this provides a streaming API that processes pages one at a time,
1585
+ * reducing memory usage and providing real-time progress updates.
1586
+ *
1587
+ * @param pdfPath - Path to the PDF file
1588
+ * @param options - Extraction and streaming options
1589
+ * @returns StreamingExtractionResult with async iterator and event callbacks
1590
+ *
1591
+ * @example
1592
+ * ```typescript
1593
+ * // Using async iterator
1594
+ * const stream = extractPdfStream('large-document.pdf', {
1595
+ * extractImageFiles: true,
1596
+ * imageOutputDir: './images',
1597
+ * streamMode: true
1598
+ * });
1599
+ *
1600
+ * for await (const event of stream) {
1601
+ * if (event.type === 'page') {
1602
+ * console.log(`Processed page ${event.pageNumber}/${event.totalPages}`);
1603
+ * } else if (event.type === 'progress') {
1604
+ * console.log(`Progress: ${event.percentComplete.toFixed(1)}%`);
1605
+ * }
1606
+ * }
1607
+ *
1608
+ * // Using event callbacks
1609
+ * const callbackStream = extractPdfStream('large-document.pdf', { streamMode: true })
1610
+ * .on('page', (event) => console.log(`Page ${event.pageNumber} done`))
1611
+ * .on('progress', (event) => console.log(`${event.percentComplete}% complete`))
1612
+ * .on('complete', (event) => console.log(`Done! ${event.totalImages} images`));
1613
+ *
1614
+ * for await (const event of callbackStream) {
1615
+ * // Events are also available via iterator
1616
+ * }
1617
+ * ```
1618
+ */
1619
+ declare function extractPdfStream(pdfPath: string, options?: Partial<ExtractionOptions>): StreamingExtractionResult;
1620
+ /**
1621
+ * Library version
1622
+ */
1623
+ declare const version = "1.0.3";
1624
+ /**
1625
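+ * Default export bundling the core public APIs (some named exports, e.g. `PageToImageConverter` and `StructuredTextExtractor`, are only available as named exports)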
1626
+ * Useful for CommonJS: const pdfPlus = require('pdf-plus');
1627
+ */
678
1628
  declare const _default: {
679
1629
  PDFExtractor: typeof PDFExtractor;
680
1630
  pdfExtractor: PDFExtractor;
1631
+ StreamingPDFExtractor: typeof StreamingPDFExtractor;
681
1632
  TextExtractor: typeof TextExtractor;
682
1633
  ImageExtractor: typeof ImageExtractor;
1634
+ ImageOptimizer: typeof ImageOptimizer;
683
1635
  FormatProcessor: typeof FormatProcessor;
684
1636
  extractPdfContent: typeof extractPdfContent;
685
1637
  extractText: typeof extractText;
686
1638
  extractImages: typeof extractImages;
687
1639
  extractImageFiles: typeof extractImageFiles;
1640
+ extractPdfStream: typeof extractPdfStream;
688
1641
  validateConfig: typeof validateConfig;
689
1642
  validateImageRefFormat: typeof validateImageRefFormat;
690
1643
  validateFilePath: typeof validateFilePath;
691
1644
  version: string;
692
1645
  };
693
1646
 
694
- export { type AnalyticsData, type DocumentMetadata, type DocumentSummary, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageExtractionEngine, ImageExtractor, type ImageItem, type MemoryUsage, type OCROptions, PDFExtractor, type PageInfo, type Position, type ProcessingPhase, type ProgressInfo, type StreamingOptions, type TemplateOptions, TextExtractor, type TextItem, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
1647
+ export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };