pdf-plus 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +303 -2
- package/dist/index.d.mts +973 -21
- package/dist/index.d.ts +973 -21
- package/dist/index.js +35 -36
- package/dist/index.mjs +35 -36
- package/dist/workers/image-decoder.worker.d.mts +2 -0
- package/dist/workers/image-decoder.worker.d.ts +2 -0
- package/dist/workers/image-decoder.worker.js +2 -0
- package/dist/workers/image-decoder.worker.mjs +2 -0
- package/dist/workers/jp2-converter.worker.d.mts +2 -0
- package/dist/workers/jp2-converter.worker.d.ts +2 -0
- package/dist/workers/jp2-converter.worker.js +2 -0
- package/dist/workers/jp2-converter.worker.mjs +2 -0
- package/package.json +22 -7
package/dist/index.d.mts
CHANGED
|
@@ -1,3 +1,202 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Types for streaming PDF extraction
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Event types emitted during streaming extraction
|
|
7
|
+
*/
|
|
8
|
+
type StreamEventType = "start" | "page" | "image" | "progress" | "complete" | "error";
|
|
9
|
+
/**
|
|
10
|
+
* Base event structure
|
|
11
|
+
*/
|
|
12
|
+
interface StreamEvent {
|
|
13
|
+
type: StreamEventType;
|
|
14
|
+
timestamp: number;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Start event - emitted when extraction begins
|
|
18
|
+
*/
|
|
19
|
+
interface StartEvent extends StreamEvent {
|
|
20
|
+
type: "start";
|
|
21
|
+
totalPages: number;
|
|
22
|
+
pdfPath: string;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Page event - emitted when a page is processed
|
|
26
|
+
*/
|
|
27
|
+
interface PageEvent extends StreamEvent {
|
|
28
|
+
type: "page";
|
|
29
|
+
pageNumber: number;
|
|
30
|
+
totalPages: number;
|
|
31
|
+
textLength: number;
|
|
32
|
+
imageCount: number;
|
|
33
|
+
pageInfo?: PageInfo;
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* Image event - emitted when an image is extracted
|
|
37
|
+
*/
|
|
38
|
+
interface ImageEvent extends StreamEvent {
|
|
39
|
+
type: "image";
|
|
40
|
+
image: ImageItem;
|
|
41
|
+
pageNumber: number;
|
|
42
|
+
imageIndex: number;
|
|
43
|
+
totalImages: number;
|
|
44
|
+
}
|
|
45
|
+
/**
|
|
46
|
+
* Progress event - emitted periodically during extraction
|
|
47
|
+
*/
|
|
48
|
+
interface ProgressEvent extends StreamEvent {
|
|
49
|
+
type: "progress";
|
|
50
|
+
pagesProcessed: number;
|
|
51
|
+
totalPages: number;
|
|
52
|
+
imagesExtracted: number;
|
|
53
|
+
percentComplete: number;
|
|
54
|
+
estimatedTimeRemaining?: number;
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Complete event - emitted when extraction finishes
|
|
58
|
+
*/
|
|
59
|
+
interface CompleteEvent extends StreamEvent {
|
|
60
|
+
type: "complete";
|
|
61
|
+
totalPages: number;
|
|
62
|
+
totalImages: number;
|
|
63
|
+
totalTextLength: number;
|
|
64
|
+
duration: number;
|
|
65
|
+
}
|
|
66
|
+
/**
|
|
67
|
+
* Error event - emitted when an error occurs
|
|
68
|
+
*/
|
|
69
|
+
interface ErrorEvent extends StreamEvent {
|
|
70
|
+
type: "error";
|
|
71
|
+
error: Error;
|
|
72
|
+
pageNumber?: number;
|
|
73
|
+
recoverable: boolean;
|
|
74
|
+
}
|
|
75
|
+
/**
|
|
76
|
+
* Union type of all stream events
|
|
77
|
+
*/
|
|
78
|
+
type StreamEventUnion = StartEvent | PageEvent | ImageEvent | ProgressEvent | CompleteEvent | ErrorEvent;
|
|
79
|
+
/**
|
|
80
|
+
* Streaming extraction options
|
|
81
|
+
*/
|
|
82
|
+
interface StreamingOptions$1 {
|
|
83
|
+
/**
|
|
84
|
+
* Enable streaming mode
|
|
85
|
+
* @default false
|
|
86
|
+
*/
|
|
87
|
+
streamMode?: boolean;
|
|
88
|
+
/**
|
|
89
|
+
* Automatically enable streaming for PDFs with more than this many pages
|
|
90
|
+
* @default 100
|
|
91
|
+
*/
|
|
92
|
+
autoStreamThreshold?: number;
|
|
93
|
+
/**
|
|
94
|
+
* Enable backpressure handling (pause extraction if consumer is slow)
|
|
95
|
+
* @default true
|
|
96
|
+
*/
|
|
97
|
+
enableBackpressure?: boolean;
|
|
98
|
+
/**
|
|
99
|
+
* Maximum number of pages to buffer before pausing (backpressure)
|
|
100
|
+
* @default 10
|
|
101
|
+
*/
|
|
102
|
+
maxBufferedPages?: number;
|
|
103
|
+
/**
|
|
104
|
+
* Emit progress events every N pages
|
|
105
|
+
* @default 5
|
|
106
|
+
*/
|
|
107
|
+
progressInterval?: number;
|
|
108
|
+
/**
|
|
109
|
+
* Enable event callbacks (in addition to async iterator)
|
|
110
|
+
* @default false
|
|
111
|
+
*/
|
|
112
|
+
enableEventCallbacks?: boolean;
|
|
113
|
+
}
|
|
114
|
+
/**
|
|
115
|
+
* Event callback function type
|
|
116
|
+
*/
|
|
117
|
+
type StreamEventCallback = (event: StreamEventUnion) => void | Promise<void>;
|
|
118
|
+
/**
|
|
119
|
+
* Event callbacks map
|
|
120
|
+
*/
|
|
121
|
+
interface StreamEventCallbacks {
|
|
122
|
+
onStart?: (event: StartEvent) => void | Promise<void>;
|
|
123
|
+
onPage?: (event: PageEvent) => void | Promise<void>;
|
|
124
|
+
onImage?: (event: ImageEvent) => void | Promise<void>;
|
|
125
|
+
onProgress?: (event: ProgressEvent) => void | Promise<void>;
|
|
126
|
+
onComplete?: (event: CompleteEvent) => void | Promise<void>;
|
|
127
|
+
onError?: (event: ErrorEvent) => void | Promise<void>;
|
|
128
|
+
onAny?: StreamEventCallback;
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Streaming extraction result (async iterator)
|
|
132
|
+
*/
|
|
133
|
+
interface StreamingExtractionResult {
|
|
134
|
+
/**
|
|
135
|
+
* Async iterator for streaming events
|
|
136
|
+
*/
|
|
137
|
+
[Symbol.asyncIterator](): AsyncIterator<StreamEventUnion>;
|
|
138
|
+
/**
|
|
139
|
+
* Register event callbacks
|
|
140
|
+
*/
|
|
141
|
+
on(event: "start", callback: (event: StartEvent) => void | Promise<void>): this;
|
|
142
|
+
on(event: "page", callback: (event: PageEvent) => void | Promise<void>): this;
|
|
143
|
+
on(event: "image", callback: (event: ImageEvent) => void | Promise<void>): this;
|
|
144
|
+
on(event: "progress", callback: (event: ProgressEvent) => void | Promise<void>): this;
|
|
145
|
+
on(event: "complete", callback: (event: CompleteEvent) => void | Promise<void>): this;
|
|
146
|
+
on(event: "error", callback: (event: ErrorEvent) => void | Promise<void>): this;
|
|
147
|
+
on(event: "any", callback: StreamEventCallback): this;
|
|
148
|
+
/**
|
|
149
|
+
* Cancel the streaming extraction
|
|
150
|
+
*/
|
|
151
|
+
cancel(): Promise<void>;
|
|
152
|
+
/**
|
|
153
|
+
* Pause the streaming extraction (backpressure)
|
|
154
|
+
*/
|
|
155
|
+
pause(): void;
|
|
156
|
+
/**
|
|
157
|
+
* Resume the streaming extraction
|
|
158
|
+
*/
|
|
159
|
+
resume(): void;
|
|
160
|
+
/**
|
|
161
|
+
* Get current streaming statistics
|
|
162
|
+
*/
|
|
163
|
+
getStats(): StreamingStats;
|
|
164
|
+
}
|
|
165
|
+
/**
|
|
166
|
+
* Streaming statistics
|
|
167
|
+
*/
|
|
168
|
+
interface StreamingStats {
|
|
169
|
+
pagesProcessed: number;
|
|
170
|
+
totalPages: number;
|
|
171
|
+
imagesExtracted: number;
|
|
172
|
+
bytesProcessed: number;
|
|
173
|
+
startTime: number;
|
|
174
|
+
elapsedTime: number;
|
|
175
|
+
isPaused: boolean;
|
|
176
|
+
isCancelled: boolean;
|
|
177
|
+
isComplete: boolean;
|
|
178
|
+
averagePageTime: number;
|
|
179
|
+
estimatedTimeRemaining: number;
|
|
180
|
+
}
|
|
181
|
+
/**
|
|
182
|
+
* Internal streaming state
|
|
183
|
+
*/
|
|
184
|
+
interface StreamingState {
|
|
185
|
+
totalPages: number;
|
|
186
|
+
pagesProcessed: number;
|
|
187
|
+
imagesExtracted: number;
|
|
188
|
+
totalTextLength: number;
|
|
189
|
+
bytesProcessed: number;
|
|
190
|
+
startTime: number;
|
|
191
|
+
lastProgressTime: number;
|
|
192
|
+
isPaused: boolean;
|
|
193
|
+
isCancelled: boolean;
|
|
194
|
+
isComplete: boolean;
|
|
195
|
+
bufferedPages: number;
|
|
196
|
+
eventQueue: StreamEventUnion[];
|
|
197
|
+
callbacks: StreamEventCallbacks;
|
|
198
|
+
}
|
|
199
|
+
|
|
1
200
|
/**
|
|
2
201
|
* Core types for PDF content extraction
|
|
3
202
|
*/
|
|
@@ -86,9 +285,9 @@ interface StructuredPageData {
|
|
|
86
285
|
totalImages: number;
|
|
87
286
|
extractionOptions: ExtractionOptions;
|
|
88
287
|
};
|
|
89
|
-
pages: PageData[];
|
|
288
|
+
pages: PageData$1[];
|
|
90
289
|
}
|
|
91
|
-
interface PageData {
|
|
290
|
+
interface PageData$1 {
|
|
92
291
|
pageNumber: number;
|
|
93
292
|
text: {
|
|
94
293
|
content: string;
|
|
@@ -98,6 +297,32 @@ interface PageData {
|
|
|
98
297
|
};
|
|
99
298
|
images: PageImageData[];
|
|
100
299
|
imageCount: number;
|
|
300
|
+
pageImage?: {
|
|
301
|
+
path: string;
|
|
302
|
+
format: string;
|
|
303
|
+
width: number;
|
|
304
|
+
height: number;
|
|
305
|
+
size: number;
|
|
306
|
+
dpi?: number;
|
|
307
|
+
quality?: number;
|
|
308
|
+
};
|
|
309
|
+
thumbnail?: {
|
|
310
|
+
path: string;
|
|
311
|
+
format: string;
|
|
312
|
+
width: number;
|
|
313
|
+
height: number;
|
|
314
|
+
size: number;
|
|
315
|
+
quality?: number;
|
|
316
|
+
};
|
|
317
|
+
pageImageVariants?: Array<{
|
|
318
|
+
path: string;
|
|
319
|
+
format: string;
|
|
320
|
+
width: number;
|
|
321
|
+
height: number;
|
|
322
|
+
size: number;
|
|
323
|
+
quality: number;
|
|
324
|
+
dpi?: number;
|
|
325
|
+
}>;
|
|
101
326
|
}
|
|
102
327
|
interface PageImageData {
|
|
103
328
|
id: string;
|
|
@@ -112,8 +337,10 @@ interface PageImageData {
|
|
|
112
337
|
};
|
|
113
338
|
format: string;
|
|
114
339
|
size?: number;
|
|
340
|
+
width?: number;
|
|
341
|
+
height?: number;
|
|
342
|
+
mimeType?: string;
|
|
115
343
|
}
|
|
116
|
-
type ImageExtractionEngine = "pdf-lib" | "poppler" | "auto";
|
|
117
344
|
interface ExtractionOptions {
|
|
118
345
|
extractText?: boolean;
|
|
119
346
|
extractImages?: boolean;
|
|
@@ -132,14 +359,116 @@ interface ExtractionOptions {
|
|
|
132
359
|
extractTextItems?: boolean;
|
|
133
360
|
specificPages?: number[];
|
|
134
361
|
useCache?: boolean;
|
|
135
|
-
/**
|
|
136
|
-
|
|
362
|
+
/** Enable image optimization after extraction (uses Jimp - pure JavaScript, default: false) */
|
|
363
|
+
optimizeImages?: boolean;
|
|
364
|
+
/** Image quality for optimization (0-100, default: 80) */
|
|
365
|
+
imageQuality?: number;
|
|
366
|
+
/**
|
|
367
|
+
* Convert JPEG 2000 images to JPG format for better compatibility.
|
|
368
|
+
* (default: true - convert JP2 to JPG)
|
|
369
|
+
*/
|
|
370
|
+
convertJp2ToJpg?: boolean;
|
|
371
|
+
/**
|
|
372
|
+
* Preserve JPEG 2000 images in their original format.
|
|
373
|
+
* By default (false), JPEG 2000 images (jp2, jpx, j2c, jpm) are converted to JPG for better compatibility.
|
|
374
|
+
* Set to true to keep JPEG 2000 files in their original format.
|
|
375
|
+
*
|
|
376
|
+
* Note: JP2 images from PDFs are automatically decoded by PDF.js during extraction.
|
|
377
|
+
* This option only affects standalone JP2 files.
|
|
378
|
+
* (default: false - convert to JPG)
|
|
379
|
+
*/
|
|
380
|
+
preserveJp2?: boolean;
|
|
381
|
+
/**
|
|
382
|
+
* Use Sharp library for ALL image processing operations (better quality & performance).
|
|
383
|
+
*
|
|
384
|
+
* When enabled, Sharp is used as the global image processing engine for:
|
|
385
|
+
* - JP2 to JPG conversion
|
|
386
|
+
* - Image optimization
|
|
387
|
+
* - Image resizing
|
|
388
|
+
* - Format conversions
|
|
389
|
+
*
|
|
390
|
+
* Sharp is an OPTIONAL dependency. Install it for better performance:
|
|
391
|
+
* ```bash
|
|
392
|
+
* npm install sharp
|
|
393
|
+
* ```
|
|
394
|
+
*
|
|
395
|
+
* If Sharp is not installed, the library will automatically fall back to pure JavaScript (Jimp).
|
|
396
|
+
*
|
|
397
|
+
* (default: false - use pure JS Jimp)
|
|
398
|
+
*/
|
|
399
|
+
useSharp?: boolean;
|
|
400
|
+
/** Enable parallel processing for better performance (default: true) */
|
|
401
|
+
parallelProcessing?: boolean;
|
|
402
|
+
/** Maximum number of pages to process in parallel (default: 10) */
|
|
403
|
+
maxConcurrentPages?: number;
|
|
404
|
+
/** Maximum number of images per page to extract in parallel (default: 20) */
|
|
405
|
+
maxConcurrentImages?: number;
|
|
406
|
+
/** Maximum number of JP2 to JPG conversions in parallel (default: 5) */
|
|
407
|
+
maxConcurrentConversions?: number;
|
|
408
|
+
/** Maximum number of image optimizations in parallel (default: 5) */
|
|
409
|
+
maxConcurrentOptimizations?: number;
|
|
410
|
+
/** Enable worker threads for CPU-intensive operations (default: false) */
|
|
411
|
+
useWorkerThreads?: boolean;
|
|
412
|
+
/** Auto-scale workers based on system resources (default: true) */
|
|
413
|
+
autoScaleWorkers?: boolean;
|
|
414
|
+
/** Maximum number of worker threads (default: CPU cores - 1) */
|
|
415
|
+
maxWorkerThreads?: number;
|
|
416
|
+
/** Minimum number of worker threads to keep alive (default: 1) */
|
|
417
|
+
minWorkerThreads?: number;
|
|
418
|
+
/** Memory threshold for scaling down workers 0-1 (default: 0.8) */
|
|
419
|
+
memoryThreshold?: number;
|
|
420
|
+
/** CPU threshold for scaling up workers 0-1 (default: 0.9) */
|
|
421
|
+
cpuThreshold?: number;
|
|
422
|
+
/** Worker task timeout in milliseconds (default: 30000) */
|
|
423
|
+
workerTaskTimeout?: number;
|
|
424
|
+
/** Worker idle timeout in milliseconds (default: 60000) */
|
|
425
|
+
workerIdleTimeout?: number;
|
|
426
|
+
/** Memory limit per worker in MB (default: 512) */
|
|
427
|
+
workerMemoryLimit?: number;
|
|
428
|
+
/** Use workers for JP2 conversion (default: true) */
|
|
429
|
+
enableWorkerForConversion?: boolean;
|
|
430
|
+
/** Use workers for image optimization (default: true) */
|
|
431
|
+
enableWorkerForOptimization?: boolean;
|
|
432
|
+
/** Use workers for image decoding (default: true) */
|
|
433
|
+
enableWorkerForDecoding?: boolean;
|
|
434
|
+
/** Enable streaming mode for large PDFs (default: false) */
|
|
435
|
+
streamMode?: boolean;
|
|
436
|
+
/** Automatically enable streaming for PDFs with more than this many pages (default: 100) */
|
|
437
|
+
autoStreamThreshold?: number;
|
|
438
|
+
/** Enable backpressure handling (pause extraction if consumer is slow) (default: true) */
|
|
439
|
+
enableBackpressure?: boolean;
|
|
440
|
+
/** Maximum number of pages to buffer before pausing (default: 10) */
|
|
441
|
+
maxBufferedPages?: number;
|
|
442
|
+
/** Emit progress events every N pages (default: 5) */
|
|
443
|
+
progressInterval?: number;
|
|
444
|
+
/** Enable event callbacks in addition to async iterator (default: false) */
|
|
445
|
+
enableEventCallbacks?: boolean;
|
|
137
446
|
cacheDir?: string;
|
|
138
447
|
baseName?: string;
|
|
139
448
|
verbose?: boolean;
|
|
140
449
|
memoryLimit?: string;
|
|
141
450
|
batchSize?: number;
|
|
142
451
|
progressCallback?: (progress: ProgressInfo) => void;
|
|
452
|
+
/** Generate page images (default: false) */
|
|
453
|
+
generatePageImages?: boolean;
|
|
454
|
+
/** Generate thumbnails for pages (default: false) */
|
|
455
|
+
generateThumbnails?: boolean;
|
|
456
|
+
/** Include page images in structured output (default: false) */
|
|
457
|
+
includePageImagesInStructuredData?: boolean;
|
|
458
|
+
/** Page numbers to generate images for (default: all pages) */
|
|
459
|
+
pageNumbers?: number[];
|
|
460
|
+
/** Generate multiple quality variants of page images */
|
|
461
|
+
pageImageQualities?: number[];
|
|
462
|
+
/** DPI for page images (default: 150) */
|
|
463
|
+
pageImageDpi?: number;
|
|
464
|
+
/** Format for page images: 'png' | 'jpg' (default: 'png') */
|
|
465
|
+
pageImageFormat?: "png" | "jpg";
|
|
466
|
+
/** Quality for JPG page images (default: 90) */
|
|
467
|
+
pageImageQuality?: number;
|
|
468
|
+
/** Thumbnail width (default: 200) */
|
|
469
|
+
thumbnailWidth?: number;
|
|
470
|
+
/** Thumbnail quality for JPG (default: 80) */
|
|
471
|
+
thumbnailQuality?: number;
|
|
143
472
|
}
|
|
144
473
|
interface ProgressInfo {
|
|
145
474
|
currentPage: number;
|
|
@@ -243,6 +572,7 @@ interface TemplateOptions {
|
|
|
243
572
|
declare class PDFExtractor {
|
|
244
573
|
private textExtractor;
|
|
245
574
|
private imageExtractor;
|
|
575
|
+
private pageToImageConverter;
|
|
246
576
|
private formatProcessor;
|
|
247
577
|
private structuredDataGenerator;
|
|
248
578
|
private cacheManager;
|
|
@@ -356,6 +686,14 @@ declare class PDFExtractor {
|
|
|
356
686
|
totalCacheSize: number;
|
|
357
687
|
cacheDir: string;
|
|
358
688
|
};
|
|
689
|
+
/**
|
|
690
|
+
* Generate page images with multiple quality variants
|
|
691
|
+
*/
|
|
692
|
+
private generatePageImagesWithVariants;
|
|
693
|
+
/**
|
|
694
|
+
* Generate thumbnails for pages
|
|
695
|
+
*/
|
|
696
|
+
private generatePageThumbnails;
|
|
359
697
|
private reportProgress;
|
|
360
698
|
private createValidationError;
|
|
361
699
|
private createExtractionError;
|
|
@@ -363,10 +701,70 @@ declare class PDFExtractor {
|
|
|
363
701
|
declare const pdfExtractor: PDFExtractor;
|
|
364
702
|
|
|
365
703
|
/**
|
|
366
|
-
*
|
|
704
|
+
* Streaming PDF extractor for large PDFs
|
|
705
|
+
* Provides async iterator and event-based APIs
|
|
706
|
+
*/
|
|
707
|
+
|
|
708
|
+
/**
|
|
709
|
+
* Streaming PDF extractor implementation
|
|
710
|
+
*/
|
|
711
|
+
declare class StreamingPDFExtractor implements StreamingExtractionResult {
|
|
712
|
+
private state;
|
|
713
|
+
private options;
|
|
714
|
+
private pdfPath;
|
|
715
|
+
private extractor;
|
|
716
|
+
private eventQueue;
|
|
717
|
+
private resolveNext;
|
|
718
|
+
private extractionPromise;
|
|
719
|
+
constructor(pdfPath: string, options?: ExtractionOptions & StreamingOptions$1);
|
|
720
|
+
/**
|
|
721
|
+
* Async iterator implementation
|
|
722
|
+
*/
|
|
723
|
+
[Symbol.asyncIterator](): AsyncIterator<StreamEventUnion>;
|
|
724
|
+
/**
|
|
725
|
+
* Register event callbacks
|
|
726
|
+
*/
|
|
727
|
+
on(event: "start", callback: (event: StartEvent) => void): this;
|
|
728
|
+
on(event: "page", callback: (event: PageEvent) => void): this;
|
|
729
|
+
on(event: "image", callback: (event: ImageEvent) => void): this;
|
|
730
|
+
on(event: "progress", callback: (event: ProgressEvent) => void): this;
|
|
731
|
+
on(event: "complete", callback: (event: CompleteEvent) => void): this;
|
|
732
|
+
on(event: "error", callback: (event: ErrorEvent) => void): this;
|
|
733
|
+
on(event: "any", callback: (event: StreamEventUnion) => void): this;
|
|
734
|
+
/**
|
|
735
|
+
* Cancel extraction
|
|
736
|
+
*/
|
|
737
|
+
cancel(): Promise<void>;
|
|
738
|
+
/**
|
|
739
|
+
* Pause extraction (backpressure)
|
|
740
|
+
*/
|
|
741
|
+
pause(): void;
|
|
742
|
+
/**
|
|
743
|
+
* Resume extraction
|
|
744
|
+
*/
|
|
745
|
+
resume(): void;
|
|
746
|
+
/**
|
|
747
|
+
* Get streaming statistics
|
|
748
|
+
*/
|
|
749
|
+
getStats(): StreamingStats;
|
|
750
|
+
/**
|
|
751
|
+
* Emit an event
|
|
752
|
+
*/
|
|
753
|
+
private emitEvent;
|
|
754
|
+
/**
|
|
755
|
+
* Start the extraction process
|
|
756
|
+
*/
|
|
757
|
+
private startExtraction;
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
/**
|
|
761
|
+
* Text extraction from PDF files using pdf.js
|
|
367
762
|
*
|
|
368
|
-
*
|
|
369
|
-
*
|
|
763
|
+
* Direct pdf.js-based text extraction with support for:
|
|
764
|
+
* - Page-by-page extraction with accurate boundaries
|
|
765
|
+
* - Text positioning and font information
|
|
766
|
+
* - Metadata retrieval
|
|
767
|
+
* - No external dependencies (uses pdf.js directly)
|
|
370
768
|
*
|
|
371
769
|
* @example
|
|
372
770
|
* ```typescript
|
|
@@ -376,6 +774,19 @@ declare const pdfExtractor: PDFExtractor;
|
|
|
376
774
|
* ```
|
|
377
775
|
*/
|
|
378
776
|
declare class TextExtractor {
|
|
777
|
+
constructor();
|
|
778
|
+
/**
|
|
779
|
+
* Initialize pdf.js worker
|
|
780
|
+
*/
|
|
781
|
+
private initializePdfjs;
|
|
782
|
+
/**
|
|
783
|
+
* Load PDF document
|
|
784
|
+
*/
|
|
785
|
+
private loadDocument;
|
|
786
|
+
/**
|
|
787
|
+
* Extract text from a single page
|
|
788
|
+
*/
|
|
789
|
+
private getPageText;
|
|
379
790
|
/**
|
|
380
791
|
* Extract text content from PDF
|
|
381
792
|
*
|
|
@@ -384,6 +795,17 @@ declare class TextExtractor {
|
|
|
384
795
|
* @throws {Error} When PDF extraction fails
|
|
385
796
|
*/
|
|
386
797
|
extract(pdfPath: string): Promise<any>;
|
|
798
|
+
/**
|
|
799
|
+
* Extract text with metadata
|
|
800
|
+
*
|
|
801
|
+
* @param pdfPath - Path to the PDF file
|
|
802
|
+
* @returns Promise resolving to extraction result with text and metadata
|
|
803
|
+
* @throws {Error} When PDF extraction fails
|
|
804
|
+
*/
|
|
805
|
+
extractWithMetadata(pdfPath: string): Promise<{
|
|
806
|
+
text: string;
|
|
807
|
+
metadata: any;
|
|
808
|
+
}>;
|
|
387
809
|
/**
|
|
388
810
|
* Extract text with page information
|
|
389
811
|
*
|
|
@@ -393,11 +815,7 @@ declare class TextExtractor {
|
|
|
393
815
|
*/
|
|
394
816
|
extractWithPages(pdfPath: string): Promise<any>;
|
|
395
817
|
/**
|
|
396
|
-
*
|
|
397
|
-
*/
|
|
398
|
-
private splitTextIntoPages;
|
|
399
|
-
/**
|
|
400
|
-
* Extract text items with position and metadata
|
|
818
|
+
* Extract text items with position and metadata using pdf.js
|
|
401
819
|
*/
|
|
402
820
|
extractTextItems(pdfPath: string, options?: ExtractionOptions): Promise<TextItem[]>;
|
|
403
821
|
/**
|
|
@@ -439,21 +857,112 @@ declare class TextExtractor {
|
|
|
439
857
|
pageOffset?: number;
|
|
440
858
|
includeImageRefs?: boolean;
|
|
441
859
|
imageRefFormat?: string;
|
|
442
|
-
imageEngine?: ImageExtractionEngine;
|
|
443
860
|
}): Promise<{
|
|
444
861
|
text: string;
|
|
445
|
-
pages: PageData[];
|
|
862
|
+
pages: PageData$1[];
|
|
446
863
|
}>;
|
|
447
864
|
/**
|
|
448
865
|
* Extract text with accurate page boundaries using pdf-lib + pdf-parse
|
|
449
866
|
*/
|
|
450
867
|
extractWithAccuratePages(pdfPath: string): Promise<{
|
|
451
868
|
fullText: string;
|
|
452
|
-
pages: PageData[];
|
|
869
|
+
pages: PageData$1[];
|
|
453
870
|
totalPages: number;
|
|
454
871
|
}>;
|
|
455
872
|
}
|
|
456
873
|
|
|
874
|
+
/**
|
|
875
|
+
* Structured text extractor using both pdf-lib and pdf.js for accurate page-by-page extraction
|
|
876
|
+
*
|
|
877
|
+
* Extracts text with rich metadata including page dimensions, rotation, word counts, and character counts.
|
|
878
|
+
* Uses pdf-lib for accurate page structure and pdf.js for text content.
|
|
879
|
+
*/
|
|
880
|
+
interface PageData {
|
|
881
|
+
pageNumber: number;
|
|
882
|
+
text: string;
|
|
883
|
+
width: number;
|
|
884
|
+
height: number;
|
|
885
|
+
rotation: number;
|
|
886
|
+
mediaBox: number[];
|
|
887
|
+
textItems?: any[];
|
|
888
|
+
wordCount: number;
|
|
889
|
+
characterCount: number;
|
|
890
|
+
}
|
|
891
|
+
declare class StructuredTextExtractor {
|
|
892
|
+
private pdfLibDoc;
|
|
893
|
+
private pdfLibPages;
|
|
894
|
+
private textData;
|
|
895
|
+
constructor();
|
|
896
|
+
/**
|
|
897
|
+
* Initialize pdf.js worker
|
|
898
|
+
*/
|
|
899
|
+
private initializePdfjs;
|
|
900
|
+
/**
|
|
901
|
+
* Process PDF with accurate page-by-page extraction
|
|
902
|
+
*/
|
|
903
|
+
processPDF(pdfPath: string): Promise<{
|
|
904
|
+
totalPages: number;
|
|
905
|
+
pages: PageData[];
|
|
906
|
+
fullText: string;
|
|
907
|
+
}>;
|
|
908
|
+
/**
|
|
909
|
+
* Process with pdf-lib to get accurate page structure
|
|
910
|
+
*/
|
|
911
|
+
private processPDFLib;
|
|
912
|
+
/**
|
|
913
|
+
* Process with pdf.js to extract text page by page
|
|
914
|
+
*/
|
|
915
|
+
private processPDFjs;
|
|
916
|
+
/**
|
|
917
|
+
* Combine results from both libraries
|
|
918
|
+
*/
|
|
919
|
+
private combineResults;
|
|
920
|
+
/**
|
|
921
|
+
* Extract text with page markers using accurate page boundaries
|
|
922
|
+
*/
|
|
923
|
+
extractWithPageMarkers(pdfPath: string, pageMarkerFormat?: string, options?: {
|
|
924
|
+
includeImageRefs?: boolean;
|
|
925
|
+
imageRefFormat?: string;
|
|
926
|
+
}): Promise<{
|
|
927
|
+
text: string;
|
|
928
|
+
cleanText: string;
|
|
929
|
+
numPages: number;
|
|
930
|
+
pages: PageData[];
|
|
931
|
+
}>;
|
|
932
|
+
/**
|
|
933
|
+
* Get specific page data
|
|
934
|
+
*/
|
|
935
|
+
getPage(pageNumber: number): PageData | null;
|
|
936
|
+
/**
|
|
937
|
+
* Get detailed page information including text positioning
|
|
938
|
+
*/
|
|
939
|
+
getDetailedPageInfo(pdfPath: string, pageNumber: number): Promise<{
|
|
940
|
+
pageNumber: number;
|
|
941
|
+
text: string;
|
|
942
|
+
textItems: Array<{
|
|
943
|
+
text: string;
|
|
944
|
+
x: number;
|
|
945
|
+
y: number;
|
|
946
|
+
width: number;
|
|
947
|
+
height: number;
|
|
948
|
+
fontName?: string;
|
|
949
|
+
fontSize?: number;
|
|
950
|
+
}>;
|
|
951
|
+
dimensions: {
|
|
952
|
+
width: number;
|
|
953
|
+
height: number;
|
|
954
|
+
};
|
|
955
|
+
} | null>;
|
|
956
|
+
/**
|
|
957
|
+
* Count words in text
|
|
958
|
+
*/
|
|
959
|
+
private countWords;
|
|
960
|
+
/**
|
|
961
|
+
* Process single page (for streaming/batch processing)
|
|
962
|
+
*/
|
|
963
|
+
processSinglePage(pdfPath: string, pageNumber: number): Promise<PageData | null>;
|
|
964
|
+
}
|
|
965
|
+
|
|
457
966
|
/**
|
|
458
967
|
* Image extraction from PDF files using pdf-lib (clean implementation based on NestJS)
|
|
459
968
|
*
|
|
@@ -530,6 +1039,381 @@ declare class ImageExtractor {
|
|
|
530
1039
|
private createPngFromPdfMetadata;
|
|
531
1040
|
}
|
|
532
1041
|
|
|
1042
|
+
/**
|
|
1043
|
+
* Types for PDF page to image conversion
|
|
1044
|
+
*/
|
|
1045
|
+
/**
|
|
1046
|
+
* Image format for page conversion
|
|
1047
|
+
*/
|
|
1048
|
+
type PageImageFormat = "png" | "jpg" | "jpeg" | "webp";
|
|
1049
|
+
/**
|
|
1050
|
+
* Options for converting PDF pages to images
|
|
1051
|
+
*/
|
|
1052
|
+
interface PageToImageOptions {
|
|
1053
|
+
/**
|
|
1054
|
+
* Output directory for image files
|
|
1055
|
+
* @default './page-images'
|
|
1056
|
+
*/
|
|
1057
|
+
outputDir?: string;
|
|
1058
|
+
/**
|
|
1059
|
+
* Image format
|
|
1060
|
+
* @default 'png'
|
|
1061
|
+
*/
|
|
1062
|
+
format?: PageImageFormat;
|
|
1063
|
+
/**
|
|
1064
|
+
* JPEG quality (1-100, only for JPG format)
|
|
1065
|
+
* @default 90
|
|
1066
|
+
*/
|
|
1067
|
+
quality?: number;
|
|
1068
|
+
/**
|
|
1069
|
+
* DPI (dots per inch) for rendering
|
|
1070
|
+
* Higher DPI = better quality but larger files
|
|
1071
|
+
* @default 72
|
|
1072
|
+
*/
|
|
1073
|
+
dpi?: number;
|
|
1074
|
+
/**
|
|
1075
|
+
* Scale factor (multiplier for dimensions)
|
|
1076
|
+
* @default 1
|
|
1077
|
+
*/
|
|
1078
|
+
scale?: number;
|
|
1079
|
+
/**
|
|
1080
|
+
* Specific pages to convert (1-based)
|
|
1081
|
+
* If not provided, converts all pages
|
|
1082
|
+
* @example [1, 3, 5]
|
|
1083
|
+
*/
|
|
1084
|
+
pages?: number[];
|
|
1085
|
+
/**
|
|
1086
|
+
* Page range to convert (e.g., "1-5", "1,3,5-10")
|
|
1087
|
+
* If not provided, converts all pages
|
|
1088
|
+
* @example "1-5"
|
|
1089
|
+
*/
|
|
1090
|
+
pageRange?: string;
|
|
1091
|
+
/**
|
|
1092
|
+
* Filename pattern for output files
|
|
1093
|
+
* Available placeholders: {page}, {total}, {name}
|
|
1094
|
+
* @default 'page-{page}.{ext}'
|
|
1095
|
+
*/
|
|
1096
|
+
filenamePattern?: string;
|
|
1097
|
+
/**
|
|
1098
|
+
* Background color for transparent PDFs
|
|
1099
|
+
* @default '#FFFFFF'
|
|
1100
|
+
*/
|
|
1101
|
+
backgroundColor?: string;
|
|
1102
|
+
/**
|
|
1103
|
+
* Enable transparent background (PNG only)
|
|
1104
|
+
* @default false
|
|
1105
|
+
*/
|
|
1106
|
+
transparent?: boolean;
|
|
1107
|
+
/**
|
|
1108
|
+
* Crop to content (remove white margins)
|
|
1109
|
+
* @default false
|
|
1110
|
+
*/
|
|
1111
|
+
cropToContent?: boolean;
|
|
1112
|
+
/**
|
|
1113
|
+
* Progress callback
|
|
1114
|
+
*/
|
|
1115
|
+
onProgress?: (current: number, total: number, percentage: number) => void;
|
|
1116
|
+
/**
|
|
1117
|
+
* Callback when a page is converted
|
|
1118
|
+
*/
|
|
1119
|
+
onPageComplete?: (pageNumber: number, filepath: string) => void;
|
|
1120
|
+
/**
|
|
1121
|
+
* Verbose logging
|
|
1122
|
+
* @default false
|
|
1123
|
+
*/
|
|
1124
|
+
verbose?: boolean;
|
|
1125
|
+
}
|
|
1126
|
+
/**
|
|
1127
|
+
* Result of page to image conversion
|
|
1128
|
+
*/
|
|
1129
|
+
interface PageImageResult {
|
|
1130
|
+
/**
|
|
1131
|
+
* Page number (1-based)
|
|
1132
|
+
*/
|
|
1133
|
+
page: number;
|
|
1134
|
+
/**
|
|
1135
|
+
* Output file path
|
|
1136
|
+
*/
|
|
1137
|
+
filepath: string;
|
|
1138
|
+
/**
|
|
1139
|
+
* Image width in pixels
|
|
1140
|
+
*/
|
|
1141
|
+
width: number;
|
|
1142
|
+
/**
|
|
1143
|
+
* Image height in pixels
|
|
1144
|
+
*/
|
|
1145
|
+
height: number;
|
|
1146
|
+
/**
|
|
1147
|
+
* File size in bytes
|
|
1148
|
+
*/
|
|
1149
|
+
fileSize: number;
|
|
1150
|
+
/**
|
|
1151
|
+
* Image format
|
|
1152
|
+
*/
|
|
1153
|
+
format: PageImageFormat;
|
|
1154
|
+
}
|
|
1155
|
+
/**
|
|
1156
|
+
* Result of converting all pages
|
|
1157
|
+
*/
|
|
1158
|
+
interface PageToImageResult {
|
|
1159
|
+
/**
|
|
1160
|
+
* Array of converted page images
|
|
1161
|
+
*/
|
|
1162
|
+
images: PageImageResult[];
|
|
1163
|
+
/**
|
|
1164
|
+
* Total number of pages converted
|
|
1165
|
+
*/
|
|
1166
|
+
totalPages: number;
|
|
1167
|
+
/**
|
|
1168
|
+
* Output directory
|
|
1169
|
+
*/
|
|
1170
|
+
outputDir: string;
|
|
1171
|
+
/**
|
|
1172
|
+
* Total size of all images in bytes
|
|
1173
|
+
*/
|
|
1174
|
+
totalSize: number;
|
|
1175
|
+
}
|
|
1176
|
+
/**
|
|
1177
|
+
* Options for converting a single page
|
|
1178
|
+
*/
|
|
1179
|
+
interface SinglePageOptions {
|
|
1180
|
+
/**
|
|
1181
|
+
* Image format
|
|
1182
|
+
* @default 'png'
|
|
1183
|
+
*/
|
|
1184
|
+
format?: PageImageFormat;
|
|
1185
|
+
/**
|
|
1186
|
+
* JPEG quality (1-100)
|
|
1187
|
+
* @default 90
|
|
1188
|
+
*/
|
|
1189
|
+
quality?: number;
|
|
1190
|
+
/**
|
|
1191
|
+
* DPI for rendering
|
|
1192
|
+
* @default 72
|
|
1193
|
+
*/
|
|
1194
|
+
dpi?: number;
|
|
1195
|
+
/**
|
|
1196
|
+
* Scale factor
|
|
1197
|
+
* @default 1
|
|
1198
|
+
*/
|
|
1199
|
+
scale?: number;
|
|
1200
|
+
/**
|
|
1201
|
+
* Background color
|
|
1202
|
+
* @default '#FFFFFF'
|
|
1203
|
+
*/
|
|
1204
|
+
backgroundColor?: string;
|
|
1205
|
+
/**
|
|
1206
|
+
* Transparent background (PNG only)
|
|
1207
|
+
* @default false
|
|
1208
|
+
*/
|
|
1209
|
+
transparent?: boolean;
|
|
1210
|
+
}
|
|
1211
|
+
/**
|
|
1212
|
+
* Thumbnail generation options
|
|
1213
|
+
*/
|
|
1214
|
+
interface ThumbnailOptions extends SinglePageOptions {
|
|
1215
|
+
/**
|
|
1216
|
+
* Maximum width in pixels
|
|
1217
|
+
* @default 200
|
|
1218
|
+
*/
|
|
1219
|
+
maxWidth?: number;
|
|
1220
|
+
/**
|
|
1221
|
+
* Maximum height in pixels
|
|
1222
|
+
* @default 200
|
|
1223
|
+
*/
|
|
1224
|
+
maxHeight?: number;
|
|
1225
|
+
/**
|
|
1226
|
+
* Maintain aspect ratio
|
|
1227
|
+
* @default true
|
|
1228
|
+
*/
|
|
1229
|
+
maintainAspectRatio?: boolean;
|
|
1230
|
+
}
|
|
1231
|
+
|
|
1232
|
+
/**
|
|
1233
|
+
* PDF Page to Image Converter using pdf.js
|
|
1234
|
+
*
|
|
1235
|
+
* Converts PDF pages to image files (PNG, JPG, WebP) with customizable options.
|
|
1236
|
+
* Uses Mozilla's pdf.js for high-quality rendering without external dependencies.
|
|
1237
|
+
*/
|
|
1238
|
+
|
|
1239
|
+
/**
|
|
1240
|
+
* Page to Image Converter
|
|
1241
|
+
*
|
|
1242
|
+
* @example
|
|
1243
|
+
* ```typescript
|
|
1244
|
+
* const converter = new PageToImageConverter();
|
|
1245
|
+
* const result = await converter.convertToImages('document.pdf', {
|
|
1246
|
+
* outputDir: './pages',
|
|
1247
|
+
* format: 'png',
|
|
1248
|
+
* dpi: 150
|
|
1249
|
+
* });
|
|
1250
|
+
* ```
|
|
1251
|
+
*/
|
|
1252
|
+
declare class PageToImageConverter {
|
|
1253
|
+
private pdfjs;
|
|
1254
|
+
/**
|
|
1255
|
+
* Get or load pdf.js module with proper worker configuration
|
|
1256
|
+
* Based on pdf-to-img library approach
|
|
1257
|
+
*/
|
|
1258
|
+
private getPdfjs;
|
|
1259
|
+
/**
|
|
1260
|
+
* Convert all pages of a PDF to images
|
|
1261
|
+
*
|
|
1262
|
+
* @param pdfPath - Path to PDF file
|
|
1263
|
+
* @param options - Conversion options
|
|
1264
|
+
* @returns Conversion result with image paths
|
|
1265
|
+
*/
|
|
1266
|
+
convertToImages(pdfPath: string, options?: PageToImageOptions): Promise<PageToImageResult>;
|
|
1267
|
+
/**
|
|
1268
|
+
* Convert a single page to an image file
|
|
1269
|
+
*
|
|
1270
|
+
* @param pdfPath - Path to PDF file
|
|
1271
|
+
* @param pageNumber - Page number (1-based)
|
|
1272
|
+
* @param outputPath - Output file path
|
|
1273
|
+
* @param options - Conversion options
|
|
1274
|
+
*/
|
|
1275
|
+
convertPage(pdfPath: string, pageNumber: number, outputPath: string, options?: SinglePageOptions): Promise<PageImageResult>;
|
|
1276
|
+
/**
|
|
1277
|
+
* Convert a page to a buffer (no file write)
|
|
1278
|
+
*
|
|
1279
|
+
* @param pdfPath - Path to PDF file
|
|
1280
|
+
* @param pageNumber - Page number (1-based)
|
|
1281
|
+
* @param options - Conversion options
|
|
1282
|
+
* @returns Image buffer
|
|
1283
|
+
*/
|
|
1284
|
+
convertPageToBuffer(pdfPath: string, pageNumber: number, options?: SinglePageOptions): Promise<Buffer>;
|
|
1285
|
+
/**
|
|
1286
|
+
* Convert a page to base64 string
|
|
1287
|
+
*
|
|
1288
|
+
* @param pdfPath - Path to PDF file
|
|
1289
|
+
* @param pageNumber - Page number (1-based)
|
|
1290
|
+
* @param options - Conversion options
|
|
1291
|
+
* @returns Base64 encoded image
|
|
1292
|
+
*/
|
|
1293
|
+
convertPageToBase64(pdfPath: string, pageNumber: number, options?: SinglePageOptions): Promise<string>;
|
|
1294
|
+
/**
|
|
1295
|
+
* Generate thumbnails for all pages
|
|
1296
|
+
*
|
|
1297
|
+
* @param pdfPath - Path to PDF file
|
|
1298
|
+
* @param options - Thumbnail options
|
|
1299
|
+
* @returns Conversion result
|
|
1300
|
+
*/
|
|
1301
|
+
generateThumbnails(pdfPath: string, options?: ThumbnailOptions & {
|
|
1302
|
+
outputDir?: string;
|
|
1303
|
+
}): Promise<PageToImageResult>;
|
|
1304
|
+
/**
|
|
1305
|
+
* Render a PDF page to image buffer
|
|
1306
|
+
*
|
|
1307
|
+
* Based on pdf-to-img library approach - let pdf.js handle canvas creation
|
|
1308
|
+
* @see https://github.com/k-yle/pdf-to-img
|
|
1309
|
+
*/
|
|
1310
|
+
private renderPageToBuffer;
|
|
1311
|
+
/**
|
|
1312
|
+
* Convert canvas to image buffer
|
|
1313
|
+
*/
|
|
1314
|
+
private canvasToBuffer;
|
|
1315
|
+
/**
|
|
1316
|
+
* Get page numbers to convert based on options
|
|
1317
|
+
*/
|
|
1318
|
+
private getPageNumbers;
|
|
1319
|
+
/**
|
|
1320
|
+
* Parse page range string (e.g., "1-5", "1,3,5-10")
|
|
1321
|
+
*/
|
|
1322
|
+
private parsePageRange;
|
|
1323
|
+
/**
|
|
1324
|
+
* Generate filename from pattern
|
|
1325
|
+
*/
|
|
1326
|
+
private generateFilename;
|
|
1327
|
+
/**
|
|
1328
|
+
* Format bytes to human-readable string
|
|
1329
|
+
*/
|
|
1330
|
+
private formatBytes;
|
|
1331
|
+
}
|
|
1332
|
+
|
|
1333
|
+
/**
|
|
1334
|
+
* Result of image optimization
|
|
1335
|
+
*/
|
|
1336
|
+
interface OptimizationResult {
|
|
1337
|
+
success: boolean;
|
|
1338
|
+
originalSize: number;
|
|
1339
|
+
optimizedSize: number;
|
|
1340
|
+
savedBytes: number;
|
|
1341
|
+
savedPercent: number;
|
|
1342
|
+
engine: "jimp" | "sharp" | "none";
|
|
1343
|
+
error?: string;
|
|
1344
|
+
}
|
|
1345
|
+
/**
|
|
1346
|
+
* Options for image optimization
|
|
1347
|
+
*/
|
|
1348
|
+
interface OptimizationOptions {
|
|
1349
|
+
quality?: number;
|
|
1350
|
+
verbose?: boolean;
|
|
1351
|
+
useSharp?: boolean;
|
|
1352
|
+
}
|
|
1353
|
+
/**
|
|
1354
|
+
* Image optimizer using Jimp (pure JavaScript), with optional Sharp support
|
|
1355
|
+
*
|
|
1356
|
+
* This class provides image optimization capabilities using Jimp, a pure JavaScript
|
|
1357
|
+
* image processing library with no native dependencies. It supports JPEG and PNG
|
|
1358
|
+
* optimization with quality control.
|
|
1359
|
+
*
|
|
1360
|
+
* @example
|
|
1361
|
+
* ```typescript
|
|
1362
|
+
* const result = await ImageOptimizer.optimizeFile('image.jpg', {
|
|
1363
|
+
* useSharp: false, // set to true to use the optional Sharp engine
|
|
1364
|
+
* quality: 80
|
|
1365
|
+
* });
|
|
1366
|
+
*
|
|
1367
|
+
* console.log(`Saved ${result.savedPercent.toFixed(1)}% using ${result.engine}`);
|
|
1368
|
+
* ```
|
|
1369
|
+
*/
|
|
1370
|
+
declare class ImageOptimizer {
|
|
1371
|
+
/**
|
|
1372
|
+
* Optimize an image file in-place
|
|
1373
|
+
*
|
|
1374
|
+
* The original file will be replaced with the optimized version.
|
|
1375
|
+
* If optimization fails, the original file remains unchanged.
|
|
1376
|
+
*
|
|
1377
|
+
* @param filePath - Path to the image file to optimize
|
|
1378
|
+
* @param options - Optimization options
|
|
1379
|
+
* @returns Promise resolving to optimization result
|
|
1380
|
+
*/
|
|
1381
|
+
static optimizeFile(filePath: string, options?: OptimizationOptions): Promise<OptimizationResult>;
|
|
1382
|
+
/**
|
|
1383
|
+
* Optimize using Sharp (optional dependency)
|
|
1384
|
+
*/
|
|
1385
|
+
private static optimizeWithSharp;
|
|
1386
|
+
/**
|
|
1387
|
+
* Optimize using Jimp (pure JavaScript)
|
|
1388
|
+
*/
|
|
1389
|
+
private static optimizeWithJimp;
|
|
1390
|
+
/**
|
|
1391
|
+
* Convert JPEG 2000 formats (jp2, jpx, j2c, jpm) to JPG
|
|
1392
|
+
*
|
|
1393
|
+
* JPEG 2000 files are not widely supported by browsers and image tools.
|
|
1394
|
+
* This method converts them to standard JPG format for better compatibility.
|
|
1395
|
+
*
|
|
1396
|
+
* Supports two conversion engines:
|
|
1397
|
+
* - Jimp (default): Pure JavaScript, works everywhere
|
|
1398
|
+
* - Sharp (optional): Better color preservation, requires native compilation
|
|
1399
|
+
*
|
|
1400
|
+
* @param jp2Path - Path to the JPEG 2000 file (jp2, jpx, j2c, or jpm)
|
|
1401
|
+
* @param options - Conversion options
|
|
1402
|
+
* @returns Promise resolving to conversion result with new file path
|
|
1403
|
+
*/
|
|
1404
|
+
static convertJp2ToJpg(jp2Path: string, options?: {
|
|
1405
|
+
quality?: number;
|
|
1406
|
+
verbose?: boolean;
|
|
1407
|
+
useSharp?: boolean;
|
|
1408
|
+
}): Promise<{
|
|
1409
|
+
success: boolean;
|
|
1410
|
+
newPath?: string;
|
|
1411
|
+
originalSize?: number;
|
|
1412
|
+
newSize?: number;
|
|
1413
|
+
error?: string;
|
|
1414
|
+
}>;
|
|
1415
|
+
}
|
|
1416
|
+
|
|
533
1417
|
/**
|
|
534
1418
|
* Handles formatting of image references and text processing
|
|
535
1419
|
*/
|
|
@@ -597,12 +1481,23 @@ declare function validateImageRefFormat(format: string): ValidationError[];
|
|
|
597
1481
|
*/
|
|
598
1482
|
declare function validateFilePath(filePath: string, extensions?: string[]): ValidationError[];
|
|
599
1483
|
|
|
1484
|
+
/**
|
|
1485
|
+
* pdf-plus - A comprehensive PDF content extraction library
|
|
1486
|
+
*
|
|
1487
|
+
* Main entry point for the PDF content extraction library.
|
|
1488
|
+
* Provides both high-level convenience functions and low-level access to extractors.
|
|
1489
|
+
*
|
|
1490
|
+
* @packageDocumentation
|
|
1491
|
+
*/
|
|
1492
|
+
|
|
600
1493
|
/**
|
|
601
1494
|
* Extract content from a PDF file (convenience function)
|
|
602
1495
|
*
|
|
1496
|
+
* Automatically switches to streaming mode for large PDFs if `autoStreamThreshold` is set.
|
|
1497
|
+
*
|
|
603
1498
|
* @param pdfPath - Path to the PDF file
|
|
604
1499
|
* @param options - Extraction options
|
|
605
|
-
* @returns Promise resolving to extraction result
|
|
1500
|
+
* @returns Promise resolving to an ExtractionResult, or to a StreamingExtractionResult when auto-streaming is triggered via `autoStreamThreshold`
|
|
606
1501
|
*
|
|
607
1502
|
* @example
|
|
608
1503
|
* ```typescript
|
|
@@ -616,8 +1511,17 @@ declare function validateFilePath(filePath: string, extensions?: string[]): Vali
|
|
|
616
1511
|
*
|
|
617
1512
|
* console.log(`Extracted ${result.images.length} images from ${result.document.pages} pages`);
|
|
618
1513
|
* ```
|
|
1514
|
+
*
|
|
1515
|
+
* @example
|
|
1516
|
+
* ```typescript
|
|
1517
|
+
* // Auto-streaming for large PDFs
|
|
1518
|
+
* const result = await extractPdfContent('large-document.pdf', {
|
|
1519
|
+
* extractImageFiles: true,
|
|
1520
|
+
* autoStreamThreshold: 100, // Auto-stream if > 100 pages
|
|
1521
|
+
* });
|
|
1522
|
+
* ```
|
|
619
1523
|
*/
|
|
620
|
-
declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult>;
|
|
1524
|
+
declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult | StreamingExtractionResult>;
|
|
621
1525
|
/**
|
|
622
1526
|
* Extract only text content from a PDF (convenience function)
|
|
623
1527
|
*
|
|
@@ -674,22 +1578,70 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
|
|
|
674
1578
|
* ```
|
|
675
1579
|
*/
|
|
676
1580
|
declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
677
|
-
|
|
678
|
-
|
|
1581
|
+
/**
|
|
1582
|
+
* Extract PDF content in streaming mode (added in 1.0.3)
|
|
1583
|
+
*
|
|
1584
|
+
* For large PDFs, this provides a streaming API that processes pages one at a time,
|
|
1585
|
+
* reducing memory usage and providing real-time progress updates.
|
|
1586
|
+
*
|
|
1587
|
+
* @param pdfPath - Path to the PDF file
|
|
1588
|
+
* @param options - Extraction and streaming options
|
|
1589
|
+
* @returns StreamingExtractionResult with async iterator and event callbacks
|
|
1590
|
+
*
|
|
1591
|
+
* @example
|
|
1592
|
+
* ```typescript
|
|
1593
|
+
* // Using async iterator
|
|
1594
|
+
* const stream = extractPdfStream('large-document.pdf', {
|
|
1595
|
+
* extractImageFiles: true,
|
|
1596
|
+
* imageOutputDir: './images',
|
|
1597
|
+
* streamMode: true
|
|
1598
|
+
* });
|
|
1599
|
+
*
|
|
1600
|
+
* for await (const event of stream) {
|
|
1601
|
+
* if (event.type === 'page') {
|
|
1602
|
+
* console.log(`Processed page ${event.pageNumber}/${event.totalPages}`);
|
|
1603
|
+
* } else if (event.type === 'progress') {
|
|
1604
|
+
* console.log(`Progress: ${event.percentComplete.toFixed(1)}%`);
|
|
1605
|
+
* }
|
|
1606
|
+
* }
|
|
1607
|
+
*
|
|
1608
|
+
* // Using event callbacks
|
|
1609
|
+
* const stream = extractPdfStream('large-document.pdf', { streamMode: true })
|
|
1610
|
+
* .on('page', (event) => console.log(`Page ${event.pageNumber} done`))
|
|
1611
|
+
* .on('progress', (event) => console.log(`${event.percentComplete}% complete`))
|
|
1612
|
+
* .on('complete', (event) => console.log(`Done! ${event.totalImages} images`));
|
|
1613
|
+
*
|
|
1614
|
+
* for await (const event of stream) {
|
|
1615
|
+
* // Events are also available via iterator
|
|
1616
|
+
* }
|
|
1617
|
+
* ```
|
|
1618
|
+
*/
|
|
1619
|
+
declare function extractPdfStream(pdfPath: string, options?: Partial<ExtractionOptions>): StreamingExtractionResult;
|
|
1620
|
+
/**
|
|
1621
|
+
* Library version
|
|
1622
|
+
*/
|
|
1623
|
+
declare const version = "1.0.3";
|
|
1624
|
+
/**
|
|
1625
|
+
* Default export containing all public APIs
|
|
1626
|
+
* Useful for CommonJS: const pdfPlus = require('pdf-plus');
|
|
1627
|
+
*/
|
|
679
1628
|
declare const _default: {
|
|
680
1629
|
PDFExtractor: typeof PDFExtractor;
|
|
681
1630
|
pdfExtractor: PDFExtractor;
|
|
1631
|
+
StreamingPDFExtractor: typeof StreamingPDFExtractor;
|
|
682
1632
|
TextExtractor: typeof TextExtractor;
|
|
683
1633
|
ImageExtractor: typeof ImageExtractor;
|
|
1634
|
+
ImageOptimizer: typeof ImageOptimizer;
|
|
684
1635
|
FormatProcessor: typeof FormatProcessor;
|
|
685
1636
|
extractPdfContent: typeof extractPdfContent;
|
|
686
1637
|
extractText: typeof extractText;
|
|
687
1638
|
extractImages: typeof extractImages;
|
|
688
1639
|
extractImageFiles: typeof extractImageFiles;
|
|
1640
|
+
extractPdfStream: typeof extractPdfStream;
|
|
689
1641
|
validateConfig: typeof validateConfig;
|
|
690
1642
|
validateImageRefFormat: typeof validateImageRefFormat;
|
|
691
1643
|
validateFilePath: typeof validateFilePath;
|
|
692
1644
|
version: string;
|
|
693
1645
|
};
|
|
694
1646
|
|
|
695
|
-
export { type AnalyticsData, type DocumentMetadata, type DocumentSummary, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type
|
|
1647
|
+
export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
|