pdf-plus 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +303 -2
- package/dist/index.d.mts +974 -21
- package/dist/index.d.ts +974 -21
- package/dist/index.js +36 -35
- package/dist/index.mjs +36 -35
- package/dist/workers/image-decoder.worker.d.mts +2 -0
- package/dist/workers/image-decoder.worker.d.ts +2 -0
- package/dist/workers/image-decoder.worker.js +2 -0
- package/dist/workers/image-decoder.worker.mjs +2 -0
- package/dist/workers/jp2-converter.worker.d.mts +2 -0
- package/dist/workers/jp2-converter.worker.d.ts +2 -0
- package/dist/workers/jp2-converter.worker.js +2 -0
- package/dist/workers/jp2-converter.worker.mjs +2 -0
- package/package.json +22 -7
package/dist/index.d.mts
CHANGED
|
@@ -1,3 +1,202 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Types for streaming PDF extraction
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Event types emitted during streaming extraction
|
|
7
|
+
*/
|
|
8
|
+
type StreamEventType = "start" | "page" | "image" | "progress" | "complete" | "error";
|
|
9
|
+
/**
|
|
10
|
+
* Base event structure
|
|
11
|
+
*/
|
|
12
|
+
interface StreamEvent {
|
|
13
|
+
type: StreamEventType;
|
|
14
|
+
timestamp: number;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Start event - emitted when extraction begins
|
|
18
|
+
*/
|
|
19
|
+
interface StartEvent extends StreamEvent {
|
|
20
|
+
type: "start";
|
|
21
|
+
totalPages: number;
|
|
22
|
+
pdfPath: string;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Page event - emitted when a page is processed
|
|
26
|
+
*/
|
|
27
|
+
interface PageEvent extends StreamEvent {
|
|
28
|
+
type: "page";
|
|
29
|
+
pageNumber: number;
|
|
30
|
+
totalPages: number;
|
|
31
|
+
textLength: number;
|
|
32
|
+
imageCount: number;
|
|
33
|
+
pageInfo?: PageInfo;
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* Image event - emitted when an image is extracted
|
|
37
|
+
*/
|
|
38
|
+
interface ImageEvent extends StreamEvent {
|
|
39
|
+
type: "image";
|
|
40
|
+
image: ImageItem;
|
|
41
|
+
pageNumber: number;
|
|
42
|
+
imageIndex: number;
|
|
43
|
+
totalImages: number;
|
|
44
|
+
}
|
|
45
|
+
/**
|
|
46
|
+
* Progress event - emitted periodically during extraction
|
|
47
|
+
*/
|
|
48
|
+
interface ProgressEvent extends StreamEvent {
|
|
49
|
+
type: "progress";
|
|
50
|
+
pagesProcessed: number;
|
|
51
|
+
totalPages: number;
|
|
52
|
+
imagesExtracted: number;
|
|
53
|
+
percentComplete: number;
|
|
54
|
+
estimatedTimeRemaining?: number;
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Complete event - emitted when extraction finishes
|
|
58
|
+
*/
|
|
59
|
+
interface CompleteEvent extends StreamEvent {
|
|
60
|
+
type: "complete";
|
|
61
|
+
totalPages: number;
|
|
62
|
+
totalImages: number;
|
|
63
|
+
totalTextLength: number;
|
|
64
|
+
duration: number;
|
|
65
|
+
}
|
|
66
|
+
/**
|
|
67
|
+
* Error event - emitted when an error occurs
|
|
68
|
+
*/
|
|
69
|
+
interface ErrorEvent extends StreamEvent {
|
|
70
|
+
type: "error";
|
|
71
|
+
error: Error;
|
|
72
|
+
pageNumber?: number;
|
|
73
|
+
recoverable: boolean;
|
|
74
|
+
}
|
|
75
|
+
/**
|
|
76
|
+
* Union type of all stream events
|
|
77
|
+
*/
|
|
78
|
+
type StreamEventUnion = StartEvent | PageEvent | ImageEvent | ProgressEvent | CompleteEvent | ErrorEvent;
|
|
79
|
+
/**
|
|
80
|
+
* Streaming extraction options
|
|
81
|
+
*/
|
|
82
|
+
interface StreamingOptions$1 {
|
|
83
|
+
/**
|
|
84
|
+
* Enable streaming mode
|
|
85
|
+
* @default false
|
|
86
|
+
*/
|
|
87
|
+
streamMode?: boolean;
|
|
88
|
+
/**
|
|
89
|
+
* Automatically enable streaming for PDFs with more than this many pages
|
|
90
|
+
* @default 100
|
|
91
|
+
*/
|
|
92
|
+
autoStreamThreshold?: number;
|
|
93
|
+
/**
|
|
94
|
+
* Enable backpressure handling (pause extraction if consumer is slow)
|
|
95
|
+
* @default true
|
|
96
|
+
*/
|
|
97
|
+
enableBackpressure?: boolean;
|
|
98
|
+
/**
|
|
99
|
+
* Maximum number of pages to buffer before pausing (backpressure)
|
|
100
|
+
* @default 10
|
|
101
|
+
*/
|
|
102
|
+
maxBufferedPages?: number;
|
|
103
|
+
/**
|
|
104
|
+
* Emit progress events every N pages
|
|
105
|
+
* @default 5
|
|
106
|
+
*/
|
|
107
|
+
progressInterval?: number;
|
|
108
|
+
/**
|
|
109
|
+
* Enable event callbacks (in addition to async iterator)
|
|
110
|
+
* @default false
|
|
111
|
+
*/
|
|
112
|
+
enableEventCallbacks?: boolean;
|
|
113
|
+
}
|
|
114
|
+
/**
|
|
115
|
+
* Event callback function type
|
|
116
|
+
*/
|
|
117
|
+
type StreamEventCallback = (event: StreamEventUnion) => void | Promise<void>;
|
|
118
|
+
/**
|
|
119
|
+
* Event callbacks map
|
|
120
|
+
*/
|
|
121
|
+
interface StreamEventCallbacks {
|
|
122
|
+
onStart?: (event: StartEvent) => void | Promise<void>;
|
|
123
|
+
onPage?: (event: PageEvent) => void | Promise<void>;
|
|
124
|
+
onImage?: (event: ImageEvent) => void | Promise<void>;
|
|
125
|
+
onProgress?: (event: ProgressEvent) => void | Promise<void>;
|
|
126
|
+
onComplete?: (event: CompleteEvent) => void | Promise<void>;
|
|
127
|
+
onError?: (event: ErrorEvent) => void | Promise<void>;
|
|
128
|
+
onAny?: StreamEventCallback;
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Streaming extraction result (async iterator)
|
|
132
|
+
*/
|
|
133
|
+
interface StreamingExtractionResult {
|
|
134
|
+
/**
|
|
135
|
+
* Async iterator for streaming events
|
|
136
|
+
*/
|
|
137
|
+
[Symbol.asyncIterator](): AsyncIterator<StreamEventUnion>;
|
|
138
|
+
/**
|
|
139
|
+
* Register event callbacks
|
|
140
|
+
*/
|
|
141
|
+
on(event: "start", callback: (event: StartEvent) => void | Promise<void>): this;
|
|
142
|
+
on(event: "page", callback: (event: PageEvent) => void | Promise<void>): this;
|
|
143
|
+
on(event: "image", callback: (event: ImageEvent) => void | Promise<void>): this;
|
|
144
|
+
on(event: "progress", callback: (event: ProgressEvent) => void | Promise<void>): this;
|
|
145
|
+
on(event: "complete", callback: (event: CompleteEvent) => void | Promise<void>): this;
|
|
146
|
+
on(event: "error", callback: (event: ErrorEvent) => void | Promise<void>): this;
|
|
147
|
+
on(event: "any", callback: StreamEventCallback): this;
|
|
148
|
+
/**
|
|
149
|
+
* Cancel the streaming extraction
|
|
150
|
+
*/
|
|
151
|
+
cancel(): Promise<void>;
|
|
152
|
+
/**
|
|
153
|
+
* Pause the streaming extraction (backpressure)
|
|
154
|
+
*/
|
|
155
|
+
pause(): void;
|
|
156
|
+
/**
|
|
157
|
+
* Resume the streaming extraction
|
|
158
|
+
*/
|
|
159
|
+
resume(): void;
|
|
160
|
+
/**
|
|
161
|
+
* Get current streaming statistics
|
|
162
|
+
*/
|
|
163
|
+
getStats(): StreamingStats;
|
|
164
|
+
}
|
|
165
|
+
/**
|
|
166
|
+
* Streaming statistics
|
|
167
|
+
*/
|
|
168
|
+
interface StreamingStats {
|
|
169
|
+
pagesProcessed: number;
|
|
170
|
+
totalPages: number;
|
|
171
|
+
imagesExtracted: number;
|
|
172
|
+
bytesProcessed: number;
|
|
173
|
+
startTime: number;
|
|
174
|
+
elapsedTime: number;
|
|
175
|
+
isPaused: boolean;
|
|
176
|
+
isCancelled: boolean;
|
|
177
|
+
isComplete: boolean;
|
|
178
|
+
averagePageTime: number;
|
|
179
|
+
estimatedTimeRemaining: number;
|
|
180
|
+
}
|
|
181
|
+
/**
|
|
182
|
+
* Internal streaming state
|
|
183
|
+
*/
|
|
184
|
+
interface StreamingState {
|
|
185
|
+
totalPages: number;
|
|
186
|
+
pagesProcessed: number;
|
|
187
|
+
imagesExtracted: number;
|
|
188
|
+
totalTextLength: number;
|
|
189
|
+
bytesProcessed: number;
|
|
190
|
+
startTime: number;
|
|
191
|
+
lastProgressTime: number;
|
|
192
|
+
isPaused: boolean;
|
|
193
|
+
isCancelled: boolean;
|
|
194
|
+
isComplete: boolean;
|
|
195
|
+
bufferedPages: number;
|
|
196
|
+
eventQueue: StreamEventUnion[];
|
|
197
|
+
callbacks: StreamEventCallbacks;
|
|
198
|
+
}
|
|
199
|
+
|
|
1
200
|
/**
|
|
2
201
|
* Core types for PDF content extraction
|
|
3
202
|
*/
|
|
@@ -63,6 +262,7 @@ interface ExtractionResult {
|
|
|
63
262
|
pages: PageInfo[];
|
|
64
263
|
images: ImageItem[];
|
|
65
264
|
textItems: TextItem[];
|
|
265
|
+
text: string;
|
|
66
266
|
textWithRefs: string;
|
|
67
267
|
cleanText: string;
|
|
68
268
|
summary?: DocumentSummary;
|
|
@@ -85,9 +285,9 @@ interface StructuredPageData {
|
|
|
85
285
|
totalImages: number;
|
|
86
286
|
extractionOptions: ExtractionOptions;
|
|
87
287
|
};
|
|
88
|
-
pages: PageData[];
|
|
288
|
+
pages: PageData$1[];
|
|
89
289
|
}
|
|
90
|
-
interface PageData {
|
|
290
|
+
interface PageData$1 {
|
|
91
291
|
pageNumber: number;
|
|
92
292
|
text: {
|
|
93
293
|
content: string;
|
|
@@ -97,6 +297,32 @@ interface PageData {
|
|
|
97
297
|
};
|
|
98
298
|
images: PageImageData[];
|
|
99
299
|
imageCount: number;
|
|
300
|
+
pageImage?: {
|
|
301
|
+
path: string;
|
|
302
|
+
format: string;
|
|
303
|
+
width: number;
|
|
304
|
+
height: number;
|
|
305
|
+
size: number;
|
|
306
|
+
dpi?: number;
|
|
307
|
+
quality?: number;
|
|
308
|
+
};
|
|
309
|
+
thumbnail?: {
|
|
310
|
+
path: string;
|
|
311
|
+
format: string;
|
|
312
|
+
width: number;
|
|
313
|
+
height: number;
|
|
314
|
+
size: number;
|
|
315
|
+
quality?: number;
|
|
316
|
+
};
|
|
317
|
+
pageImageVariants?: Array<{
|
|
318
|
+
path: string;
|
|
319
|
+
format: string;
|
|
320
|
+
width: number;
|
|
321
|
+
height: number;
|
|
322
|
+
size: number;
|
|
323
|
+
quality: number;
|
|
324
|
+
dpi?: number;
|
|
325
|
+
}>;
|
|
100
326
|
}
|
|
101
327
|
interface PageImageData {
|
|
102
328
|
id: string;
|
|
@@ -111,8 +337,10 @@ interface PageImageData {
|
|
|
111
337
|
};
|
|
112
338
|
format: string;
|
|
113
339
|
size?: number;
|
|
340
|
+
width?: number;
|
|
341
|
+
height?: number;
|
|
342
|
+
mimeType?: string;
|
|
114
343
|
}
|
|
115
|
-
type ImageExtractionEngine = "pdf-lib" | "poppler" | "auto";
|
|
116
344
|
interface ExtractionOptions {
|
|
117
345
|
extractText?: boolean;
|
|
118
346
|
extractImages?: boolean;
|
|
@@ -131,14 +359,116 @@ interface ExtractionOptions {
|
|
|
131
359
|
extractTextItems?: boolean;
|
|
132
360
|
specificPages?: number[];
|
|
133
361
|
useCache?: boolean;
|
|
134
|
-
/**
|
|
135
|
-
|
|
362
|
+
/** Enable image optimization after extraction (uses Jimp - pure JavaScript, default: false) */
|
|
363
|
+
optimizeImages?: boolean;
|
|
364
|
+
/** Image quality for optimization (0-100, default: 80) */
|
|
365
|
+
imageQuality?: number;
|
|
366
|
+
/**
|
|
367
|
+
* Convert JPEG 2000 images to JPG format for better compatibility.
|
|
368
|
+
* (default: true - convert JP2 to JPG)
|
|
369
|
+
*/
|
|
370
|
+
convertJp2ToJpg?: boolean;
|
|
371
|
+
/**
|
|
372
|
+
* Preserve JPEG 2000 images in their original format.
|
|
373
|
+
* By default (false), JPEG 2000 images (jp2, jpx, j2c, jpm) are converted to JPG for better compatibility.
|
|
374
|
+
* Set to true to keep JPEG 2000 files in their original format.
|
|
375
|
+
*
|
|
376
|
+
* Note: JP2 images from PDFs are automatically decoded by PDF.js during extraction.
|
|
377
|
+
* This option only affects standalone JP2 files.
|
|
378
|
+
* (default: false - convert to JPG)
|
|
379
|
+
*/
|
|
380
|
+
preserveJp2?: boolean;
|
|
381
|
+
/**
|
|
382
|
+
* Use Sharp library for ALL image processing operations (better quality & performance).
|
|
383
|
+
*
|
|
384
|
+
* When enabled, Sharp is used as the global image processing engine for:
|
|
385
|
+
* - JP2 to JPG conversion
|
|
386
|
+
* - Image optimization
|
|
387
|
+
* - Image resizing
|
|
388
|
+
* - Format conversions
|
|
389
|
+
*
|
|
390
|
+
* Sharp is an OPTIONAL dependency. Install it for better performance:
|
|
391
|
+
* ```bash
|
|
392
|
+
* npm install sharp
|
|
393
|
+
* ```
|
|
394
|
+
*
|
|
395
|
+
* If Sharp is not installed, the library will automatically fall back to pure JavaScript (Jimp).
|
|
396
|
+
*
|
|
397
|
+
* (default: false - use pure JS Jimp)
|
|
398
|
+
*/
|
|
399
|
+
useSharp?: boolean;
|
|
400
|
+
/** Enable parallel processing for better performance (default: true) */
|
|
401
|
+
parallelProcessing?: boolean;
|
|
402
|
+
/** Maximum number of pages to process in parallel (default: 10) */
|
|
403
|
+
maxConcurrentPages?: number;
|
|
404
|
+
/** Maximum number of images per page to extract in parallel (default: 20) */
|
|
405
|
+
maxConcurrentImages?: number;
|
|
406
|
+
/** Maximum number of JP2 to JPG conversions in parallel (default: 5) */
|
|
407
|
+
maxConcurrentConversions?: number;
|
|
408
|
+
/** Maximum number of image optimizations in parallel (default: 5) */
|
|
409
|
+
maxConcurrentOptimizations?: number;
|
|
410
|
+
/** Enable worker threads for CPU-intensive operations (default: false) */
|
|
411
|
+
useWorkerThreads?: boolean;
|
|
412
|
+
/** Auto-scale workers based on system resources (default: true) */
|
|
413
|
+
autoScaleWorkers?: boolean;
|
|
414
|
+
/** Maximum number of worker threads (default: CPU cores - 1) */
|
|
415
|
+
maxWorkerThreads?: number;
|
|
416
|
+
/** Minimum number of worker threads to keep alive (default: 1) */
|
|
417
|
+
minWorkerThreads?: number;
|
|
418
|
+
/** Memory threshold (0-1) for scaling down workers (default: 0.8) */
|
|
419
|
+
memoryThreshold?: number;
|
|
420
|
+
/** CPU threshold (0-1) for scaling up workers (default: 0.9) */
|
|
421
|
+
cpuThreshold?: number;
|
|
422
|
+
/** Worker task timeout in milliseconds (default: 30000) */
|
|
423
|
+
workerTaskTimeout?: number;
|
|
424
|
+
/** Worker idle timeout in milliseconds (default: 60000) */
|
|
425
|
+
workerIdleTimeout?: number;
|
|
426
|
+
/** Memory limit per worker in MB (default: 512) */
|
|
427
|
+
workerMemoryLimit?: number;
|
|
428
|
+
/** Use workers for JP2 conversion (default: true) */
|
|
429
|
+
enableWorkerForConversion?: boolean;
|
|
430
|
+
/** Use workers for image optimization (default: true) */
|
|
431
|
+
enableWorkerForOptimization?: boolean;
|
|
432
|
+
/** Use workers for image decoding (default: true) */
|
|
433
|
+
enableWorkerForDecoding?: boolean;
|
|
434
|
+
/** Enable streaming mode for large PDFs (default: false) */
|
|
435
|
+
streamMode?: boolean;
|
|
436
|
+
/** Automatically enable streaming for PDFs with more than this many pages (default: 100) */
|
|
437
|
+
autoStreamThreshold?: number;
|
|
438
|
+
/** Enable backpressure handling (pause extraction if consumer is slow) (default: true) */
|
|
439
|
+
enableBackpressure?: boolean;
|
|
440
|
+
/** Maximum number of pages to buffer before pausing (default: 10) */
|
|
441
|
+
maxBufferedPages?: number;
|
|
442
|
+
/** Emit progress events every N pages (default: 5) */
|
|
443
|
+
progressInterval?: number;
|
|
444
|
+
/** Enable event callbacks in addition to async iterator (default: false) */
|
|
445
|
+
enableEventCallbacks?: boolean;
|
|
136
446
|
cacheDir?: string;
|
|
137
447
|
baseName?: string;
|
|
138
448
|
verbose?: boolean;
|
|
139
449
|
memoryLimit?: string;
|
|
140
450
|
batchSize?: number;
|
|
141
451
|
progressCallback?: (progress: ProgressInfo) => void;
|
|
452
|
+
/** Generate page images (default: false) */
|
|
453
|
+
generatePageImages?: boolean;
|
|
454
|
+
/** Generate thumbnails for pages (default: false) */
|
|
455
|
+
generateThumbnails?: boolean;
|
|
456
|
+
/** Include page images in structured output (default: false) */
|
|
457
|
+
includePageImagesInStructuredData?: boolean;
|
|
458
|
+
/** Page numbers to generate images for (default: all pages) */
|
|
459
|
+
pageNumbers?: number[];
|
|
460
|
+
/** Generate multiple quality variants of page images */
|
|
461
|
+
pageImageQualities?: number[];
|
|
462
|
+
/** DPI for page images (default: 150) */
|
|
463
|
+
pageImageDpi?: number;
|
|
464
|
+
/** Format for page images: 'png' | 'jpg' (default: 'png') */
|
|
465
|
+
pageImageFormat?: "png" | "jpg";
|
|
466
|
+
/** Quality for JPG page images (default: 90) */
|
|
467
|
+
pageImageQuality?: number;
|
|
468
|
+
/** Thumbnail width (default: 200) */
|
|
469
|
+
thumbnailWidth?: number;
|
|
470
|
+
/** Thumbnail quality for JPG (default: 80) */
|
|
471
|
+
thumbnailQuality?: number;
|
|
142
472
|
}
|
|
143
473
|
interface ProgressInfo {
|
|
144
474
|
currentPage: number;
|
|
@@ -242,6 +572,7 @@ interface TemplateOptions {
|
|
|
242
572
|
declare class PDFExtractor {
|
|
243
573
|
private textExtractor;
|
|
244
574
|
private imageExtractor;
|
|
575
|
+
private pageToImageConverter;
|
|
245
576
|
private formatProcessor;
|
|
246
577
|
private structuredDataGenerator;
|
|
247
578
|
private cacheManager;
|
|
@@ -355,6 +686,14 @@ declare class PDFExtractor {
|
|
|
355
686
|
totalCacheSize: number;
|
|
356
687
|
cacheDir: string;
|
|
357
688
|
};
|
|
689
|
+
/**
|
|
690
|
+
* Generate page images with multiple quality variants
|
|
691
|
+
*/
|
|
692
|
+
private generatePageImagesWithVariants;
|
|
693
|
+
/**
|
|
694
|
+
* Generate thumbnails for pages
|
|
695
|
+
*/
|
|
696
|
+
private generatePageThumbnails;
|
|
358
697
|
private reportProgress;
|
|
359
698
|
private createValidationError;
|
|
360
699
|
private createExtractionError;
|
|
@@ -362,10 +701,70 @@ declare class PDFExtractor {
|
|
|
362
701
|
declare const pdfExtractor: PDFExtractor;
|
|
363
702
|
|
|
364
703
|
/**
|
|
365
|
-
*
|
|
704
|
+
* Streaming PDF extractor for large PDFs
|
|
705
|
+
* Provides async iterator and event-based APIs
|
|
706
|
+
*/
|
|
707
|
+
|
|
708
|
+
/**
|
|
709
|
+
* Streaming PDF extractor implementation
|
|
710
|
+
*/
|
|
711
|
+
declare class StreamingPDFExtractor implements StreamingExtractionResult {
|
|
712
|
+
private state;
|
|
713
|
+
private options;
|
|
714
|
+
private pdfPath;
|
|
715
|
+
private extractor;
|
|
716
|
+
private eventQueue;
|
|
717
|
+
private resolveNext;
|
|
718
|
+
private extractionPromise;
|
|
719
|
+
constructor(pdfPath: string, options?: ExtractionOptions & StreamingOptions$1);
|
|
720
|
+
/**
|
|
721
|
+
* Async iterator implementation
|
|
722
|
+
*/
|
|
723
|
+
[Symbol.asyncIterator](): AsyncIterator<StreamEventUnion>;
|
|
724
|
+
/**
|
|
725
|
+
* Register event callbacks
|
|
726
|
+
*/
|
|
727
|
+
on(event: "start", callback: (event: StartEvent) => void): this;
|
|
728
|
+
on(event: "page", callback: (event: PageEvent) => void): this;
|
|
729
|
+
on(event: "image", callback: (event: ImageEvent) => void): this;
|
|
730
|
+
on(event: "progress", callback: (event: ProgressEvent) => void): this;
|
|
731
|
+
on(event: "complete", callback: (event: CompleteEvent) => void): this;
|
|
732
|
+
on(event: "error", callback: (event: ErrorEvent) => void): this;
|
|
733
|
+
on(event: "any", callback: (event: StreamEventUnion) => void): this;
|
|
734
|
+
/**
|
|
735
|
+
* Cancel extraction
|
|
736
|
+
*/
|
|
737
|
+
cancel(): Promise<void>;
|
|
738
|
+
/**
|
|
739
|
+
* Pause extraction (backpressure)
|
|
740
|
+
*/
|
|
741
|
+
pause(): void;
|
|
742
|
+
/**
|
|
743
|
+
* Resume extraction
|
|
744
|
+
*/
|
|
745
|
+
resume(): void;
|
|
746
|
+
/**
|
|
747
|
+
* Get streaming statistics
|
|
748
|
+
*/
|
|
749
|
+
getStats(): StreamingStats;
|
|
750
|
+
/**
|
|
751
|
+
* Emit an event
|
|
752
|
+
*/
|
|
753
|
+
private emitEvent;
|
|
754
|
+
/**
|
|
755
|
+
* Start the extraction process
|
|
756
|
+
*/
|
|
757
|
+
private startExtraction;
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
/**
|
|
761
|
+
* Text extraction from PDF files using pdf.js
|
|
366
762
|
*
|
|
367
|
-
*
|
|
368
|
-
*
|
|
763
|
+
* Direct pdf.js-based text extraction with support for:
|
|
764
|
+
* - Page-by-page extraction with accurate boundaries
|
|
765
|
+
* - Text positioning and font information
|
|
766
|
+
* - Metadata retrieval
|
|
767
|
+
* - No external dependencies (uses pdf.js directly)
|
|
369
768
|
*
|
|
370
769
|
* @example
|
|
371
770
|
* ```typescript
|
|
@@ -375,6 +774,19 @@ declare const pdfExtractor: PDFExtractor;
|
|
|
375
774
|
* ```
|
|
376
775
|
*/
|
|
377
776
|
declare class TextExtractor {
|
|
777
|
+
constructor();
|
|
778
|
+
/**
|
|
779
|
+
* Initialize pdf.js worker
|
|
780
|
+
*/
|
|
781
|
+
private initializePdfjs;
|
|
782
|
+
/**
|
|
783
|
+
* Load PDF document
|
|
784
|
+
*/
|
|
785
|
+
private loadDocument;
|
|
786
|
+
/**
|
|
787
|
+
* Extract text from a single page
|
|
788
|
+
*/
|
|
789
|
+
private getPageText;
|
|
378
790
|
/**
|
|
379
791
|
* Extract text content from PDF
|
|
380
792
|
*
|
|
@@ -383,6 +795,17 @@ declare class TextExtractor {
|
|
|
383
795
|
* @throws {Error} When PDF extraction fails
|
|
384
796
|
*/
|
|
385
797
|
extract(pdfPath: string): Promise<any>;
|
|
798
|
+
/**
|
|
799
|
+
* Extract text with metadata
|
|
800
|
+
*
|
|
801
|
+
* @param pdfPath - Path to the PDF file
|
|
802
|
+
* @returns Promise resolving to extraction result with text and metadata
|
|
803
|
+
* @throws {Error} When PDF extraction fails
|
|
804
|
+
*/
|
|
805
|
+
extractWithMetadata(pdfPath: string): Promise<{
|
|
806
|
+
text: string;
|
|
807
|
+
metadata: any;
|
|
808
|
+
}>;
|
|
386
809
|
/**
|
|
387
810
|
* Extract text with page information
|
|
388
811
|
*
|
|
@@ -392,11 +815,7 @@ declare class TextExtractor {
|
|
|
392
815
|
*/
|
|
393
816
|
extractWithPages(pdfPath: string): Promise<any>;
|
|
394
817
|
/**
|
|
395
|
-
*
|
|
396
|
-
*/
|
|
397
|
-
private splitTextIntoPages;
|
|
398
|
-
/**
|
|
399
|
-
* Extract text items with position and metadata
|
|
818
|
+
* Extract text items with position and metadata using pdf.js
|
|
400
819
|
*/
|
|
401
820
|
extractTextItems(pdfPath: string, options?: ExtractionOptions): Promise<TextItem[]>;
|
|
402
821
|
/**
|
|
@@ -438,21 +857,112 @@ declare class TextExtractor {
|
|
|
438
857
|
pageOffset?: number;
|
|
439
858
|
includeImageRefs?: boolean;
|
|
440
859
|
imageRefFormat?: string;
|
|
441
|
-
imageEngine?: ImageExtractionEngine;
|
|
442
860
|
}): Promise<{
|
|
443
861
|
text: string;
|
|
444
|
-
pages: PageData[];
|
|
862
|
+
pages: PageData$1[];
|
|
445
863
|
}>;
|
|
446
864
|
/**
|
|
447
865
|
* Extract text with accurate page boundaries using pdf-lib + pdf-parse
|
|
448
866
|
*/
|
|
449
867
|
extractWithAccuratePages(pdfPath: string): Promise<{
|
|
450
868
|
fullText: string;
|
|
451
|
-
pages: PageData[];
|
|
869
|
+
pages: PageData$1[];
|
|
452
870
|
totalPages: number;
|
|
453
871
|
}>;
|
|
454
872
|
}
|
|
455
873
|
|
|
874
|
+
/**
|
|
875
|
+
* Structured text extractor using both pdf-lib and pdf.js for accurate page-by-page extraction
|
|
876
|
+
*
|
|
877
|
+
* Extracts text with rich metadata including page dimensions, rotation, word counts, and character counts.
|
|
878
|
+
* Uses pdf-lib for accurate page structure and pdf.js for text content.
|
|
879
|
+
*/
|
|
880
|
+
interface PageData {
|
|
881
|
+
pageNumber: number;
|
|
882
|
+
text: string;
|
|
883
|
+
width: number;
|
|
884
|
+
height: number;
|
|
885
|
+
rotation: number;
|
|
886
|
+
mediaBox: number[];
|
|
887
|
+
textItems?: any[];
|
|
888
|
+
wordCount: number;
|
|
889
|
+
characterCount: number;
|
|
890
|
+
}
|
|
891
|
+
declare class StructuredTextExtractor {
|
|
892
|
+
private pdfLibDoc;
|
|
893
|
+
private pdfLibPages;
|
|
894
|
+
private textData;
|
|
895
|
+
constructor();
|
|
896
|
+
/**
|
|
897
|
+
* Initialize pdf.js worker
|
|
898
|
+
*/
|
|
899
|
+
private initializePdfjs;
|
|
900
|
+
/**
|
|
901
|
+
* Process PDF with accurate page-by-page extraction
|
|
902
|
+
*/
|
|
903
|
+
processPDF(pdfPath: string): Promise<{
|
|
904
|
+
totalPages: number;
|
|
905
|
+
pages: PageData[];
|
|
906
|
+
fullText: string;
|
|
907
|
+
}>;
|
|
908
|
+
/**
|
|
909
|
+
* Process with pdf-lib to get accurate page structure
|
|
910
|
+
*/
|
|
911
|
+
private processPDFLib;
|
|
912
|
+
/**
|
|
913
|
+
* Process with pdf.js to extract text page by page
|
|
914
|
+
*/
|
|
915
|
+
private processPDFjs;
|
|
916
|
+
/**
|
|
917
|
+
* Combine results from both libraries
|
|
918
|
+
*/
|
|
919
|
+
private combineResults;
|
|
920
|
+
/**
|
|
921
|
+
* Extract text with page markers using accurate page boundaries
|
|
922
|
+
*/
|
|
923
|
+
extractWithPageMarkers(pdfPath: string, pageMarkerFormat?: string, options?: {
|
|
924
|
+
includeImageRefs?: boolean;
|
|
925
|
+
imageRefFormat?: string;
|
|
926
|
+
}): Promise<{
|
|
927
|
+
text: string;
|
|
928
|
+
cleanText: string;
|
|
929
|
+
numPages: number;
|
|
930
|
+
pages: PageData[];
|
|
931
|
+
}>;
|
|
932
|
+
/**
|
|
933
|
+
* Get specific page data
|
|
934
|
+
*/
|
|
935
|
+
getPage(pageNumber: number): PageData | null;
|
|
936
|
+
/**
|
|
937
|
+
* Get detailed page information including text positioning
|
|
938
|
+
*/
|
|
939
|
+
getDetailedPageInfo(pdfPath: string, pageNumber: number): Promise<{
|
|
940
|
+
pageNumber: number;
|
|
941
|
+
text: string;
|
|
942
|
+
textItems: Array<{
|
|
943
|
+
text: string;
|
|
944
|
+
x: number;
|
|
945
|
+
y: number;
|
|
946
|
+
width: number;
|
|
947
|
+
height: number;
|
|
948
|
+
fontName?: string;
|
|
949
|
+
fontSize?: number;
|
|
950
|
+
}>;
|
|
951
|
+
dimensions: {
|
|
952
|
+
width: number;
|
|
953
|
+
height: number;
|
|
954
|
+
};
|
|
955
|
+
} | null>;
|
|
956
|
+
/**
|
|
957
|
+
* Count words in text
|
|
958
|
+
*/
|
|
959
|
+
private countWords;
|
|
960
|
+
/**
|
|
961
|
+
* Process single page (for streaming/batch processing)
|
|
962
|
+
*/
|
|
963
|
+
processSinglePage(pdfPath: string, pageNumber: number): Promise<PageData | null>;
|
|
964
|
+
}
|
|
965
|
+
|
|
456
966
|
/**
|
|
457
967
|
* Image extraction from PDF files using pdf-lib (clean implementation based on NestJS)
|
|
458
968
|
*
|
|
@@ -529,6 +1039,381 @@ declare class ImageExtractor {
|
|
|
529
1039
|
private createPngFromPdfMetadata;
|
|
530
1040
|
}
|
|
531
1041
|
|
|
1042
|
+
/**
|
|
1043
|
+
* Types for PDF page to image conversion
|
|
1044
|
+
*/
|
|
1045
|
+
/**
|
|
1046
|
+
* Image format for page conversion
|
|
1047
|
+
*/
|
|
1048
|
+
type PageImageFormat = "png" | "jpg" | "jpeg" | "webp";
|
|
1049
|
+
/**
|
|
1050
|
+
* Options for converting PDF pages to images
|
|
1051
|
+
*/
|
|
1052
|
+
interface PageToImageOptions {
|
|
1053
|
+
/**
|
|
1054
|
+
* Output directory for image files
|
|
1055
|
+
* @default './page-images'
|
|
1056
|
+
*/
|
|
1057
|
+
outputDir?: string;
|
|
1058
|
+
/**
|
|
1059
|
+
* Image format
|
|
1060
|
+
* @default 'png'
|
|
1061
|
+
*/
|
|
1062
|
+
format?: PageImageFormat;
|
|
1063
|
+
/**
|
|
1064
|
+
* JPEG quality (1-100, only for JPG format)
|
|
1065
|
+
* @default 90
|
|
1066
|
+
*/
|
|
1067
|
+
quality?: number;
|
|
1068
|
+
/**
|
|
1069
|
+
* DPI (dots per inch) for rendering
|
|
1070
|
+
* Higher DPI = better quality but larger files
|
|
1071
|
+
* @default 72
|
|
1072
|
+
*/
|
|
1073
|
+
dpi?: number;
|
|
1074
|
+
/**
|
|
1075
|
+
* Scale factor (multiplier for dimensions)
|
|
1076
|
+
* @default 1
|
|
1077
|
+
*/
|
|
1078
|
+
scale?: number;
|
|
1079
|
+
/**
|
|
1080
|
+
* Specific pages to convert (1-based)
|
|
1081
|
+
* If not provided, converts all pages
|
|
1082
|
+
* @example [1, 3, 5]
|
|
1083
|
+
*/
|
|
1084
|
+
pages?: number[];
|
|
1085
|
+
/**
|
|
1086
|
+
* Page range to convert (e.g., "1-5", "1,3,5-10")
|
|
1087
|
+
* If not provided, converts all pages
|
|
1088
|
+
* @example "1-5"
|
|
1089
|
+
*/
|
|
1090
|
+
pageRange?: string;
|
|
1091
|
+
/**
|
|
1092
|
+
* Filename pattern for output files
|
|
1093
|
+
* Available placeholders: {page}, {total}, {name}, {ext}
|
|
1094
|
+
* @default 'page-{page}.{ext}'
|
|
1095
|
+
*/
|
|
1096
|
+
filenamePattern?: string;
|
|
1097
|
+
/**
|
|
1098
|
+
* Background color for transparent PDFs
|
|
1099
|
+
* @default '#FFFFFF'
|
|
1100
|
+
*/
|
|
1101
|
+
backgroundColor?: string;
|
|
1102
|
+
/**
|
|
1103
|
+
* Enable transparent background (PNG only)
|
|
1104
|
+
* @default false
|
|
1105
|
+
*/
|
|
1106
|
+
transparent?: boolean;
|
|
1107
|
+
/**
|
|
1108
|
+
* Crop to content (remove white margins)
|
|
1109
|
+
* @default false
|
|
1110
|
+
*/
|
|
1111
|
+
cropToContent?: boolean;
|
|
1112
|
+
/**
|
|
1113
|
+
* Progress callback
|
|
1114
|
+
*/
|
|
1115
|
+
onProgress?: (current: number, total: number, percentage: number) => void;
|
|
1116
|
+
/**
|
|
1117
|
+
* Callback when a page is converted
|
|
1118
|
+
*/
|
|
1119
|
+
onPageComplete?: (pageNumber: number, filepath: string) => void;
|
|
1120
|
+
/**
|
|
1121
|
+
* Verbose logging
|
|
1122
|
+
* @default false
|
|
1123
|
+
*/
|
|
1124
|
+
verbose?: boolean;
|
|
1125
|
+
}
|
|
1126
|
+
/**
|
|
1127
|
+
* Result of page to image conversion
|
|
1128
|
+
*/
|
|
1129
|
+
interface PageImageResult {
|
|
1130
|
+
/**
|
|
1131
|
+
* Page number (1-based)
|
|
1132
|
+
*/
|
|
1133
|
+
page: number;
|
|
1134
|
+
/**
|
|
1135
|
+
* Output file path
|
|
1136
|
+
*/
|
|
1137
|
+
filepath: string;
|
|
1138
|
+
/**
|
|
1139
|
+
* Image width in pixels
|
|
1140
|
+
*/
|
|
1141
|
+
width: number;
|
|
1142
|
+
/**
|
|
1143
|
+
* Image height in pixels
|
|
1144
|
+
*/
|
|
1145
|
+
height: number;
|
|
1146
|
+
/**
|
|
1147
|
+
* File size in bytes
|
|
1148
|
+
*/
|
|
1149
|
+
fileSize: number;
|
|
1150
|
+
/**
|
|
1151
|
+
* Image format
|
|
1152
|
+
*/
|
|
1153
|
+
format: PageImageFormat;
|
|
1154
|
+
}
|
|
1155
|
+
/**
|
|
1156
|
+
* Result of converting all pages
|
|
1157
|
+
*/
|
|
1158
|
+
interface PageToImageResult {
|
|
1159
|
+
/**
|
|
1160
|
+
* Array of converted page images
|
|
1161
|
+
*/
|
|
1162
|
+
images: PageImageResult[];
|
|
1163
|
+
/**
|
|
1164
|
+
* Total number of pages converted
|
|
1165
|
+
*/
|
|
1166
|
+
totalPages: number;
|
|
1167
|
+
/**
|
|
1168
|
+
* Output directory
|
|
1169
|
+
*/
|
|
1170
|
+
outputDir: string;
|
|
1171
|
+
/**
|
|
1172
|
+
* Total size of all images in bytes
|
|
1173
|
+
*/
|
|
1174
|
+
totalSize: number;
|
|
1175
|
+
}
|
|
1176
|
+
/**
|
|
1177
|
+
* Options for converting a single page
|
|
1178
|
+
*/
|
|
1179
|
+
interface SinglePageOptions {
|
|
1180
|
+
/**
|
|
1181
|
+
* Image format
|
|
1182
|
+
* @default 'png'
|
|
1183
|
+
*/
|
|
1184
|
+
format?: PageImageFormat;
|
|
1185
|
+
/**
|
|
1186
|
+
* JPEG quality (1-100)
|
|
1187
|
+
* @default 90
|
|
1188
|
+
*/
|
|
1189
|
+
quality?: number;
|
|
1190
|
+
/**
|
|
1191
|
+
* DPI for rendering
|
|
1192
|
+
* @default 72
|
|
1193
|
+
*/
|
|
1194
|
+
dpi?: number;
|
|
1195
|
+
/**
|
|
1196
|
+
* Scale factor
|
|
1197
|
+
* @default 1
|
|
1198
|
+
*/
|
|
1199
|
+
scale?: number;
|
|
1200
|
+
/**
|
|
1201
|
+
* Background color
|
|
1202
|
+
* @default '#FFFFFF'
|
|
1203
|
+
*/
|
|
1204
|
+
backgroundColor?: string;
|
|
1205
|
+
/**
|
|
1206
|
+
* Transparent background (PNG only)
|
|
1207
|
+
* @default false
|
|
1208
|
+
*/
|
|
1209
|
+
transparent?: boolean;
|
|
1210
|
+
}
|
|
1211
|
+
/**
|
|
1212
|
+
* Thumbnail generation options
|
|
1213
|
+
*/
|
|
1214
|
+
interface ThumbnailOptions extends SinglePageOptions {
|
|
1215
|
+
/**
|
|
1216
|
+
* Maximum width in pixels
|
|
1217
|
+
* @default 200
|
|
1218
|
+
*/
|
|
1219
|
+
maxWidth?: number;
|
|
1220
|
+
/**
|
|
1221
|
+
* Maximum height in pixels
|
|
1222
|
+
* @default 200
|
|
1223
|
+
*/
|
|
1224
|
+
maxHeight?: number;
|
|
1225
|
+
/**
|
|
1226
|
+
* Maintain aspect ratio
|
|
1227
|
+
* @default true
|
|
1228
|
+
*/
|
|
1229
|
+
maintainAspectRatio?: boolean;
|
|
1230
|
+
}
|
|
1231
|
+
|
|
1232
|
+
/**
|
|
1233
|
+
* PDF Page to Image Converter using pdf.js
|
|
1234
|
+
*
|
|
1235
|
+
* Converts PDF pages to image files (PNG, JPG, WebP) with customizable options.
|
|
1236
|
+
* Uses Mozilla's pdf.js for high-quality rendering without external dependencies.
|
|
1237
|
+
*/
|
|
1238
|
+
|
|
1239
|
+
/**
|
|
1240
|
+
* Page to Image Converter
|
|
1241
|
+
*
|
|
1242
|
+
* @example
|
|
1243
|
+
* ```typescript
|
|
1244
|
+
* const converter = new PageToImageConverter();
|
|
1245
|
+
* const result = await converter.convertToImages('document.pdf', {
|
|
1246
|
+
* outputDir: './pages',
|
|
1247
|
+
* format: 'png',
|
|
1248
|
+
* dpi: 150
|
|
1249
|
+
* });
|
|
1250
|
+
* ```
|
|
1251
|
+
*/
|
|
1252
|
+
declare class PageToImageConverter {
|
|
1253
|
+
private pdfjs;
|
|
1254
|
+
/**
|
|
1255
|
+
* Get or load pdf.js module with proper worker configuration
|
|
1256
|
+
* Based on pdf-to-img library approach
|
|
1257
|
+
*/
|
|
1258
|
+
private getPdfjs;
|
|
1259
|
+
/**
|
|
1260
|
+
* Convert all pages of a PDF to images
|
|
1261
|
+
*
|
|
1262
|
+
* @param pdfPath - Path to PDF file
|
|
1263
|
+
* @param options - Conversion options
|
|
1264
|
+
* @returns Conversion result with image paths
|
|
1265
|
+
*/
|
|
1266
|
+
convertToImages(pdfPath: string, options?: PageToImageOptions): Promise<PageToImageResult>;
|
|
1267
|
+
/**
|
|
1268
|
+
* Convert a single page to an image file
|
|
1269
|
+
*
|
|
1270
|
+
* @param pdfPath - Path to PDF file
|
|
1271
|
+
* @param pageNumber - Page number (1-based)
|
|
1272
|
+
* @param outputPath - Output file path
|
|
1273
|
+
* @param options - Conversion options
|
|
1274
|
+
*/
|
|
1275
|
+
convertPage(pdfPath: string, pageNumber: number, outputPath: string, options?: SinglePageOptions): Promise<PageImageResult>;
|
|
1276
|
+
/**
|
|
1277
|
+
* Convert a page to a buffer (no file write)
|
|
1278
|
+
*
|
|
1279
|
+
* @param pdfPath - Path to PDF file
|
|
1280
|
+
* @param pageNumber - Page number (1-based)
|
|
1281
|
+
* @param options - Conversion options
|
|
1282
|
+
* @returns Image buffer
|
|
1283
|
+
*/
|
|
1284
|
+
convertPageToBuffer(pdfPath: string, pageNumber: number, options?: SinglePageOptions): Promise<Buffer>;
|
|
1285
|
+
/**
|
|
1286
|
+
* Convert a page to base64 string
|
|
1287
|
+
*
|
|
1288
|
+
* @param pdfPath - Path to PDF file
|
|
1289
|
+
* @param pageNumber - Page number (1-based)
|
|
1290
|
+
* @param options - Conversion options
|
|
1291
|
+
* @returns Base64 encoded image
|
|
1292
|
+
*/
|
|
1293
|
+
convertPageToBase64(pdfPath: string, pageNumber: number, options?: SinglePageOptions): Promise<string>;
|
|
1294
|
+
/**
|
|
1295
|
+
* Generate thumbnails for all pages
|
|
1296
|
+
*
|
|
1297
|
+
* @param pdfPath - Path to PDF file
|
|
1298
|
+
* @param options - Thumbnail options
|
|
1299
|
+
* @returns Conversion result
|
|
1300
|
+
*/
|
|
1301
|
+
generateThumbnails(pdfPath: string, options?: ThumbnailOptions & {
|
|
1302
|
+
outputDir?: string;
|
|
1303
|
+
}): Promise<PageToImageResult>;
|
|
1304
|
+
/**
|
|
1305
|
+
* Render a PDF page to image buffer
|
|
1306
|
+
*
|
|
1307
|
+
* Based on pdf-to-img library approach - let pdf.js handle canvas creation
|
|
1308
|
+
* @see https://github.com/k-yle/pdf-to-img
|
|
1309
|
+
*/
|
|
1310
|
+
private renderPageToBuffer;
|
|
1311
|
+
/**
|
|
1312
|
+
* Convert canvas to image buffer
|
|
1313
|
+
*/
|
|
1314
|
+
private canvasToBuffer;
|
|
1315
|
+
/**
|
|
1316
|
+
* Get page numbers to convert based on options
|
|
1317
|
+
*/
|
|
1318
|
+
private getPageNumbers;
|
|
1319
|
+
/**
|
|
1320
|
+
* Parse page range string (e.g., "1-5", "1,3,5-10")
|
|
1321
|
+
*/
|
|
1322
|
+
private parsePageRange;
|
|
1323
|
+
/**
|
|
1324
|
+
* Generate filename from pattern
|
|
1325
|
+
*/
|
|
1326
|
+
private generateFilename;
|
|
1327
|
+
/**
|
|
1328
|
+
* Format bytes to human-readable string
|
|
1329
|
+
*/
|
|
1330
|
+
private formatBytes;
|
|
1331
|
+
}
|
|
1332
|
+
|
|
1333
|
+
/**
|
|
1334
|
+
* Result of image optimization
|
|
1335
|
+
*/
|
|
1336
|
+
interface OptimizationResult {
|
|
1337
|
+
success: boolean;
|
|
1338
|
+
originalSize: number;
|
|
1339
|
+
optimizedSize: number;
|
|
1340
|
+
savedBytes: number;
|
|
1341
|
+
savedPercent: number;
|
|
1342
|
+
engine: "jimp" | "sharp" | "none";
|
|
1343
|
+
error?: string;
|
|
1344
|
+
}
|
|
1345
|
+
/**
|
|
1346
|
+
* Options for image optimization
|
|
1347
|
+
*/
|
|
1348
|
+
interface OptimizationOptions {
|
|
1349
|
+
quality?: number;
|
|
1350
|
+
verbose?: boolean;
|
|
1351
|
+
useSharp?: boolean;
|
|
1352
|
+
}
|
|
1353
|
+
/**
|
|
1354
|
+
* Image optimizer using Jimp (pure JavaScript), with optional Sharp support
|
|
1355
|
+
*
|
|
1356
|
+
* This class provides image optimization capabilities using Jimp, a pure JavaScript
|
|
1357
|
+
* image processing library with no native dependencies. It supports JPEG and PNG
|
|
1358
|
+
* optimization with quality control.
|
|
1359
|
+
*
|
|
1360
|
+
* @example
|
|
1361
|
+
* ```typescript
|
|
1362
|
+
* const result = await ImageOptimizer.optimizeFile('image.jpg', {
|
|
1363
|
+
* useSharp: false,
|
|
1364
|
+
* quality: 80
|
|
1365
|
+
* });
|
|
1366
|
+
*
|
|
1367
|
+
* console.log(`Saved ${result.savedPercent.toFixed(1)}% using ${result.engine}`);
|
|
1368
|
+
* ```
|
|
1369
|
+
*/
|
|
1370
|
+
declare class ImageOptimizer {
|
|
1371
|
+
/**
|
|
1372
|
+
* Optimize an image file in-place
|
|
1373
|
+
*
|
|
1374
|
+
* The original file will be replaced with the optimized version.
|
|
1375
|
+
* If optimization fails, the original file remains unchanged.
|
|
1376
|
+
*
|
|
1377
|
+
* @param filePath - Path to the image file to optimize
|
|
1378
|
+
* @param options - Optimization options
|
|
1379
|
+
* @returns Promise resolving to optimization result
|
|
1380
|
+
*/
|
|
1381
|
+
static optimizeFile(filePath: string, options?: OptimizationOptions): Promise<OptimizationResult>;
|
|
1382
|
+
/**
|
|
1383
|
+
* Optimize using Sharp (optional dependency)
|
|
1384
|
+
*/
|
|
1385
|
+
private static optimizeWithSharp;
|
|
1386
|
+
/**
|
|
1387
|
+
* Optimize using Jimp (pure JavaScript)
|
|
1388
|
+
*/
|
|
1389
|
+
private static optimizeWithJimp;
|
|
1390
|
+
/**
|
|
1391
|
+
* Convert JPEG 2000 formats (jp2, jpx, j2c, jpm) to JPG
|
|
1392
|
+
*
|
|
1393
|
+
* JPEG 2000 files are not widely supported by browsers and image tools.
|
|
1394
|
+
* This method converts them to standard JPG format for better compatibility.
|
|
1395
|
+
*
|
|
1396
|
+
* Supports two conversion engines:
|
|
1397
|
+
* - Jimp (default): Pure JavaScript, works everywhere
|
|
1398
|
+
* - Sharp (optional): Better color preservation, requires native compilation
|
|
1399
|
+
*
|
|
1400
|
+
* @param jp2Path - Path to the JPEG 2000 file (jp2, jpx, j2c, or jpm)
|
|
1401
|
+
* @param options - Conversion options
|
|
1402
|
+
* @returns Promise resolving to conversion result with new file path
|
|
1403
|
+
*/
|
|
1404
|
+
static convertJp2ToJpg(jp2Path: string, options?: {
|
|
1405
|
+
quality?: number;
|
|
1406
|
+
verbose?: boolean;
|
|
1407
|
+
useSharp?: boolean;
|
|
1408
|
+
}): Promise<{
|
|
1409
|
+
success: boolean;
|
|
1410
|
+
newPath?: string;
|
|
1411
|
+
originalSize?: number;
|
|
1412
|
+
newSize?: number;
|
|
1413
|
+
error?: string;
|
|
1414
|
+
}>;
|
|
1415
|
+
}
|
|
1416
|
+
|
|
532
1417
|
/**
|
|
533
1418
|
* Handles formatting of image references and text processing
|
|
534
1419
|
*/
|
|
@@ -596,12 +1481,23 @@ declare function validateImageRefFormat(format: string): ValidationError[];
|
|
|
596
1481
|
*/
|
|
597
1482
|
declare function validateFilePath(filePath: string, extensions?: string[]): ValidationError[];
|
|
598
1483
|
|
|
1484
|
+
/**
|
|
1485
|
+
* pdf-plus - A comprehensive PDF content extraction library
|
|
1486
|
+
*
|
|
1487
|
+
* Main entry point for the PDF content extraction library.
|
|
1488
|
+
* Provides both high-level convenience functions and low-level access to extractors.
|
|
1489
|
+
*
|
|
1490
|
+
* @packageDocumentation
|
|
1491
|
+
*/
|
|
1492
|
+
|
|
599
1493
|
/**
|
|
600
1494
|
* Extract content from a PDF file (convenience function)
|
|
601
1495
|
*
|
|
1496
|
+
* Automatically switches to streaming mode when `autoStreamThreshold` is set and the PDF has more pages than that threshold.
|
|
1497
|
+
*
|
|
602
1498
|
* @param pdfPath - Path to the PDF file
|
|
603
1499
|
* @param options - Extraction options
|
|
604
|
-
* @returns Promise resolving to extraction result
|
|
1500
|
+
* @returns Promise resolving to extraction result or streaming result
|
|
605
1501
|
*
|
|
606
1502
|
* @example
|
|
607
1503
|
* ```typescript
|
|
@@ -615,8 +1511,17 @@ declare function validateFilePath(filePath: string, extensions?: string[]): Vali
|
|
|
615
1511
|
*
|
|
616
1512
|
* console.log(`Extracted ${result.images.length} images from ${result.document.pages} pages`);
|
|
617
1513
|
* ```
|
|
1514
|
+
*
|
|
1515
|
+
* @example
|
|
1516
|
+
* ```typescript
|
|
1517
|
+
* // Auto-streaming for large PDFs
|
|
1518
|
+
* const result = await extractPdfContent('large-document.pdf', {
|
|
1519
|
+
* extractImageFiles: true,
|
|
1520
|
+
* autoStreamThreshold: 100, // Auto-stream if > 100 pages
|
|
1521
|
+
* });
|
|
1522
|
+
* ```
|
|
618
1523
|
*/
|
|
619
|
-
declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult>;
|
|
1524
|
+
declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult | StreamingExtractionResult>;
|
|
620
1525
|
/**
|
|
621
1526
|
* Extract only text content from a PDF (convenience function)
|
|
622
1527
|
*
|
|
@@ -673,22 +1578,70 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
|
|
|
673
1578
|
* ```
|
|
674
1579
|
*/
|
|
675
1580
|
declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
676
|
-
|
|
677
|
-
|
|
1581
|
+
/**
|
|
1582
|
+
* Extract PDF content in streaming mode
|
|
1583
|
+
*
|
|
1584
|
+
* For large PDFs, this provides a streaming API that processes pages one at a time,
|
|
1585
|
+
* reducing memory usage and providing real-time progress updates.
|
|
1586
|
+
*
|
|
1587
|
+
* @param pdfPath - Path to the PDF file
|
|
1588
|
+
* @param options - Extraction and streaming options
|
|
1589
|
+
* @returns StreamingExtractionResult with async iterator and event callbacks
|
|
1590
|
+
*
|
|
1591
|
+
* @example
|
|
1592
|
+
* ```typescript
|
|
1593
|
+
* // Using async iterator
|
|
1594
|
+
* const stream = extractPdfStream('large-document.pdf', {
|
|
1595
|
+
* extractImageFiles: true,
|
|
1596
|
+
* imageOutputDir: './images',
|
|
1597
|
+
* streamMode: true
|
|
1598
|
+
* });
|
|
1599
|
+
*
|
|
1600
|
+
* for await (const event of stream) {
|
|
1601
|
+
* if (event.type === 'page') {
|
|
1602
|
+
* console.log(`Processed page ${event.pageNumber}/${event.totalPages}`);
|
|
1603
|
+
* } else if (event.type === 'progress') {
|
|
1604
|
+
* console.log(`Progress: ${event.percentComplete.toFixed(1)}%`);
|
|
1605
|
+
* }
|
|
1606
|
+
* }
|
|
1607
|
+
*
|
|
1608
|
+
* // Alternatively, using event callbacks (separate snippet; `stream` is declared anew here)
|
|
1609
|
+
* const stream = extractPdfStream('large-document.pdf', { streamMode: true })
|
|
1610
|
+
* .on('page', (event) => console.log(`Page ${event.pageNumber} done`))
|
|
1611
|
+
* .on('progress', (event) => console.log(`${event.percentComplete}% complete`))
|
|
1612
|
+
* .on('complete', (event) => console.log(`Done! ${event.totalImages} images`));
|
|
1613
|
+
*
|
|
1614
|
+
* for await (const event of stream) {
|
|
1615
|
+
* // Events are also available via iterator
|
|
1616
|
+
* }
|
|
1617
|
+
* ```
|
|
1618
|
+
*/
|
|
1619
|
+
declare function extractPdfStream(pdfPath: string, options?: Partial<ExtractionOptions>): StreamingExtractionResult;
|
|
1620
|
+
/**
|
|
1621
|
+
* Library version
|
|
1622
|
+
*/
|
|
1623
|
+
declare const version = "1.0.3";
|
|
1624
|
+
/**
|
|
1625
|
+
* Default export containing all public APIs
|
|
1626
|
+
* Useful for CommonJS: const pdfPlus = require('pdf-plus');
|
|
1627
|
+
*/
|
|
678
1628
|
declare const _default: {
|
|
679
1629
|
PDFExtractor: typeof PDFExtractor;
|
|
680
1630
|
pdfExtractor: PDFExtractor;
|
|
1631
|
+
StreamingPDFExtractor: typeof StreamingPDFExtractor;
|
|
681
1632
|
TextExtractor: typeof TextExtractor;
|
|
682
1633
|
ImageExtractor: typeof ImageExtractor;
|
|
1634
|
+
ImageOptimizer: typeof ImageOptimizer;
|
|
683
1635
|
FormatProcessor: typeof FormatProcessor;
|
|
684
1636
|
extractPdfContent: typeof extractPdfContent;
|
|
685
1637
|
extractText: typeof extractText;
|
|
686
1638
|
extractImages: typeof extractImages;
|
|
687
1639
|
extractImageFiles: typeof extractImageFiles;
|
|
1640
|
+
extractPdfStream: typeof extractPdfStream;
|
|
688
1641
|
validateConfig: typeof validateConfig;
|
|
689
1642
|
validateImageRefFormat: typeof validateImageRefFormat;
|
|
690
1643
|
validateFilePath: typeof validateFilePath;
|
|
691
1644
|
version: string;
|
|
692
1645
|
};
|
|
693
1646
|
|
|
694
|
-
export { type AnalyticsData, type DocumentMetadata, type DocumentSummary, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type
|
|
1647
|
+
export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
|