pdf-plus 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,694 @@
1
+ /**
2
+ * Core types for PDF content extraction
3
+ */
4
+ interface Position { // Placement of an item: x/y coordinates plus width/height (units and origin are not stated here)
5
+ x: number;
6
+ y: number;
7
+ width: number;
8
+ height: number;
9
+ }
10
+ interface FontInfo { // Font attributes attached to a TextItem through its `font` field
11
+ name: string;
12
+ size: number;
13
+ weight?: string;
14
+ style?: string;
15
+ color?: string; // NOTE(review): color format (hex, rgb(...), named) is not specified — confirm
16
+ }
17
+ interface TextItem { // One extracted piece of text together with its placement, font and page
18
+ id: string;
19
+ content: string;
20
+ position: Position;
21
+ font: FontInfo;
22
+ page: number; // NOTE(review): presumably a 1-based page number — confirm
23
+ transform?: number[]; // NOTE(review): presumably the PDF transformation matrix — confirm
24
+ type: "text" | "heading" | "paragraph" | "caption";
25
+ fontSize?: number; // overlaps with font.size; which of the two is authoritative is not stated here
26
+ color?: string; // overlaps with font.color
27
+ }
28
+ interface ImageItem { // One image found in the PDF: identity, placement, dimensions and optional file/binary data
29
+ id: string;
30
+ name?: string;
31
+ filename?: string;
32
+ filepath?: string; // NOTE(review): `filePath` (capital P) is also declared below — confirm which one producers populate
33
+ position: Position;
34
+ page: number;
35
+ transform?: number[];
36
+ width: number; // declared in addition to position.width; relationship between the two is not stated here
37
+ height: number; // declared in addition to position.height
38
+ format?: string;
39
+ mimeType?: string;
40
+ size?: number; // NOTE(review): presumably a byte count — confirm
41
+ filePath?: string; // differs from `filepath` above only by letter case
42
+ data?: Uint8Array;
43
+ }
44
+ interface PageInfo { // Per-page entry of ExtractionResult.pages: page size plus the text items and images on it
45
+ number: number;
46
+ width: number;
47
+ height: number;
48
+ textItems: TextItem[];
49
+ images: ImageItem[];
50
+ textCount: number; // NOTE(review): presumably textItems.length — confirm
51
+ imageCount: number; // NOTE(review): presumably images.length — confirm
52
+ }
53
+ interface DocumentMetadata { // Document-level information stored in ExtractionResult.document
54
+ filename: string;
55
+ pages: number; // a number here, whereas ExtractionResult.pages is a PageInfo[]
56
+ textLength: number;
57
+ extractedAt: string; // NOTE(review): presumably an ISO-8601 timestamp — confirm
58
+ metadata: Record<string, unknown>;
59
+ options: ExtractionOptions;
60
+ }
61
+ interface ExtractionResult { // Result type of PDFExtractor.extract() and extractPdfContent()
62
+ document: DocumentMetadata;
63
+ pages: PageInfo[];
64
+ images: ImageItem[];
65
+ textItems: TextItem[];
66
+ textWithRefs: string; // NOTE(review): presumably the text with image references inserted — confirm
67
+ cleanText: string;
68
+ summary?: DocumentSummary;
69
+ structuredData?: StructuredPageData; // NOTE(review): presumably set when generateStructuredData is enabled — confirm
70
+ }
71
+ interface DocumentSummary { // Aggregate counts for a document; optional `summary` field of ExtractionResult
72
+ totalPages: number;
73
+ totalTextItems: number;
74
+ totalImages: number;
75
+ totalTextLength: number;
76
+ averageImagesPerPage: string; // typed as string, not number — NOTE(review): presumably pre-formatted; confirm
77
+ pagesWithImages: number;
78
+ }
79
+ interface StructuredPageData { // Document metadata plus a PageData entry per page; used by ExtractionResult.structuredData but absent from this file's export list
80
+ metadata: {
81
+ filename: string;
82
+ extractedAt: string;
83
+ totalPages: number;
84
+ totalTextLength: number;
85
+ totalImages: number;
86
+ extractionOptions: ExtractionOptions;
87
+ };
88
+ pages: PageData[];
89
+ }
90
+ interface PageData { // Text and image data for one page; element type of StructuredPageData.pages and of TextExtractor page results
91
+ pageNumber: number;
92
+ text: {
93
+ content: string;
94
+ rawText: string;
95
+ wordCount: number;
96
+ characterCount: number;
97
+ };
98
+ images: PageImageData[];
99
+ imageCount: number; // NOTE(review): presumably images.length — confirm
100
+ }
101
+ interface PageImageData { // Image entry inside PageData.images; `name` and `format` are required here but optional on ImageItem
102
+ id: string;
103
+ name: string;
104
+ filename?: string;
105
+ path?: string; // named `path` here, while ImageItem uses `filepath` / `filePath`
106
+ position: { // same four fields as the Position interface, declared inline
107
+ x: number;
108
+ y: number;
109
+ width: number;
110
+ height: number;
111
+ };
112
+ format: string;
113
+ size?: number;
114
+ }
115
+ type ImageExtractionEngine = "pdf-lib" | "poppler" | "auto"; // Accepted values for ExtractionOptions.imageEngine
116
+ interface ExtractionOptions { // Options accepted by the extractor classes and convenience functions; every field is optional
117
+ extractText?: boolean;
118
+ extractImages?: boolean;
119
+ extractImageFiles?: boolean;
120
+ useImagePaths?: boolean;
121
+ imageOutputDir?: string;
122
+ imageRefFormat?: string; // NOTE(review): presumably a template using the FormatPlaceholder names — confirm
123
+ includeImageRefs?: boolean;
124
+ includePageMarkers?: boolean;
125
+ pageMarkerFormat?: string;
126
+ /** Page number offset to align with visual PDF pages (e.g., +1 if PDF has cover page) */
127
+ pageOffset?: number;
128
+ /** Use combined extractor for accurate page boundaries (recommended) */
129
+ useCombinedExtractor?: boolean;
130
+ generateStructuredData?: boolean;
131
+ extractTextItems?: boolean;
132
+ specificPages?: number[];
133
+ useCache?: boolean;
134
+ /** Image extraction engine to use */
135
+ imageEngine?: ImageExtractionEngine;
136
+ cacheDir?: string;
137
+ baseName?: string;
138
+ verbose?: boolean;
139
+ memoryLimit?: string; // string here, whereas StreamingOptions.memoryLimit is a number — expected format is not stated
140
+ batchSize?: number;
141
+ progressCallback?: (progress: ProgressInfo) => void; // receives a ProgressInfo; return value is ignored (void)
142
+ }
143
+ interface ProgressInfo { // Payload passed to ExtractionOptions.progressCallback
144
+ currentPage: number;
145
+ totalPages: number;
146
+ phase: "text" | "images" | "processing" | "complete";
147
+ message?: string;
148
+ }
149
+ interface ExtractorConfig { // Input of validateConfig(): PDF path, optional output directory and the extraction options
150
+ pdfPath: string;
151
+ outputDir?: string;
152
+ options: ExtractionOptions;
153
+ }
154
+ interface ValidationError { // One validation problem as returned by the validate* functions; a plain interface that does not extend Error
155
+ field: string;
156
+ message: string;
157
+ value?: unknown;
158
+ }
159
+ interface PageExtractionResult { // Result type of PDFExtractor.getPage(); not included in this file's export list
160
+ pageNumber: number;
161
+ text: string;
162
+ rawText: string;
163
+ textItems: TextItem[];
164
+ images: ImageItem[];
165
+ metadata: {
166
+ wordCount: number;
167
+ characterCount: number;
168
+ imageCount: number;
169
+ };
170
+ }
171
+ interface ExtractionError extends Error { // Error shape carrying a code, optional context and optional validation details
172
+ code: string;
173
+ context?: Record<string, unknown>;
174
+ validationErrors?: ValidationError[];
175
+ }
176
+ type FormatPlaceholder = "id" | "name" | "page" | "index" | "path"; // Placeholder names; each matches a field of FormatContext
177
+ interface FormatContext { // One value per FormatPlaceholder name; not referenced by any other declaration in this file
178
+ id: string;
179
+ name: string;
180
+ page: number;
181
+ index: number;
182
+ path: string;
183
+ }
184
+ interface ProcessingPhase { // Named phase with a status; exported but not referenced by any other declaration in this file
185
+ name: string;
186
+ description: string;
187
+ status: "not_started" | "in_progress" | "complete" | "error";
188
+ progress?: number; // NOTE(review): scale (0-1 vs 0-100) is not stated — confirm
189
+ error?: string;
190
+ }
191
+ interface MemoryUsage { // Memory snapshot; exported but not referenced by any other declaration in this file
192
+ used: number; // NOTE(review): unit is not stated — presumably bytes; confirm
193
+ total: number;
194
+ percentage: number;
195
+ timestamp: number; // NOTE(review): presumably epoch milliseconds — confirm
196
+ }
197
+ interface StreamingOptions { // Streaming settings; exported but not referenced by any other declaration in this file
198
+ batchSize: number;
199
+ memoryLimit: number; // number here, whereas ExtractionOptions.memoryLimit is a string
200
+ enableCaching: boolean;
201
+ cacheSize?: number;
202
+ }
203
+ interface OCROptions { // OCR settings; exported but not referenced by any other declaration in this file
204
+ enabled: boolean;
205
+ language?: string;
206
+ confidence?: number; // NOTE(review): scale and meaning (threshold?) are not stated — confirm
207
+ engine?: "tesseract" | "cloud";
208
+ }
209
+ interface AnalyticsData { // Processing metrics; exported but not referenced by any other declaration in this file
210
+ processingTime: number; // NOTE(review): unit is not stated — presumably milliseconds; confirm
211
+ memoryPeak: number;
212
+ pagesPerSecond: number;
213
+ errorCount: number;
214
+ qualityScore?: number;
215
+ }
216
+ interface TemplateOptions { // Output template settings; exported but not referenced by any other declaration in this file
217
+ format: "markdown" | "html" | "xml" | "json" | "custom";
218
+ template?: string; // NOTE(review): presumably only meaningful when format is "custom" — confirm
219
+ variables?: Record<string, unknown>;
220
+ }
221
+
222
+ /**
223
+ * Main PDF content extractor class
224
+ *
225
+ * Provides comprehensive PDF content extraction capabilities including:
226
+ * - Text extraction with positioning and formatting
227
+ * - Image detection and extraction
228
+ * - Structured data generation
229
+ * - Page-specific extraction
230
+ * - Caching for performance optimization
231
+ *
232
+ * @example
233
+ * ```typescript
234
+ * const extractor = new PDFExtractor();
235
+ * const result = await extractor.extract('document.pdf', {
236
+ * extractText: true,
237
+ * extractImages: true,
238
+ * verbose: true
239
+ * });
240
+ * ```
241
+ */
242
+ declare class PDFExtractor {
243
+ private textExtractor;
244
+ private imageExtractor;
245
+ private formatProcessor;
246
+ private structuredDataGenerator;
247
+ private cacheManager;
248
+ /**
249
+ * Create a new PDFExtractor instance
250
+ *
251
+ * @param cacheDir - Optional directory for caching extracted data
252
+ */
253
+ constructor(cacheDir?: string);
254
+ /**
255
+ * Extract content from a PDF file
256
+ *
257
+ * This is the main extraction method that can extract text, images, or both
258
+ * depending on the provided options. It supports various output formats and
259
+ * processing modes.
260
+ *
261
+ * @param pdfPath - Path to the PDF file to extract content from
262
+ * @param options - Configuration options for extraction
263
+ * @returns Promise resolving to complete extraction results
264
+ *
265
+ * @throws {ValidationError} When configuration is invalid (NOTE(review): ValidationError is declared as a plain interface, not an Error subclass — confirm the actual thrown type)
266
+ * @throws {ExtractionError} When PDF processing fails
267
+ *
268
+ * @example
269
+ * ```typescript
270
+ * // Extract both text and images
271
+ * const result = await extractor.extract('document.pdf', {
272
+ * extractText: true,
273
+ * extractImages: true,
274
+ * extractImageFiles: true,
275
+ * imageOutputDir: './images',
276
+ * verbose: true
277
+ * });
278
+ *
279
+ * console.log(`Extracted ${result.images.length} images`);
280
+ * console.log(`Text: ${result.cleanText.substring(0, 100)}...`);
281
+ * ```
282
+ */
283
+ extract(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult>;
284
+ /**
285
+ * Extract only text content (optimized)
286
+ *
287
+ * This method is optimized for text-only extraction and is faster than
288
+ * the full extract() method when you only need text content.
289
+ *
290
+ * @param pdfPath - Path to the PDF file
291
+ * @param options - Partial extraction options (images will be disabled)
292
+ * @returns Promise resolving to extracted text content
293
+ *
294
+ * @example
295
+ * ```typescript
296
+ * const text = await extractor.extractText('document.pdf', {
297
+ * verbose: true
298
+ * });
299
+ * console.log(`Extracted ${text.length} characters`);
300
+ * ```
301
+ */
302
+ extractText(pdfPath: string, options?: Partial<ExtractionOptions>): Promise<string>;
303
+ /**
304
+ * Extract only image references (optimized); resolves to the same ImageItem[] type as ExtractionResult["images"]
305
+ */
306
+ extractImages(pdfPath: string, options?: Partial<ExtractionOptions>): Promise<ExtractionResult["images"]>;
307
+ /**
308
+ * Extract and save image files; resolves to a string[] (NOTE(review): presumably the saved file paths — confirm)
309
+ */
310
+ extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
311
+ private validateConfiguration;
312
+ private processResults;
313
+ /**
314
+ * Get text for a specific page (NOTE(review): whether pageNumber is 0- or 1-based, and how pageOffset affects it, is not stated — confirm)
315
+ */
316
+ getText(pdfPath: string, pageNumber: number, options?: ExtractionOptions): Promise<string>;
317
+ /**
318
+ * Get images for a specific page
319
+ */
320
+ getImages(pdfPath: string, pageNumber: number, options?: ExtractionOptions): Promise<ImageItem[]>;
321
+ /**
322
+ * Get text items for a specific page
323
+ */
324
+ getTextItems(pdfPath: string, pageNumber: number, options?: ExtractionOptions): Promise<TextItem[]>;
325
+ /**
326
+ * Get raw text for a specific page (no page markers, image refs, just clean text)
327
+ */
328
+ getRawText(pdfPath: string, pageNumber: number, options?: ExtractionOptions): Promise<string>;
329
+ /**
330
+ * Get complete page data (text + images + text items) as a PageExtractionResult
331
+ */
332
+ getPage(pdfPath: string, pageNumber: number, options?: ExtractionOptions): Promise<PageExtractionResult>;
333
+ /**
334
+ * Extract text for a specific page from full text
335
+ */
336
+ private extractPageText;
337
+ /**
338
+ * Count words in text
339
+ */
340
+ private countWords;
341
+ /**
342
+ * Extract raw text without page markers, image references, or formatting
343
+ */
344
+ private extractRawText;
345
+ /**
346
+ * Clear cache for a PDF
347
+ */
348
+ clearCache(pdfPath: string): void;
349
+ /**
350
+ * Get cache statistics (NOTE(review): unit of totalCacheSize is not stated — presumably bytes; confirm)
351
+ */
352
+ getCacheStats(): {
353
+ totalCachedPdfs: number;
354
+ totalCachedPages: number;
355
+ totalCacheSize: number;
356
+ cacheDir: string;
357
+ };
358
+ private reportProgress;
359
+ private createValidationError;
360
+ private createExtractionError;
361
+ }
362
+ declare const pdfExtractor: PDFExtractor; // Ready-made PDFExtractor instance; exported by name and on the default export
363
+
364
+ /**
365
+ * Text extraction from PDF files
366
+ *
367
+ * Handles text extraction using pdf-parse library with support for
368
+ * page-by-page extraction and metadata retrieval.
369
+ *
370
+ * @example
371
+ * ```typescript
372
+ * const textExtractor = new TextExtractor();
373
+ * const result = await textExtractor.extract('document.pdf');
374
+ * console.log(result.text);
375
+ * ```
376
+ */
377
+ declare class TextExtractor {
378
+ /**
379
+ * Extract text content from PDF
380
+ *
381
+ * @param pdfPath - Path to the PDF file
382
+ * @returns Promise resolving to extraction result with text and metadata (typed `any`; the class example above reads `result.text`)
383
+ * @throws {Error} When PDF extraction fails
384
+ */
385
+ extract(pdfPath: string): Promise<any>;
386
+ /**
387
+ * Extract text with page information
388
+ *
389
+ * @param pdfPath - Path to the PDF file
390
+ * @returns Promise resolving to extraction result with page-separated text (typed `any`, so the result shape is unchecked)
391
+ * @throws {Error} When PDF extraction fails
392
+ */
393
+ extractWithPages(pdfPath: string): Promise<any>;
394
+ /**
395
+ * Split text into approximate pages
396
+ */
397
+ private splitTextIntoPages;
398
+ /**
399
+ * Extract text items with position and metadata
400
+ */
401
+ extractTextItems(pdfPath: string, options?: ExtractionOptions): Promise<TextItem[]>;
402
+ /**
403
+ * Extract text statistics (NOTE(review): unit of readingTime is not stated — confirm)
404
+ */
405
+ extractStatistics(pdfPath: string): Promise<{
406
+ characterCount: number;
407
+ wordCount: number;
408
+ lineCount: number;
409
+ pageCount: number;
410
+ averageWordsPerPage: number;
411
+ readingTime: number;
412
+ }>;
413
+ /**
414
+ * Extract text with font information (requires PDF.js); result is typed `any`
415
+ */
416
+ extractWithFontInfo(pdfPath: string): Promise<any>;
417
+ /**
418
+ * Clean extracted text
419
+ */
420
+ cleanText(text: string): string;
421
+ /**
422
+ * Extract text from specific page range (NOTE(review): whether endPage is inclusive is not stated — confirm)
423
+ */
424
+ extractPageRange(pdfPath: string, startPage: number, endPage: number): Promise<string>;
425
+ /**
426
+ * Search for text in PDF; caseSensitive is optional (NOTE(review): its default is not stated — confirm)
427
+ */
428
+ searchText(pdfPath: string, searchTerm: string, caseSensitive?: boolean): Promise<{
429
+ found: boolean;
430
+ occurrences: number;
431
+ pages: number[];
432
+ context: string[];
433
+ }>;
434
+ /**
435
+ * Extract text with page markers; resolves to the marked-up text plus a PageData entry per page
436
+ */
437
+ extractWithPageMarkers(pdfPath: string, pageMarkerFormat?: string, options?: {
438
+ pageOffset?: number;
439
+ includeImageRefs?: boolean;
440
+ imageRefFormat?: string;
441
+ imageEngine?: ImageExtractionEngine;
442
+ }): Promise<{
443
+ text: string;
444
+ pages: PageData[];
445
+ }>;
446
+ /**
447
+ * Extract text with accurate page boundaries using pdf-lib + pdf-parse
448
+ */
449
+ extractWithAccuratePages(pdfPath: string): Promise<{
450
+ fullText: string;
451
+ pages: PageData[];
452
+ totalPages: number;
453
+ }>;
454
+ }
455
+
456
+ /**
457
+ * Image extraction from PDF files using pdf-lib (clean implementation based on NestJS)
458
+ *
459
+ * Supports multiple extraction engines including pdf-lib and poppler for
460
+ * maximum compatibility and performance. Can extract image metadata,
461
+ * save image files, and handle various image formats.
462
+ *
463
+ * @example
464
+ * ```typescript
465
+ * const imageExtractor = new ImageExtractor();
466
+ * const result = await imageExtractor.extract('document.pdf', {
467
+ * extractImageFiles: true,
468
+ * imageOutputDir: './images',
469
+ * imageEngine: 'auto'
470
+ * });
471
+ * ```
472
+ */
473
+ declare class ImageExtractor {
474
+ /**
475
+ * Extract images from PDF file using configurable engines
476
+ *
477
+ * @param pdfPath - Path to the PDF file
478
+ * @param options - Extraction options including engine selection and output settings
479
+ * @returns Promise resolving to extraction result with image metadata (typed `any`, so the result shape is unchecked)
480
+ * @throws {Error} When image extraction fails
481
+ */
482
+ extract(pdfPath: string, options?: ExtractionOptions): Promise<any>;
483
+ /**
484
+ * Get available image extraction engines; each entry reports its availability and capabilities
485
+ */
486
+ static getAvailableEngines(): Promise<{
487
+ name: string;
488
+ description: string;
489
+ available: boolean;
490
+ capabilities: {
491
+ formats: string[];
492
+ supportsMetadata: boolean;
493
+ supportsEmbeddedImages: boolean;
494
+ supportsVectorImages: boolean;
495
+ };
496
+ }[]>;
497
+ /**
498
+ * Get engine recommendations; each entry names either "pdf-lib" or "poppler" (never "auto") and is returned synchronously
499
+ */
500
+ static getEngineRecommendations(): ({
501
+ useCase: string;
502
+ engine: "pdf-lib";
503
+ reason: string;
504
+ } | {
505
+ useCase: string;
506
+ engine: "poppler";
507
+ reason: string;
508
+ })[];
509
+ /**
510
+ * Extract images using pdf-lib (based on working NestJS implementation)
511
+ * @deprecated Use extract() with imageEngine: 'pdf-lib' instead
512
+ */
513
+ extractWithPdfLib(pdfPath: string, options?: ExtractionOptions): Promise<any>;
514
+ /**
515
+ * Extract a single image from a PDF object using the working approach
516
+ */
517
+ private extractImageFromPdfObject;
518
+ /**
519
+ * Extract image data with proper decompression handling using actual PDF metadata
520
+ */
521
+ private extractImageData;
522
+ /**
523
+ * Detect image format from binary data (from NestJS implementation)
524
+ */
525
+ private detectImageFormat;
526
+ /**
527
+ * Create a PNG file from raw pixel data using actual PDF metadata
528
+ */
529
+ private createPngFromPdfMetadata;
530
+ }
531
+
532
+ /**
533
+ * Handles formatting of image references and text processing
534
+ */
535
+ declare class FormatProcessor {
536
+ /**
537
+ * Generate text with image references inserted
538
+ */
539
+ generateTextWithImageRefs(text: string, images: ImageItem[], format: string, totalPages: number): string;
540
+ /**
541
+ * Generate image-only reference list
542
+ */
543
+ generateImageOnlyRefs(images: ImageItem[], format: string): string;
544
+ /**
545
+ * Format a single image reference (NOTE(review): globalIndex is presumably the image's index across the whole document — confirm)
546
+ */
547
+ formatImageReference(image: ImageItem, format: string, globalIndex: number): string;
548
+ /**
549
+ * Replace placeholders in format string
550
+ */
551
+ private replacePlaceholders;
552
+ /**
553
+ * Extract placeholders from format string (NOTE(review): presumably the names listed in FormatPlaceholder — confirm)
554
+ */
555
+ extractPlaceholders(format: string): string[];
556
+ /**
557
+ * Validate format string; returns a boolean rather than a list of ValidationError entries
558
+ */
559
+ isValidFormat(format: string): boolean;
560
+ /**
561
+ * Get default format based on options
562
+ */
563
+ getDefaultFormat(useImagePaths?: boolean): string;
564
+ /**
565
+ * Clean text by removing image references
566
+ */
567
+ cleanTextFromImageRefs(textWithRefs: string, format: string): string;
568
+ /**
569
+ * Count image references in text
570
+ */
571
+ countImageReferences(text: string, format: string): number;
572
+ /**
573
+ * Generate summary text from the given document totals; processingTime is optional
574
+ */
575
+ generateSummary(totalPages: number, totalTextItems: number, totalImages: number, totalTextLength: number, processingTime?: number): string;
576
+ /**
577
+ * Format a file size given in bytes
578
+ */
579
+ formatFileSize(bytes: number): string;
580
+ /**
581
+ * Format a duration given in milliseconds
582
+ */
583
+ formatDuration(milliseconds: number): string;
584
+ }
585
+
586
+ /**
587
+ * Validate extractor configuration; returns the problems found as ValidationError entries (NOTE(review): presumably an empty array when valid — confirm)
588
+ */
589
+ declare function validateConfig(config: ExtractorConfig): ValidationError[];
590
+ /**
591
+ * Validate image reference format; returns the problems found as ValidationError entries (NOTE(review): presumably an empty array when valid — confirm)
592
+ */
593
+ declare function validateImageRefFormat(format: string): ValidationError[];
594
+ /**
595
+ * Validate file path, optionally against a list of extensions; returns the problems found as ValidationError entries
596
+ */
597
+ declare function validateFilePath(filePath: string, extensions?: string[]): ValidationError[];
598
+
599
+ /**
600
+ * Extract content from a PDF file (convenience function)
601
+ *
602
+ * @param pdfPath - Path to the PDF file
603
+ * @param options - Extraction options
604
+ * @returns Promise resolving to extraction result
605
+ *
606
+ * @example
607
+ * ```typescript
608
+ * import { extractPdfContent } from 'pdf-plus';
609
+ *
610
+ * const result = await extractPdfContent('document.pdf', {
611
+ * extractText: true,
612
+ * extractImages: true,
613
+ * verbose: true
614
+ * });
615
+ *
616
+ * console.log(`Extracted ${result.images.length} images from ${result.document.pages} pages`);
617
+ * ```
618
+ */
619
+ declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult>;
620
+ /**
621
+ * Extract only text content from a PDF (convenience function)
622
+ *
623
+ * @param pdfPath - Path to the PDF file
624
+ * @param options - Extraction options
625
+ * @returns Promise resolving to text content
626
+ *
627
+ * @example
628
+ * ```typescript
629
+ * import { extractText } from 'pdf-plus';
630
+ *
631
+ * const text = await extractText('document.pdf');
632
+ * console.log(`Extracted ${text.length} characters`);
633
+ * ```
634
+ */
635
+ declare function extractText(pdfPath: string, options?: Partial<ExtractionOptions>): Promise<string>;
636
+ /**
637
+ * Extract only image references from a PDF (convenience function)
638
+ *
639
+ * @param pdfPath - Path to the PDF file
640
+ * @param options - Extraction options
641
+ * @returns Promise resolving to array of image items
642
+ *
643
+ * @example
644
+ * ```typescript
645
+ * import { extractImages } from 'pdf-plus';
646
+ *
647
+ * const images = await extractImages('document.pdf', {
648
+ * extractImageFiles: true,
649
+ * imageOutputDir: './my-images'
650
+ * });
651
+ *
652
+ * console.log(`Extracted ${images.length} images`);
653
+ * ```
654
+ */
655
+ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOptions>): Promise<ImageItem[]>;
656
+ /**
657
+ * Extract and save image files from a PDF (convenience function)
658
+ *
659
+ * @param pdfPath - Path to the PDF file
660
+ * @param outputDir - Directory to save images (optional; NOTE(review): the fallback location is not stated — confirm)
661
+ * @param options - Extraction options
662
+ * @returns Promise resolving to array of saved file paths
663
+ *
664
+ * @example
665
+ * ```typescript
666
+ * import { extractImageFiles } from 'pdf-plus';
667
+ *
668
+ * const filePaths = await extractImageFiles('document.pdf', './images', {
669
+ * verbose: true
670
+ * });
671
+ *
672
+ * console.log(`Saved ${filePaths.length} image files`);
673
+ * ```
674
+ */
675
+ declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
676
+ declare const version = "1.0.0"; // Package version, typed as the string literal "1.0.0"
677
+
678
+ declare const _default: { // Default export: the classes, the shared pdfExtractor instance, the convenience and validation functions, and version
679
+ PDFExtractor: typeof PDFExtractor;
680
+ pdfExtractor: PDFExtractor;
681
+ TextExtractor: typeof TextExtractor;
682
+ ImageExtractor: typeof ImageExtractor;
683
+ FormatProcessor: typeof FormatProcessor;
684
+ extractPdfContent: typeof extractPdfContent;
685
+ extractText: typeof extractText;
686
+ extractImages: typeof extractImages;
687
+ extractImageFiles: typeof extractImageFiles;
688
+ validateConfig: typeof validateConfig;
689
+ validateImageRefFormat: typeof validateImageRefFormat;
690
+ validateFilePath: typeof validateFilePath;
691
+ version: string; // typed `string` here, wider than the "1.0.0" literal type of the named `version` export
692
+ };
693
+
694
+ export { type AnalyticsData, type DocumentMetadata, type DocumentSummary, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageExtractionEngine, ImageExtractor, type ImageItem, type MemoryUsage, type OCROptions, PDFExtractor, type PageData, type PageExtractionResult, type PageImageData, type PageInfo, type Position, type ProcessingPhase, type ProgressInfo, type StreamingOptions, type StructuredPageData, type TemplateOptions, TextExtractor, type TextItem, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version }; // PageData, PageExtractionResult, PageImageData and StructuredPageData are exported as types because exported APIs (ExtractionResult.structuredData, PDFExtractor.getPage, TextExtractor.extractWithPageMarkers) expose them and consumers need to be able to name them