pdf-plus 1.2.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +96 -25
- package/dist/index.d.mts +1009 -115
- package/dist/index.d.ts +1009 -115
- package/dist/index.js +39 -38
- package/dist/index.mjs +39 -38
- package/dist/workers/jp2-converter.worker.js +1 -1
- package/dist/workers/jp2-converter.worker.mjs +1 -1
- package/package.json +2 -6
package/dist/index.d.mts
CHANGED
|
@@ -1,3 +1,128 @@
|
|
|
1
|
+
import * as pdfjs_dist_legacy_build_pdf_mjs from 'pdfjs-dist/legacy/build/pdf.mjs';
|
|
2
|
+
import { PDFDocumentProxy, PDFPageProxy } from 'pdfjs-dist/legacy/build/pdf.mjs';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Table Extraction Types
|
|
6
|
+
*
|
|
7
|
+
* Type definitions for automatic table detection and extraction from PDFs.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* A detected table in the PDF
|
|
12
|
+
*/
|
|
13
|
+
interface Table {
|
|
14
|
+
/** Unique identifier for the table */
|
|
15
|
+
id: string;
|
|
16
|
+
/** Page number where the table is located (1-based) */
|
|
17
|
+
page: number;
|
|
18
|
+
/** Bounding box position of the table */
|
|
19
|
+
position: Position;
|
|
20
|
+
/** Array of rows in the table */
|
|
21
|
+
rows: TableRow[];
|
|
22
|
+
/** Array of column definitions */
|
|
23
|
+
columns: TableColumn[];
|
|
24
|
+
/** Confidence score for table detection (0-1) */
|
|
25
|
+
confidence: number;
|
|
26
|
+
/** Whether the table has a detected header row */
|
|
27
|
+
hasHeader: boolean;
|
|
28
|
+
/** Number of rows in the table */
|
|
29
|
+
rowCount: number;
|
|
30
|
+
/** Number of columns in the table */
|
|
31
|
+
columnCount: number;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* A row in a table
|
|
35
|
+
*/
|
|
36
|
+
interface TableRow {
|
|
37
|
+
/** Row index (0-based) */
|
|
38
|
+
index: number;
|
|
39
|
+
/** Y position of the row */
|
|
40
|
+
y: number;
|
|
41
|
+
/** Height of the row */
|
|
42
|
+
height: number;
|
|
43
|
+
/** Cells in this row */
|
|
44
|
+
cells: TableCell[];
|
|
45
|
+
/** Whether this row is a header row */
|
|
46
|
+
isHeader: boolean;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* A column in a table
|
|
50
|
+
*/
|
|
51
|
+
interface TableColumn {
|
|
52
|
+
/** Column index (0-based) */
|
|
53
|
+
index: number;
|
|
54
|
+
/** X position of the column */
|
|
55
|
+
x: number;
|
|
56
|
+
/** Width of the column */
|
|
57
|
+
width: number;
|
|
58
|
+
/** Column header text (if detected) */
|
|
59
|
+
header?: string;
|
|
60
|
+
}
|
|
61
|
+
/**
|
|
62
|
+
* A cell in a table
|
|
63
|
+
*/
|
|
64
|
+
interface TableCell {
|
|
65
|
+
/** Row index (0-based) */
|
|
66
|
+
row: number;
|
|
67
|
+
/** Column index (0-based) */
|
|
68
|
+
column: number;
|
|
69
|
+
/** Text content of the cell */
|
|
70
|
+
content: string;
|
|
71
|
+
/** Position of the cell */
|
|
72
|
+
position: Position;
|
|
73
|
+
/** Whether this cell is in a header row */
|
|
74
|
+
isHeader: boolean;
|
|
75
|
+
/** Row span (for merged cells, default: 1) */
|
|
76
|
+
rowSpan?: number;
|
|
77
|
+
/** Column span (for merged cells, default: 1) */
|
|
78
|
+
colSpan?: number;
|
|
79
|
+
}
|
|
80
|
+
/**
|
|
81
|
+
* Options for table extraction
|
|
82
|
+
*/
|
|
83
|
+
interface TableExtractionOptions {
|
|
84
|
+
/** Specific pages to extract tables from (1-based). If not specified, all pages are processed. */
|
|
85
|
+
pages?: number[];
|
|
86
|
+
/** Y-position tolerance for grouping text items into rows (default: 3) */
|
|
87
|
+
rowTolerance?: number;
|
|
88
|
+
/** X-position tolerance for grouping text items into columns (default: 5) */
|
|
89
|
+
columnTolerance?: number;
|
|
90
|
+
/** Minimum number of columns to consider a valid table (default: 2) */
|
|
91
|
+
minColumns?: number;
|
|
92
|
+
/** Minimum number of rows to consider a valid table (default: 2) */
|
|
93
|
+
minRows?: number;
|
|
94
|
+
/** Minimum grid density (filled cells / total cells) to consider valid (default: 0.6) */
|
|
95
|
+
minGridDensity?: number;
|
|
96
|
+
/** Whether to detect header rows (default: true) */
|
|
97
|
+
detectHeaders?: boolean;
|
|
98
|
+
/** Enable verbose logging (default: false) */
|
|
99
|
+
verbose?: boolean;
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* Result of table extraction
|
|
103
|
+
*/
|
|
104
|
+
interface TableExtractionResult {
|
|
105
|
+
/** Array of detected tables */
|
|
106
|
+
tables: Table[];
|
|
107
|
+
/** Total number of pages processed */
|
|
108
|
+
pagesProcessed: number;
|
|
109
|
+
/** Total number of tables found */
|
|
110
|
+
tableCount: number;
|
|
111
|
+
/** Extraction metadata */
|
|
112
|
+
metadata: TableExtractionMetadata;
|
|
113
|
+
}
|
|
114
|
+
/**
|
|
115
|
+
* Metadata about the extraction process
|
|
116
|
+
*/
|
|
117
|
+
interface TableExtractionMetadata {
|
|
118
|
+
/** Time taken to extract tables (in milliseconds) */
|
|
119
|
+
extractionTimeMs: number;
|
|
120
|
+
/** Options used for extraction */
|
|
121
|
+
options: TableExtractionOptions;
|
|
122
|
+
/** Number of candidate tables that were filtered out */
|
|
123
|
+
filteredCandidates: number;
|
|
124
|
+
}
|
|
125
|
+
|
|
1
126
|
/**
|
|
2
127
|
* Types for streaming PDF extraction
|
|
3
128
|
*/
|
|
@@ -30,7 +155,7 @@ interface PageEvent extends StreamEvent {
|
|
|
30
155
|
totalPages: number;
|
|
31
156
|
textLength: number;
|
|
32
157
|
imageCount: number;
|
|
33
|
-
pageInfo?: PageInfo;
|
|
158
|
+
pageInfo?: PageInfo$1;
|
|
34
159
|
}
|
|
35
160
|
/**
|
|
36
161
|
* Image event - emitted when an image is extracted
|
|
@@ -240,7 +365,7 @@ interface ImageItem {
|
|
|
240
365
|
filePath?: string;
|
|
241
366
|
data?: Uint8Array;
|
|
242
367
|
}
|
|
243
|
-
interface PageInfo {
|
|
368
|
+
interface PageInfo$1 {
|
|
244
369
|
number: number;
|
|
245
370
|
width: number;
|
|
246
371
|
height: number;
|
|
@@ -259,7 +384,7 @@ interface DocumentMetadata {
|
|
|
259
384
|
}
|
|
260
385
|
interface ExtractionResult {
|
|
261
386
|
document: DocumentMetadata;
|
|
262
|
-
pages: PageInfo[];
|
|
387
|
+
pages: PageInfo$1[];
|
|
263
388
|
images: ImageItem[];
|
|
264
389
|
textItems: TextItem[];
|
|
265
390
|
text: string;
|
|
@@ -267,7 +392,10 @@ interface ExtractionResult {
|
|
|
267
392
|
cleanText: string;
|
|
268
393
|
summary?: DocumentSummary;
|
|
269
394
|
structuredData?: StructuredPageData;
|
|
395
|
+
/** Detected tables (only populated when extractTables: true) */
|
|
396
|
+
tables?: Table[];
|
|
270
397
|
}
|
|
398
|
+
|
|
271
399
|
interface DocumentSummary {
|
|
272
400
|
totalPages: number;
|
|
273
401
|
totalTextItems: number;
|
|
@@ -378,39 +506,6 @@ interface ExtractionOptions {
|
|
|
378
506
|
* (default: false - convert to JPG)
|
|
379
507
|
*/
|
|
380
508
|
preserveJp2?: boolean;
|
|
381
|
-
/**
|
|
382
|
-
* Use Sharp library for ALL image processing operations (better quality & performance).
|
|
383
|
-
*
|
|
384
|
-
* When enabled, Sharp is used as the global image processing engine for:
|
|
385
|
-
* - JP2 to JPG conversion
|
|
386
|
-
* - Image optimization
|
|
387
|
-
* - Image resizing
|
|
388
|
-
* - Format conversions
|
|
389
|
-
*
|
|
390
|
-
* Sharp is an OPTIONAL dependency. Install it for better performance:
|
|
391
|
-
* ```bash
|
|
392
|
-
* npm install sharp
|
|
393
|
-
* ```
|
|
394
|
-
*
|
|
395
|
-
* If Sharp is not installed, the library will automatically fall back to pure JavaScript (Jimp).
|
|
396
|
-
*
|
|
397
|
-
* (default: false - use pure JS Jimp)
|
|
398
|
-
*/
|
|
399
|
-
useSharp?: boolean;
|
|
400
|
-
/**
|
|
401
|
-
* Use Poppler's pdfimages as fallback when standard extraction finds no images.
|
|
402
|
-
* Poppler can extract images that are embedded in non-standard ways (Form XObjects, inline images, etc.)
|
|
403
|
-
* that the standard XObject-based extraction might miss.
|
|
404
|
-
*
|
|
405
|
-
* Requires poppler-utils to be installed on the system.
|
|
406
|
-
*
|
|
407
|
-
* Installation:
|
|
408
|
-
* - Linux: sudo apt-get install poppler-utils
|
|
409
|
-
* - macOS: brew install poppler
|
|
410
|
-
*
|
|
411
|
-
* (default: false)
|
|
412
|
-
*/
|
|
413
|
-
usePopplerFallback?: boolean;
|
|
414
509
|
/** Enable parallel processing for better performance (default: true) */
|
|
415
510
|
parallelProcessing?: boolean;
|
|
416
511
|
/** Maximum number of pages to process in parallel (default: 10) */
|
|
@@ -480,15 +575,25 @@ interface ExtractionOptions {
|
|
|
480
575
|
/** Quality for JPG page images (default: 90) */
|
|
481
576
|
pageImageQuality?: number;
|
|
482
577
|
/**
|
|
483
|
-
* Page rendering engine
|
|
484
|
-
*
|
|
485
|
-
*
|
|
578
|
+
* Page rendering engine (default: 'pdfjs')
|
|
579
|
+
*
|
|
580
|
+
* Note: Poppler support has been removed. Only 'pdfjs' is now supported.
|
|
581
|
+
* This option is kept for backwards compatibility but is ignored.
|
|
582
|
+
*
|
|
583
|
+
* @deprecated Poppler support removed - pdfjs is now the only engine
|
|
486
584
|
*/
|
|
487
|
-
pageRenderEngine?: "pdfjs"
|
|
585
|
+
pageRenderEngine?: "pdfjs";
|
|
488
586
|
/** Thumbnail width (default: 200) */
|
|
489
587
|
thumbnailWidth?: number;
|
|
490
588
|
/** Thumbnail quality for JPG (default: 80) */
|
|
491
589
|
thumbnailQuality?: number;
|
|
590
|
+
/**
|
|
591
|
+
* Enable table extraction (default: false)
|
|
592
|
+
* When enabled, tables will be detected and included in the result
|
|
593
|
+
*/
|
|
594
|
+
extractTables?: boolean;
|
|
595
|
+
/** Options for table extraction */
|
|
596
|
+
tableOptions?: TableExtractionOptions;
|
|
492
597
|
}
|
|
493
598
|
interface ProgressInfo {
|
|
494
599
|
currentPage: number;
|
|
@@ -593,7 +698,6 @@ declare class PDFExtractor {
|
|
|
593
698
|
private textExtractor;
|
|
594
699
|
private imageExtractor;
|
|
595
700
|
private pageToImageConverter;
|
|
596
|
-
private popplerConverter;
|
|
597
701
|
private formatProcessor;
|
|
598
702
|
private structuredDataGenerator;
|
|
599
703
|
private cacheManager;
|
|
@@ -660,6 +764,29 @@ declare class PDFExtractor {
|
|
|
660
764
|
* Extract and save image files
|
|
661
765
|
*/
|
|
662
766
|
extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
767
|
+
/**
|
|
768
|
+
* Generate page images (render PDF pages to image files)
|
|
769
|
+
*
|
|
770
|
+
* This is a simplified method to only render PDF pages to images
|
|
771
|
+
* without extracting embedded images or text.
|
|
772
|
+
*
|
|
773
|
+
* @param pdfPath - Path to the PDF file
|
|
774
|
+
* @param outputDir - Directory to save page images
|
|
775
|
+
* @param options - Optional configuration (pageImageFormat, pageImageDpi, pageRenderEngine, etc.)
|
|
776
|
+
* @returns Promise resolving to array of generated image file paths
|
|
777
|
+
*
|
|
778
|
+
* @example
|
|
779
|
+
* ```typescript
|
|
780
|
+
* const extractor = new PDFExtractor();
|
|
781
|
+
* const imagePaths = await extractor.generatePageImages('document.pdf', './page-images', {
|
|
782
|
+
* pageImageFormat: 'jpg',
|
|
783
|
+
* pageImageDpi: 150,
|
|
784
|
+
* pageRenderEngine: 'pdfjs'
|
|
785
|
+
* });
|
|
786
|
+
* console.log(`Generated ${imagePaths.length} page images`);
|
|
787
|
+
* ```
|
|
788
|
+
*/
|
|
789
|
+
generatePageImages(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
663
790
|
private validateConfiguration;
|
|
664
791
|
private processResults;
|
|
665
792
|
/**
|
|
@@ -795,13 +922,8 @@ declare class StreamingPDFExtractor implements StreamingExtractionResult {
|
|
|
795
922
|
* ```
|
|
796
923
|
*/
|
|
797
924
|
declare class TextExtractor {
|
|
798
|
-
constructor();
|
|
799
925
|
/**
|
|
800
|
-
*
|
|
801
|
-
*/
|
|
802
|
-
private initializePdfjs;
|
|
803
|
-
/**
|
|
804
|
-
* Load PDF document
|
|
926
|
+
* Load PDF document using internal pdf utils
|
|
805
927
|
*/
|
|
806
928
|
private loadDocument;
|
|
807
929
|
/**
|
|
@@ -913,11 +1035,6 @@ declare class StructuredTextExtractor {
|
|
|
913
1035
|
private pdfLibDoc;
|
|
914
1036
|
private pdfLibPages;
|
|
915
1037
|
private textData;
|
|
916
|
-
constructor();
|
|
917
|
-
/**
|
|
918
|
-
* Initialize pdf.js worker
|
|
919
|
-
*/
|
|
920
|
-
private initializePdfjs;
|
|
921
1038
|
/**
|
|
922
1039
|
* Process PDF with accurate page-by-page extraction
|
|
923
1040
|
*/
|
|
@@ -1051,9 +1168,10 @@ declare class ImageExtractor {
|
|
|
1051
1168
|
*/
|
|
1052
1169
|
private extractImageData;
|
|
1053
1170
|
/**
|
|
1054
|
-
* Detect image format from binary data
|
|
1171
|
+
* Detect image format from binary data
|
|
1172
|
+
* Uses centralized image format detection utility
|
|
1055
1173
|
*/
|
|
1056
|
-
private
|
|
1174
|
+
private detectImageFormatLocal;
|
|
1057
1175
|
/**
|
|
1058
1176
|
* Create a PNG file from raw pixel data using actual PDF metadata
|
|
1059
1177
|
*/
|
|
@@ -1275,23 +1393,12 @@ interface ThumbnailOptions extends SinglePageOptions {
|
|
|
1275
1393
|
* dpi: 150
|
|
1276
1394
|
* });
|
|
1277
1395
|
* ```
|
|
1396
|
+
*
|
|
1397
|
+
* NOTE: pdf.js does not support JPEG2000 (JP2) images by default.
|
|
1398
|
+
* Pages with JP2 images will have blank spaces where the images should be.
|
|
1399
|
+
* The embedded images are still extracted correctly via extractImages option.
|
|
1278
1400
|
*/
|
|
1279
1401
|
declare class PageToImageConverter {
|
|
1280
|
-
private pdfjs;
|
|
1281
|
-
/**
|
|
1282
|
-
* Get or load pdf.js module with proper worker configuration
|
|
1283
|
-
* Based on pdf-to-img library approach
|
|
1284
|
-
*
|
|
1285
|
-
* NOTE: pdf.js does not support JPEG2000 (JP2) images by default.
|
|
1286
|
-
* Pages with JP2 images will have blank spaces where the images should be.
|
|
1287
|
-
* The embedded images are still extracted correctly via extractImages option.
|
|
1288
|
-
*
|
|
1289
|
-
* For complete page rendering with JP2 support, consider using:
|
|
1290
|
-
* - Poppler (pdf-poppler npm package) - requires system dependency
|
|
1291
|
-
* - ImageMagick - requires system dependency
|
|
1292
|
-
* - Ghostscript - requires system dependency
|
|
1293
|
-
*/
|
|
1294
|
-
private getPdfjs;
|
|
1295
1402
|
/**
|
|
1296
1403
|
* Convert all pages of a PDF to images
|
|
1297
1404
|
*
|
|
@@ -1340,12 +1447,13 @@ declare class PageToImageConverter {
|
|
|
1340
1447
|
/**
|
|
1341
1448
|
* Render a PDF page to image buffer
|
|
1342
1449
|
*
|
|
1343
|
-
*
|
|
1344
|
-
* @see https://github.com/k-yle/pdf-to-img
|
|
1450
|
+
* Uses @napi-rs/canvas via custom canvas factory for high-performance rendering
|
|
1345
1451
|
*/
|
|
1346
1452
|
private renderPageToBuffer;
|
|
1347
1453
|
/**
|
|
1348
1454
|
* Convert canvas to image buffer
|
|
1455
|
+
*
|
|
1456
|
+
* Uses @napi-rs/canvas async encode() for JPEG/WebP quality control
|
|
1349
1457
|
*/
|
|
1350
1458
|
private canvasToBuffer;
|
|
1351
1459
|
/**
|
|
@@ -1367,47 +1475,67 @@ declare class PageToImageConverter {
|
|
|
1367
1475
|
}
|
|
1368
1476
|
|
|
1369
1477
|
/**
|
|
1370
|
-
*
|
|
1371
|
-
*
|
|
1372
|
-
* Uses Poppler's pdfToCairo for high-quality rendering with full JPEG2000 support.
|
|
1373
|
-
* Requires poppler-utils to be installed on the system.
|
|
1478
|
+
* Table Extractor
|
|
1374
1479
|
*
|
|
1375
|
-
*
|
|
1376
|
-
*
|
|
1377
|
-
* - macOS: brew install poppler
|
|
1378
|
-
* - Windows: Download from https://blog.alivate.com.au/poppler-windows/
|
|
1480
|
+
* Main class for detecting and extracting tables from PDF documents.
|
|
1481
|
+
* Uses text positioning data to identify table structures.
|
|
1379
1482
|
*/
|
|
1380
1483
|
|
|
1381
|
-
|
|
1382
|
-
|
|
1484
|
+
/**
|
|
1485
|
+
* TableExtractor class for detecting and extracting tables from PDFs
|
|
1486
|
+
*
|
|
1487
|
+
* @example
|
|
1488
|
+
* ```typescript
|
|
1489
|
+
* const extractor = new TableExtractor();
|
|
1490
|
+
* const result = await extractor.extract('document.pdf', {
|
|
1491
|
+
* detectHeaders: true,
|
|
1492
|
+
* minRows: 3
|
|
1493
|
+
* });
|
|
1494
|
+
*
|
|
1495
|
+
* for (const table of result.tables) {
|
|
1496
|
+
* console.log(extractor.tableToMarkdown(table));
|
|
1497
|
+
* }
|
|
1498
|
+
* ```
|
|
1499
|
+
*/
|
|
1500
|
+
declare class TableExtractor {
|
|
1501
|
+
/**
|
|
1502
|
+
* Extract tables from a PDF file
|
|
1503
|
+
*
|
|
1504
|
+
* @param pdfPath - Path to the PDF file
|
|
1505
|
+
* @param options - Extraction options
|
|
1506
|
+
* @returns Promise resolving to extraction result
|
|
1507
|
+
*/
|
|
1508
|
+
extract(pdfPath: string, options?: TableExtractionOptions): Promise<TableExtractionResult>;
|
|
1383
1509
|
/**
|
|
1384
|
-
*
|
|
1510
|
+
* Detects tables on a single page
|
|
1385
1511
|
*/
|
|
1386
|
-
private
|
|
1512
|
+
private detectTablesOnPage;
|
|
1387
1513
|
/**
|
|
1388
|
-
*
|
|
1389
|
-
*
|
|
1390
|
-
* @param pdfPath - Path to PDF file
|
|
1391
|
-
* @param options - Conversion options
|
|
1392
|
-
* @returns Conversion result with image paths
|
|
1514
|
+
* Builds a Table object from a validated candidate
|
|
1393
1515
|
*/
|
|
1394
|
-
|
|
1516
|
+
private buildTable;
|
|
1395
1517
|
/**
|
|
1396
|
-
*
|
|
1518
|
+
* Converts a table to a 2D array of strings
|
|
1397
1519
|
*/
|
|
1398
|
-
|
|
1520
|
+
tableToArray(table: Table, includeHeaders?: boolean): string[][];
|
|
1399
1521
|
/**
|
|
1400
|
-
*
|
|
1522
|
+
* Converts a table to CSV format
|
|
1401
1523
|
*/
|
|
1402
|
-
|
|
1524
|
+
tableToCSV(table: Table, delimiter?: string): string;
|
|
1403
1525
|
/**
|
|
1404
|
-
*
|
|
1526
|
+
* Converts a table to Markdown format
|
|
1405
1527
|
*/
|
|
1406
|
-
|
|
1528
|
+
tableToMarkdown(table: Table): string;
|
|
1407
1529
|
/**
|
|
1408
|
-
*
|
|
1530
|
+
* Converts a table to HTML format
|
|
1409
1531
|
*/
|
|
1410
|
-
|
|
1532
|
+
tableToHTML(table: Table, options?: {
|
|
1533
|
+
tableClass?: string;
|
|
1534
|
+
}): string;
|
|
1535
|
+
/**
|
|
1536
|
+
* Converts a table to an array of objects (using headers as keys)
|
|
1537
|
+
*/
|
|
1538
|
+
tableToObjects(table: Table): Array<Record<string, string>>;
|
|
1411
1539
|
}
|
|
1412
1540
|
|
|
1413
1541
|
/**
|
|
@@ -1419,7 +1547,7 @@ interface OptimizationResult {
|
|
|
1419
1547
|
optimizedSize: number;
|
|
1420
1548
|
savedBytes: number;
|
|
1421
1549
|
savedPercent: number;
|
|
1422
|
-
engine: "
|
|
1550
|
+
engine: "canvas" | "none";
|
|
1423
1551
|
error?: string;
|
|
1424
1552
|
}
|
|
1425
1553
|
/**
|
|
@@ -1428,19 +1556,17 @@ interface OptimizationResult {
|
|
|
1428
1556
|
interface OptimizationOptions {
|
|
1429
1557
|
quality?: number;
|
|
1430
1558
|
verbose?: boolean;
|
|
1431
|
-
useSharp?: boolean;
|
|
1432
1559
|
}
|
|
1433
1560
|
/**
|
|
1434
|
-
* Image optimizer using
|
|
1561
|
+
* Image optimizer using @napi-rs/canvas
|
|
1435
1562
|
*
|
|
1436
|
-
* This class provides image optimization capabilities using
|
|
1437
|
-
*
|
|
1563
|
+
* This class provides image optimization capabilities using @napi-rs/canvas,
|
|
1564
|
+
* a high-performance Skia-based canvas library. It supports JPEG, PNG, and WebP
|
|
1438
1565
|
* optimization with quality control.
|
|
1439
1566
|
*
|
|
1440
1567
|
* @example
|
|
1441
1568
|
* ```typescript
|
|
1442
1569
|
* const result = await ImageOptimizer.optimizeFile('image.jpg', {
|
|
1443
|
-
* engine: 'auto',
|
|
1444
1570
|
* quality: 80
|
|
1445
1571
|
* });
|
|
1446
1572
|
*
|
|
@@ -1460,22 +1586,16 @@ declare class ImageOptimizer {
|
|
|
1460
1586
|
*/
|
|
1461
1587
|
static optimizeFile(filePath: string, options?: OptimizationOptions): Promise<OptimizationResult>;
|
|
1462
1588
|
/**
|
|
1463
|
-
* Optimize using
|
|
1464
|
-
*/
|
|
1465
|
-
private static optimizeWithSharp;
|
|
1466
|
-
/**
|
|
1467
|
-
* Optimize using Jimp (pure JavaScript)
|
|
1589
|
+
* Optimize using @napi-rs/canvas (Skia-based)
|
|
1468
1590
|
*/
|
|
1469
|
-
private static
|
|
1591
|
+
private static optimizeWithCanvas;
|
|
1470
1592
|
/**
|
|
1471
1593
|
* Convert JPEG 2000 formats (jp2, jpx, j2c, jpm) to JPG
|
|
1472
1594
|
*
|
|
1473
1595
|
* JPEG 2000 files are not widely supported by browsers and image tools.
|
|
1474
1596
|
* This method converts them to standard JPG format for better compatibility.
|
|
1475
1597
|
*
|
|
1476
|
-
*
|
|
1477
|
-
* - Jimp (default): Pure JavaScript, works everywhere
|
|
1478
|
-
* - Sharp (optional): Better color preservation, requires native compilation
|
|
1598
|
+
* Uses @napi-rs/canvas with OpenJPEG WASM decoder for high-performance conversion.
|
|
1479
1599
|
*
|
|
1480
1600
|
* @param jp2Path - Path to the JPEG 2000 file (jp2, jpx, j2c, or jpm)
|
|
1481
1601
|
* @param options - Conversion options
|
|
@@ -1484,7 +1604,6 @@ declare class ImageOptimizer {
|
|
|
1484
1604
|
static convertJp2ToJpg(jp2Path: string, options?: {
|
|
1485
1605
|
quality?: number;
|
|
1486
1606
|
verbose?: boolean;
|
|
1487
|
-
useSharp?: boolean;
|
|
1488
1607
|
}): Promise<{
|
|
1489
1608
|
success: boolean;
|
|
1490
1609
|
newPath?: string;
|
|
@@ -1548,6 +1667,716 @@ declare class FormatProcessor {
|
|
|
1548
1667
|
formatDuration(milliseconds: number): string;
|
|
1549
1668
|
}
|
|
1550
1669
|
|
|
1670
|
+
/**
|
|
1671
|
+
* Type definitions for the internal PDF utilities library
|
|
1672
|
+
*
|
|
1673
|
+
* Provides clean interfaces for PDF operations inspired by unpdf patterns.
|
|
1674
|
+
*/
|
|
1675
|
+
|
|
1676
|
+
/**
|
|
1677
|
+
* Source for loading a PDF - either a file path or raw bytes
|
|
1678
|
+
*/
|
|
1679
|
+
type PDFSource = string | Uint8Array | Buffer;
|
|
1680
|
+
/**
|
|
1681
|
+
* Input type for PDF operations - accepts either raw data or an already loaded document
|
|
1682
|
+
*/
|
|
1683
|
+
type PDFInput = PDFSource | PDFDocumentProxy;
|
|
1684
|
+
/**
|
|
1685
|
+
* Supported image formats for rendering
|
|
1686
|
+
*/
|
|
1687
|
+
type ImageFormat = "png" | "jpeg" | "webp";
|
|
1688
|
+
/**
|
|
1689
|
+
* Options for loading a PDF document
|
|
1690
|
+
*/
|
|
1691
|
+
interface PDFLoadOptions {
|
|
1692
|
+
/** Password for encrypted PDFs */
|
|
1693
|
+
password?: string;
|
|
1694
|
+
/** Verbosity level for pdfjs logging */
|
|
1695
|
+
verbosity?: number;
|
|
1696
|
+
}
|
|
1697
|
+
/**
|
|
1698
|
+
* Text item with full positioning information
|
|
1699
|
+
*/
|
|
1700
|
+
interface PDFTextItem {
|
|
1701
|
+
/** The text string */
|
|
1702
|
+
str: string;
|
|
1703
|
+
/** X position (from transform matrix) */
|
|
1704
|
+
x: number;
|
|
1705
|
+
/** Y position (from transform matrix) */
|
|
1706
|
+
y: number;
|
|
1707
|
+
/** Width of the text item */
|
|
1708
|
+
width: number;
|
|
1709
|
+
/** Height of the text item */
|
|
1710
|
+
height: number;
|
|
1711
|
+
/** Font name */
|
|
1712
|
+
fontName: string;
|
|
1713
|
+
/** Font size (derived from transform) */
|
|
1714
|
+
fontSize: number;
|
|
1715
|
+
/** Full transform matrix [a, b, c, d, e, f] */
|
|
1716
|
+
transform: number[];
|
|
1717
|
+
/** Whether this item ends with EOL */
|
|
1718
|
+
hasEOL: boolean;
|
|
1719
|
+
/** Text direction (ltr, rtl, ttb, or btt) */
|
|
1720
|
+
dir: "ltr" | "rtl" | "ttb" | "btt";
|
|
1721
|
+
}
|
|
1722
|
+
/**
|
|
1723
|
+
* Progress information for text extraction
|
|
1724
|
+
*/
|
|
1725
|
+
interface TextExtractionProgress {
|
|
1726
|
+
/** Number of pages processed so far */
|
|
1727
|
+
processedPages: number;
|
|
1728
|
+
/** Total number of pages to process */
|
|
1729
|
+
totalPages: number;
|
|
1730
|
+
/** Percentage complete (0-100) */
|
|
1731
|
+
percentage: number;
|
|
1732
|
+
/** Current page being processed (1-based) */
|
|
1733
|
+
currentPage?: number;
|
|
1734
|
+
}
|
|
1735
|
+
/**
|
|
1736
|
+
* Performance metadata for text extraction
|
|
1737
|
+
*/
|
|
1738
|
+
interface TextExtractionMeta {
|
|
1739
|
+
/** Duration in milliseconds */
|
|
1740
|
+
duration: number;
|
|
1741
|
+
/** Number of pages processed */
|
|
1742
|
+
pagesProcessed: number;
|
|
1743
|
+
/** Processing method used */
|
|
1744
|
+
method: "parallel" | "sequential" | "chunked";
|
|
1745
|
+
}
|
|
1746
|
+
/**
|
|
1747
|
+
* Options for text extraction
|
|
1748
|
+
*/
|
|
1749
|
+
interface TextExtractionOptions {
|
|
1750
|
+
/** First page to extract (1-based, default: 1) */
|
|
1751
|
+
firstPage?: number;
|
|
1752
|
+
/** Last page to extract (1-based, default: all pages) */
|
|
1753
|
+
lastPage?: number;
|
|
1754
|
+
/** Include marked content in extraction */
|
|
1755
|
+
includeMarkedContent?: boolean;
|
|
1756
|
+
/** Disable text normalization */
|
|
1757
|
+
disableNormalization?: boolean;
|
|
1758
|
+
/** Merge all pages into a single string (default: false) */
|
|
1759
|
+
mergePages?: boolean;
|
|
1760
|
+
/** Maximum concurrent page extractions (default: 10) */
|
|
1761
|
+
maxConcurrency?: number;
|
|
1762
|
+
/** Progress callback called after each page is processed */
|
|
1763
|
+
onProgress?: (progress: TextExtractionProgress) => void;
|
|
1764
|
+
/** Chunk size for processing very large PDFs (default: undefined = no chunking) */
|
|
1765
|
+
chunkSize?: number;
|
|
1766
|
+
/** Callback called after each chunk is processed (when chunkSize is set) */
|
|
1767
|
+
onChunkComplete?: (info: {
|
|
1768
|
+
chunkIndex: number;
|
|
1769
|
+
totalChunks: number;
|
|
1770
|
+
pagesProcessed: number;
|
|
1771
|
+
}) => void;
|
|
1772
|
+
}
|
|
1773
|
+
/**
|
|
1774
|
+
* Result of text extraction
|
|
1775
|
+
*/
|
|
1776
|
+
interface TextExtractionResult<T extends string | string[]> {
|
|
1777
|
+
/** Total number of pages in the document */
|
|
1778
|
+
totalPages: number;
|
|
1779
|
+
/** Extracted text - string[] when mergePages is false, string when true */
|
|
1780
|
+
text: T;
|
|
1781
|
+
/** Performance metadata (available when extraction completes) */
|
|
1782
|
+
_meta?: TextExtractionMeta;
|
|
1783
|
+
}
|
|
1784
|
+
/**
|
|
1785
|
+
* Result of text items extraction
|
|
1786
|
+
*/
|
|
1787
|
+
interface TextItemsExtractionResult {
|
|
1788
|
+
/** Total number of pages in the document */
|
|
1789
|
+
totalPages: number;
|
|
1790
|
+
/** Text items per page */
|
|
1791
|
+
items: PDFTextItem[][];
|
|
1792
|
+
/** Performance metadata (available when extraction completes) */
|
|
1793
|
+
_meta?: TextExtractionMeta;
|
|
1794
|
+
}
|
|
1795
|
+
/**
|
|
1796
|
+
* Options for metadata extraction
|
|
1797
|
+
*/
|
|
1798
|
+
interface MetadataOptions {
|
|
1799
|
+
/** Parse date strings (CreationDate, ModDate) into Date objects (default: false) */
|
|
1800
|
+
parseDates?: boolean;
|
|
1801
|
+
}
|
|
1802
|
+
/**
|
|
1803
|
+
* Result of link extraction
|
|
1804
|
+
*/
|
|
1805
|
+
interface LinkExtractionResult {
|
|
1806
|
+
/** Total number of pages in the document */
|
|
1807
|
+
totalPages: number;
|
|
1808
|
+
/** Extracted URLs from the document */
|
|
1809
|
+
links: string[];
|
|
1810
|
+
}
|
|
1811
|
+
/**
|
|
1812
|
+
* Options for page rendering
|
|
1813
|
+
*/
|
|
1814
|
+
interface RenderOptions {
|
|
1815
|
+
/** Scale factor (default: 1). Ignored if width or height is set. */
|
|
1816
|
+
scale?: number;
|
|
1817
|
+
/** DPI for rendering (default: 72, affects scale) */
|
|
1818
|
+
dpi?: number;
|
|
1819
|
+
/** Target width in pixels. Auto-calculates scale to fit. */
|
|
1820
|
+
width?: number;
|
|
1821
|
+
/** Target height in pixels. Auto-calculates scale to fit. */
|
|
1822
|
+
height?: number;
|
|
1823
|
+
/** Output format (default: 'png') */
|
|
1824
|
+
format?: ImageFormat;
|
|
1825
|
+
/** Quality for JPEG/WebP (0-100, default: 90) */
|
|
1826
|
+
quality?: number;
|
|
1827
|
+
/** Background color (default: '#FFFFFF') */
|
|
1828
|
+
backgroundColor?: string;
|
|
1829
|
+
/** Transparent background (default: false) */
|
|
1830
|
+
transparent?: boolean;
|
|
1831
|
+
}
|
|
1832
|
+
/**
|
|
1833
|
+
* Result of rendering a page
|
|
1834
|
+
*/
|
|
1835
|
+
interface RenderResult {
|
|
1836
|
+
/** Image buffer */
|
|
1837
|
+
buffer: Buffer;
|
|
1838
|
+
/** Image width in pixels */
|
|
1839
|
+
width: number;
|
|
1840
|
+
/** Image height in pixels */
|
|
1841
|
+
height: number;
|
|
1842
|
+
/** Output format */
|
|
1843
|
+
format: ImageFormat;
|
|
1844
|
+
}
|
|
1845
|
+
/**
|
|
1846
|
+
* Result of rendering a page as data URL
|
|
1847
|
+
*/
|
|
1848
|
+
interface RenderDataURLResult {
|
|
1849
|
+
/** Data URL string (e.g., "data:image/png;base64,...") */
|
|
1850
|
+
dataURL: string;
|
|
1851
|
+
/** Image width in pixels */
|
|
1852
|
+
width: number;
|
|
1853
|
+
/** Image height in pixels */
|
|
1854
|
+
height: number;
|
|
1855
|
+
/** Output format */
|
|
1856
|
+
format: ImageFormat;
|
|
1857
|
+
}
|
|
1858
|
+
/**
|
|
1859
|
+
* PDF document metadata
|
|
1860
|
+
*/
|
|
1861
|
+
interface PDFMetadata {
|
|
1862
|
+
/** Number of pages */
|
|
1863
|
+
numPages: number;
|
|
1864
|
+
/** PDF info dictionary */
|
|
1865
|
+
info: Record<string, unknown>;
|
|
1866
|
+
/** PDF metadata (XMP) */
|
|
1867
|
+
metadata: Record<string, unknown> | null;
|
|
1868
|
+
/** PDF format version */
|
|
1869
|
+
version: string;
|
|
1870
|
+
/** Whether the PDF is encrypted */
|
|
1871
|
+
isEncrypted: boolean;
|
|
1872
|
+
/** Whether the PDF is linearized (fast web view) */
|
|
1873
|
+
isLinearized: boolean;
|
|
1874
|
+
}
|
|
1875
|
+
/**
|
|
1876
|
+
* Page dimensions and properties
|
|
1877
|
+
*/
|
|
1878
|
+
interface PageInfo {
|
|
1879
|
+
/** Page number (1-based) */
|
|
1880
|
+
pageNumber: number;
|
|
1881
|
+
/** Page width in points */
|
|
1882
|
+
width: number;
|
|
1883
|
+
/** Page height in points */
|
|
1884
|
+
height: number;
|
|
1885
|
+
/** Page rotation in degrees */
|
|
1886
|
+
rotation: number;
|
|
1887
|
+
/** Viewport at scale 1 */
|
|
1888
|
+
viewport: {
|
|
1889
|
+
width: number;
|
|
1890
|
+
height: number;
|
|
1891
|
+
scale: number;
|
|
1892
|
+
};
|
|
1893
|
+
}
|
|
1894
|
+
|
|
1895
|
+
/**
|
|
1896
|
+
* True when running in a Node.js environment
|
|
1897
|
+
*/
|
|
1898
|
+
declare const isNode: boolean;
|
|
1899
|
+
/**
|
|
1900
|
+
* True when running in a browser environment
|
|
1901
|
+
*/
|
|
1902
|
+
declare const isBrowser: boolean;
|
|
1903
|
+
/**
|
|
1904
|
+
* Check if a value is a PDFDocumentProxy instance
|
|
1905
|
+
*
|
|
1906
|
+
* Uses internal pdfjs property for reliable detection.
|
|
1907
|
+
*
|
|
1908
|
+
* @param data - Value to check
|
|
1909
|
+
* @returns True if the value is a PDFDocumentProxy
|
|
1910
|
+
*
|
|
1911
|
+
* @example
|
|
1912
|
+
* ```typescript
|
|
1913
|
+
* if (isPDFDocumentProxy(input)) {
|
|
1914
|
+
* // input is typed as PDFDocumentProxy
|
|
1915
|
+
* console.log(input.numPages);
|
|
1916
|
+
* }
|
|
1917
|
+
* ```
|
|
1918
|
+
*/
|
|
1919
|
+
declare function isPDFDocumentProxy(data: unknown): data is PDFDocumentProxy;
|
|
1920
|
+
/**
|
|
1921
|
+
* Get the pdf.js module, initializing it lazily
|
|
1922
|
+
*
|
|
1923
|
+
* This ensures pdf.js is only loaded when needed and worker
|
|
1924
|
+
* configuration happens exactly once.
|
|
1925
|
+
*/
|
|
1926
|
+
declare function getPDFJS(): Promise<typeof pdfjs_dist_legacy_build_pdf_mjs>;
|
|
1927
|
+
/**
|
|
1928
|
+
* Get the pdf.js verbosity level enum
|
|
1929
|
+
*/
|
|
1930
|
+
declare function getVerbosityLevel(): Promise<typeof pdfjs_dist_legacy_build_pdf_mjs.VerbosityLevel>;
|
|
1931
|
+
/**
|
|
1932
|
+
* Load a PDF document from a file path or buffer
|
|
1933
|
+
*
|
|
1934
|
+
* Applies sensible defaults:
|
|
1935
|
+
* - `isEvalSupported: false` (security)
|
|
1936
|
+
* - `useSystemFonts: true` (better font rendering)
|
|
1937
|
+
*
|
|
1938
|
+
* @param source - File path string or Uint8Array/Buffer of PDF data
|
|
1939
|
+
* @param options - Loading options
|
|
1940
|
+
* @returns PDFDocumentProxy
|
|
1941
|
+
*
|
|
1942
|
+
* @example
|
|
1943
|
+
* ```typescript
|
|
1944
|
+
* // Load from file path
|
|
1945
|
+
* const doc = await loadPDF('document.pdf');
|
|
1946
|
+
*
|
|
1947
|
+
* // Load from buffer
|
|
1948
|
+
* const buffer = fs.readFileSync('document.pdf');
|
|
1949
|
+
* const docFromBuffer = await loadPDF(buffer);
|
|
1950
|
+
*
|
|
1951
|
+
* // With password
|
|
1952
|
+
* const encryptedDoc = await loadPDF('encrypted.pdf', { password: 'secret' });
|
|
1953
|
+
* ```
|
|
1954
|
+
*/
|
|
1955
|
+
declare function loadPDF(source: PDFSource, options?: PDFLoadOptions): Promise<PDFDocumentProxy>;
|
|
1956
|
+
/**
|
|
1957
|
+
* Get a PDFDocumentProxy from input (loads if necessary)
|
|
1958
|
+
*
|
|
1959
|
+
* This is a convenience function that handles both raw data and
|
|
1960
|
+
* already-loaded documents uniformly.
|
|
1961
|
+
*
|
|
1962
|
+
* @param input - PDF source or already loaded document
|
|
1963
|
+
* @param options - Loading options (only used if input is not already a document)
|
|
1964
|
+
* @returns PDFDocumentProxy
|
|
1965
|
+
*
|
|
1966
|
+
* @example
|
|
1967
|
+
* ```typescript
|
|
1968
|
+
* // Works with file path
|
|
1969
|
+
* const doc1 = await getDocumentProxy('document.pdf');
|
|
1970
|
+
*
|
|
1971
|
+
* // Works with already loaded document (returns as-is)
|
|
1972
|
+
* const doc2 = await getDocumentProxy(existingDoc);
|
|
1973
|
+
* ```
|
|
1974
|
+
*/
|
|
1975
|
+
declare function getDocumentProxy(input: PDFInput, options?: PDFLoadOptions): Promise<PDFDocumentProxy>;
|
|
1976
|
+
/**
|
|
1977
|
+
* Load a PDF and get the number of pages quickly
|
|
1978
|
+
*
|
|
1979
|
+
* Useful for determining if streaming should be enabled.
|
|
1980
|
+
*
|
|
1981
|
+
* @param source - File path or buffer
|
|
1982
|
+
* @returns Number of pages
|
|
1983
|
+
*/
|
|
1984
|
+
declare function getPageCount(source: PDFSource): Promise<number>;
|
|
1985
|
+
/**
|
|
1986
|
+
* Check if a file path or buffer appears to be a valid PDF
|
|
1987
|
+
*
|
|
1988
|
+
* @param source - File path or buffer
|
|
1989
|
+
* @returns True if the source appears to be a valid PDF
|
|
1990
|
+
*/
|
|
1991
|
+
declare function isPDF(source: PDFSource): Promise<boolean>;
|
|
1992
|
+
/**
|
|
1993
|
+
* Validate page number against document bounds
|
|
1994
|
+
*
|
|
1995
|
+
* @param pageNum - Page number to validate (1-based)
|
|
1996
|
+
* @param totalPages - Total pages in document
|
|
1997
|
+
* @throws Error if page number is invalid
|
|
1998
|
+
*/
|
|
1999
|
+
declare function validatePageNumber(pageNum: number, totalPages: number): void;
|
|
2000
|
+
|
|
2001
|
+
/**
|
|
2002
|
+
* PDF Text Extraction Utilities
|
|
2003
|
+
*
|
|
2004
|
+
* Provides text extraction with full positioning support.
|
|
2005
|
+
* Unlike unpdf, extracted text items include their positions.
|
|
2006
|
+
*/
|
|
2007
|
+
|
|
2008
|
+
/**
|
|
2009
|
+
* Extract text from all pages
|
|
2010
|
+
*
|
|
2011
|
+
* @param input - PDF document, file path, or buffer
|
|
2012
|
+
* @param options - Extraction options
|
|
2013
|
+
* @returns Object with totalPages and text (one string per page, or a single merged string when mergePages is true)
|
|
2014
|
+
*
|
|
2015
|
+
* @example
|
|
2016
|
+
* ```typescript
|
|
2017
|
+
* // Get text as array of pages
|
|
2018
|
+
* const result = await extractText('document.pdf');
|
|
2019
|
+
* console.log(`Page 1: ${result.text[0]}`);
|
|
2020
|
+
*
|
|
2021
|
+
* // Get text as single merged string
|
|
2022
|
+
* const merged = await extractText('document.pdf', { mergePages: true });
|
|
2023
|
+
* console.log(merged.text); // string
|
|
2024
|
+
* ```
|
|
2025
|
+
*/
|
|
2026
|
+
declare function extractText$1(input: PDFInput, options?: TextExtractionOptions & {
|
|
2027
|
+
mergePages?: false;
|
|
2028
|
+
}): Promise<TextExtractionResult<string[]>>;
|
|
2029
|
+
declare function extractText$1(input: PDFInput, options: TextExtractionOptions & {
|
|
2030
|
+
mergePages: true;
|
|
2031
|
+
}): Promise<TextExtractionResult<string>>;
|
|
2032
|
+
/**
|
|
2033
|
+
* Extract text with full positioning information
|
|
2034
|
+
*
|
|
2035
|
+
* This is the main value-add function - provides detailed text items
|
|
2036
|
+
* with x, y, width, height, font info, etc.
|
|
2037
|
+
*
|
|
2038
|
+
* @param input - PDF document, file path, or buffer
|
|
2039
|
+
* @param options - Extraction options
|
|
2040
|
+
* @returns Object with totalPages and items array per page
|
|
2041
|
+
*
|
|
2042
|
+
* @example
|
|
2043
|
+
* ```typescript
|
|
2044
|
+
* const result = await extractTextItems('document.pdf');
|
|
2045
|
+
* for (const item of result.items[0]) {
|
|
2046
|
+
* console.log(`"${item.str}" at (${item.x}, ${item.y})`);
|
|
2047
|
+
* }
|
|
2048
|
+
* ```
|
|
2049
|
+
*/
|
|
2050
|
+
declare function extractTextItems(input: PDFInput, options?: Omit<TextExtractionOptions, "mergePages">): Promise<TextItemsExtractionResult>;
|
|
2051
|
+
/**
|
|
2052
|
+
* Extract text from a single page
|
|
2053
|
+
*
|
|
2054
|
+
* @param input - PDF document, file path, or buffer
|
|
2055
|
+
* @param pageNum - Page number (1-based)
|
|
2056
|
+
* @param options - Extraction options
|
|
2057
|
+
* @returns Text string for the page
|
|
2058
|
+
*/
|
|
2059
|
+
declare function extractPageText(input: PDFInput, pageNum: number, options?: Omit<TextExtractionOptions, "firstPage" | "lastPage" | "mergePages">): Promise<string>;
|
|
2060
|
+
/**
|
|
2061
|
+
* Extract text items from a single page
|
|
2062
|
+
*
|
|
2063
|
+
* @param input - PDF document, file path, or buffer
|
|
2064
|
+
* @param pageNum - Page number (1-based)
|
|
2065
|
+
* @param options - Extraction options
|
|
2066
|
+
* @returns Array of text items
|
|
2067
|
+
*/
|
|
2068
|
+
declare function extractPageTextItems(input: PDFInput, pageNum: number, options?: Omit<TextExtractionOptions, "firstPage" | "lastPage" | "mergePages">): Promise<PDFTextItem[]>;
|
|
2069
|
+
/**
|
|
2070
|
+
* Extract all text as a single string
|
|
2071
|
+
*
|
|
2072
|
+
* @param input - PDF document, file path, or buffer
|
|
2073
|
+
* @param options - Extraction options
|
|
2074
|
+
* @param pageSeparator - String to join pages (default: "\n\n")
|
|
2075
|
+
* @returns Combined text from all pages
|
|
2076
|
+
*
|
|
2077
|
+
* @deprecated Use extractText with { mergePages: true } instead
|
|
2078
|
+
*/
|
|
2079
|
+
declare function extractFullText(input: PDFInput, options?: Omit<TextExtractionOptions, "mergePages">, pageSeparator?: string): Promise<string>;
|
|
2080
|
+
|
|
2081
|
+
/**
|
|
2082
|
+
* PDF Metadata Extraction Utilities
|
|
2083
|
+
*
|
|
2084
|
+
* Provides access to PDF document metadata.
|
|
2085
|
+
*/
|
|
2086
|
+
|
|
2087
|
+
/**
|
|
2088
|
+
* Extract metadata from a PDF document
|
|
2089
|
+
*
|
|
2090
|
+
* @param input - PDF document, file path, or buffer
|
|
2091
|
+
* @param options - Metadata extraction options
|
|
2092
|
+
* @returns PDF metadata
|
|
2093
|
+
*
|
|
2094
|
+
* @example
|
|
2095
|
+
* ```typescript
|
|
2096
|
+
* const meta = await getMetadata('document.pdf');
|
|
2097
|
+
* console.log(`${meta.numPages} pages, version ${meta.version}`);
|
|
2098
|
+
*
|
|
2099
|
+
* // With date parsing
|
|
2100
|
+
* const metaDates = await getMetadata('document.pdf', { parseDates: true });
|
|
2101
|
+
* if (metaDates.info.CreationDate instanceof Date) {
|
|
2102
|
+
* console.log('Created:', metaDates.info.CreationDate.toISOString());
|
|
2103
|
+
* }
|
|
2104
|
+
* ```
|
|
2105
|
+
*/
|
|
2106
|
+
declare function getMetadata(input: PDFInput, options?: MetadataOptions): Promise<PDFMetadata>;
|
|
2107
|
+
/**
|
|
2108
|
+
* Get information about a specific page
|
|
2109
|
+
*
|
|
2110
|
+
* @param input - PDF document, file path, or buffer
|
|
2111
|
+
* @param pageNum - Page number (1-based)
|
|
2112
|
+
* @returns Page information
|
|
2113
|
+
*/
|
|
2114
|
+
declare function getPageInfo(input: PDFInput, pageNum: number): Promise<PageInfo>;
|
|
2115
|
+
/**
|
|
2116
|
+
* Get information about all pages
|
|
2117
|
+
*
|
|
2118
|
+
* @param input - PDF document, file path, or buffer
|
|
2119
|
+
* @returns Array of page information
|
|
2120
|
+
*/
|
|
2121
|
+
declare function getAllPagesInfo(input: PDFInput): Promise<PageInfo[]>;
|
|
2122
|
+
|
|
2123
|
+
/**
|
|
2124
|
+
* PDF Link Extraction Utilities
|
|
2125
|
+
*
|
|
2126
|
+
* Extracts URLs from PDF annotations (hyperlinks).
|
|
2127
|
+
*/
|
|
2128
|
+
|
|
2129
|
+
/**
|
|
2130
|
+
* Extract all links (URLs) from a PDF document
|
|
2131
|
+
*
|
|
2132
|
+
* Extracts hyperlinks from PDF annotations across all pages.
|
|
2133
|
+
*
|
|
2134
|
+
* @param input - PDF document, file path, or buffer
|
|
2135
|
+
* @returns Object with totalPages and unique links array
|
|
2136
|
+
*
|
|
2137
|
+
* @example
|
|
2138
|
+
* ```typescript
|
|
2139
|
+
* const result = await extractLinks('document.pdf');
|
|
2140
|
+
* console.log(`Found ${result.links.length} links in ${result.totalPages} pages`);
|
|
2141
|
+
* for (const url of result.links) {
|
|
2142
|
+
* console.log(url);
|
|
2143
|
+
* }
|
|
2144
|
+
* ```
|
|
2145
|
+
*/
|
|
2146
|
+
declare function extractLinks(input: PDFInput): Promise<LinkExtractionResult>;
|
|
2147
|
+
|
|
2148
|
+
/**
|
|
2149
|
+
* PDF Page Rendering Utilities
|
|
2150
|
+
*
|
|
2151
|
+
* Renders PDF pages to images using @napi-rs/canvas.
|
|
2152
|
+
*/
|
|
2153
|
+
|
|
2154
|
+
/**
|
|
2155
|
+
* Render a PDF page to an image buffer
|
|
2156
|
+
*
|
|
2157
|
+
* @param input - PDF document, file path, or buffer
|
|
2158
|
+
* @param pageNum - Page number (1-based)
|
|
2159
|
+
* @param options - Render options
|
|
2160
|
+
* @returns Render result with buffer and dimensions
|
|
2161
|
+
*
|
|
2162
|
+
* @example
|
|
2163
|
+
* ```typescript
|
|
2164
|
+
* // Using scale
|
|
2165
|
+
* const result = await renderPage('document.pdf', 1, { scale: 2 });
|
|
2166
|
+
*
|
|
2167
|
+
* // Using target width (auto-calculates scale)
|
|
2168
|
+
* const byWidth = await renderPage('document.pdf', 1, { width: 800 });
|
|
2169
|
+
*
|
|
2170
|
+
* // Using target height (auto-calculates scale)
|
|
2171
|
+
* const byHeight = await renderPage('document.pdf', 1, { height: 600 });
|
|
2172
|
+
*
|
|
2173
|
+
* fs.writeFileSync('page1.png', result.buffer);
|
|
2174
|
+
* ```
|
|
2175
|
+
*/
|
|
2176
|
+
declare function renderPage(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<RenderResult>;
|
|
2177
|
+
/**
|
|
2178
|
+
* Render a PDF page directly to a data URL
|
|
2179
|
+
*
|
|
2180
|
+
* @param input - PDF document, file path, or buffer
|
|
2181
|
+
* @param pageNum - Page number (1-based)
|
|
2182
|
+
* @param options - Render options
|
|
2183
|
+
* @returns Render result with data URL and dimensions
|
|
2184
|
+
*
|
|
2185
|
+
* @example
|
|
2186
|
+
* ```typescript
|
|
2187
|
+
* const result = await renderPageAsDataURL('document.pdf', 1, { width: 800 });
|
|
2188
|
+
* // result.dataURL = "data:image/png;base64,..."
|
|
2189
|
+
* ```
|
|
2190
|
+
*/
|
|
2191
|
+
declare function renderPageAsDataURL(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<RenderDataURLResult>;
|
|
2192
|
+
/**
|
|
2193
|
+
* Render multiple pages to image buffers
|
|
2194
|
+
*
|
|
2195
|
+
* @param input - PDF document, file path, or buffer
|
|
2196
|
+
* @param pageNums - Array of page numbers (1-based), or undefined for all pages
|
|
2197
|
+
* @param options - Render options
|
|
2198
|
+
* @returns Array of render results
|
|
2199
|
+
*/
|
|
2200
|
+
declare function renderPages(input: PDFInput, pageNums?: number[], options?: RenderOptions): Promise<RenderResult[]>;
|
|
2201
|
+
/**
|
|
2202
|
+
* Render a page and return as base64 string
|
|
2203
|
+
*
|
|
2204
|
+
* @param input - PDF document, file path, or buffer
|
|
2205
|
+
* @param pageNum - Page number (1-based)
|
|
2206
|
+
* @param options - Render options
|
|
2207
|
+
* @returns Base64-encoded image string
|
|
2208
|
+
*/
|
|
2209
|
+
declare function renderPageToBase64(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<string>;
|
|
2210
|
+
/**
|
|
2211
|
+
* Render a page as a data URL (legacy function, use renderPageAsDataURL instead)
|
|
2212
|
+
*
|
|
2213
|
+
* @param input - PDF document, file path, or buffer
|
|
2214
|
+
* @param pageNum - Page number (1-based)
|
|
2215
|
+
* @param options - Render options
|
|
2216
|
+
* @returns Data URL string
|
|
2217
|
+
*
|
|
2218
|
+
* @deprecated Use renderPageAsDataURL which returns more info
|
|
2219
|
+
*/
|
|
2220
|
+
declare function renderPageToDataURL(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<string>;
|
|
2221
|
+
|
|
2222
|
+
/**
|
|
2223
|
+
* PDF Image Extraction Utilities
|
|
2224
|
+
*
|
|
2225
|
+
* Provides access to embedded images in PDF documents.
|
|
2226
|
+
* This is a thin wrapper around the existing ImageExtractor for consistency.
|
|
2227
|
+
*/
|
|
2228
|
+
|
|
2229
|
+
/**
|
|
2230
|
+
* Options for image extraction
|
|
2231
|
+
*/
|
|
2232
|
+
interface ImageExtractionOptions {
|
|
2233
|
+
/** Extract image files to disk (default: false) */
|
|
2234
|
+
extractFiles?: boolean;
|
|
2235
|
+
/** Output directory for extracted images */
|
|
2236
|
+
outputDir?: string;
|
|
2237
|
+
/** Convert JPEG2000 to JPG (default: true) */
|
|
2238
|
+
convertJp2ToJpg?: boolean;
|
|
2239
|
+
/** Optimize extracted images (default: false) */
|
|
2240
|
+
optimize?: boolean;
|
|
2241
|
+
/** Optimization quality (0-100, default: 80) */
|
|
2242
|
+
quality?: number;
|
|
2243
|
+
/** Enable verbose logging */
|
|
2244
|
+
verbose?: boolean;
|
|
2245
|
+
}
|
|
2246
|
+
/**
|
|
2247
|
+
* Result of image extraction
|
|
2248
|
+
*/
|
|
2249
|
+
interface ImageExtractionResult {
|
|
2250
|
+
/** Array of extracted images */
|
|
2251
|
+
images: ImageItem[];
|
|
2252
|
+
/** Total number of images found */
|
|
2253
|
+
count: number;
|
|
2254
|
+
/** Output directory (if files were extracted) */
|
|
2255
|
+
outputDir?: string;
|
|
2256
|
+
}
|
|
2257
|
+
/**
|
|
2258
|
+
* Extract images from a PDF document
|
|
2259
|
+
*
|
|
2260
|
+
* @param source - File path or buffer
|
|
2261
|
+
* @param options - Extraction options
|
|
2262
|
+
* @returns Extraction result with images
|
|
2263
|
+
*
|
|
2264
|
+
* @example
|
|
2265
|
+
* ```typescript
|
|
2266
|
+
* // Get image metadata only
|
|
2267
|
+
* const result = await extractImages('document.pdf');
|
|
2268
|
+
* console.log(`Found ${result.count} images`);
|
|
2269
|
+
*
|
|
2270
|
+
* // Extract to files
|
|
2271
|
+
* const extracted = await extractImages('document.pdf', {
|
|
2272
|
+
* extractFiles: true,
|
|
2273
|
+
* outputDir: './images'
|
|
2274
|
+
* });
|
|
2275
|
+
* ```
|
|
2276
|
+
*/
|
|
2277
|
+
declare function extractImages$1(source: PDFSource, options?: ImageExtractionOptions): Promise<ImageExtractionResult>;
|
|
2278
|
+
/**
|
|
2279
|
+
* Get image count from a PDF without full extraction
|
|
2280
|
+
*
|
|
2281
|
+
* @param source - File path
|
|
2282
|
+
* @returns Number of images
|
|
2283
|
+
*/
|
|
2284
|
+
declare function getImageCount(source: string): Promise<number>;
|
|
2285
|
+
|
|
2286
|
+
/**
|
|
2287
|
+
* PDF Utilities Library (exported as `pdfUtils`)
|
|
2288
|
+
*
|
|
2289
|
+
* A clean library for PDF operations inspired by unpdf patterns, exported publicly as `pdfUtils`.
|
|
2290
|
+
* Provides unified PDF loading, text extraction with positioning, metadata access,
|
|
2291
|
+
* page rendering, and image extraction.
|
|
2292
|
+
*
|
|
2293
|
+
* Key features:
|
|
2294
|
+
* - Single source of truth for pdf.js configuration
|
|
2295
|
+
* - Lazy loading of pdf.js for better startup performance
|
|
2296
|
+
* - Full text positioning support (our value-add over unpdf)
|
|
2297
|
+
* - Clean, simple API with full TypeScript support
|
|
2298
|
+
*
|
|
2299
|
+
* @example
|
|
2300
|
+
* ```typescript
|
|
2301
|
+
* import { pdfUtils } from 'pdf-plus';
|
|
2302
|
+
*
|
|
2303
|
+
* // Load and work with a PDF
|
|
2304
|
+
* const doc = await pdfUtils.loadPDF('document.pdf');
|
|
2305
|
+
*
|
|
2306
|
+
* // Extract text (simple)
|
|
2307
|
+
* const result = await pdfUtils.extractText(doc);
|
|
2308
|
+
* console.log(result.totalPages, result.text);
|
|
2309
|
+
*
|
|
2310
|
+
* // Extract text with positions (our value-add)
|
|
2311
|
+
* const items = await pdfUtils.extractTextItems(doc);
|
|
2312
|
+
* for (const item of items.items[0]) {
|
|
2313
|
+
* console.log(`"${item.str}" at (${item.x}, ${item.y})`);
|
|
2314
|
+
* }
|
|
2315
|
+
*
|
|
2316
|
+
* // Render page to image with target width
|
|
2317
|
+
* const render = await pdfUtils.renderPage(doc, 1, { width: 800 });
|
|
2318
|
+
* fs.writeFileSync('page1.png', render.buffer);
|
|
2319
|
+
*
|
|
2320
|
+
* // Get metadata with date parsing
|
|
2321
|
+
* const meta = await pdfUtils.getMetadata(doc, { parseDates: true });
|
|
2322
|
+
* console.log(`${meta.numPages} pages`);
|
|
2323
|
+
*
|
|
2324
|
+
* // Clean up
|
|
2325
|
+
* await doc.destroy();
|
|
2326
|
+
* ```
|
|
2327
|
+
*
|
|
2328
|
+
* @packageDocumentation
|
|
2329
|
+
*/
|
|
2330
|
+
|
|
2331
|
+
type index_ImageExtractionOptions = ImageExtractionOptions;
|
|
2332
|
+
type index_ImageExtractionResult = ImageExtractionResult;
|
|
2333
|
+
type index_ImageFormat = ImageFormat;
|
|
2334
|
+
type index_LinkExtractionResult = LinkExtractionResult;
|
|
2335
|
+
type index_MetadataOptions = MetadataOptions;
|
|
2336
|
+
declare const index_PDFDocumentProxy: typeof PDFDocumentProxy;
|
|
2337
|
+
type index_PDFInput = PDFInput;
|
|
2338
|
+
type index_PDFLoadOptions = PDFLoadOptions;
|
|
2339
|
+
type index_PDFMetadata = PDFMetadata;
|
|
2340
|
+
declare const index_PDFPageProxy: typeof PDFPageProxy;
|
|
2341
|
+
type index_PDFSource = PDFSource;
|
|
2342
|
+
type index_PDFTextItem = PDFTextItem;
|
|
2343
|
+
type index_PageInfo = PageInfo;
|
|
2344
|
+
type index_RenderDataURLResult = RenderDataURLResult;
|
|
2345
|
+
type index_RenderOptions = RenderOptions;
|
|
2346
|
+
type index_RenderResult = RenderResult;
|
|
2347
|
+
type index_TextExtractionMeta = TextExtractionMeta;
|
|
2348
|
+
type index_TextExtractionOptions = TextExtractionOptions;
|
|
2349
|
+
type index_TextExtractionProgress = TextExtractionProgress;
|
|
2350
|
+
type index_TextExtractionResult<T extends string | string[]> = TextExtractionResult<T>;
|
|
2351
|
+
type index_TextItemsExtractionResult = TextItemsExtractionResult;
|
|
2352
|
+
declare const index_extractFullText: typeof extractFullText;
|
|
2353
|
+
declare const index_extractLinks: typeof extractLinks;
|
|
2354
|
+
declare const index_extractPageText: typeof extractPageText;
|
|
2355
|
+
declare const index_extractPageTextItems: typeof extractPageTextItems;
|
|
2356
|
+
declare const index_extractTextItems: typeof extractTextItems;
|
|
2357
|
+
declare const index_getAllPagesInfo: typeof getAllPagesInfo;
|
|
2358
|
+
declare const index_getDocumentProxy: typeof getDocumentProxy;
|
|
2359
|
+
declare const index_getImageCount: typeof getImageCount;
|
|
2360
|
+
declare const index_getMetadata: typeof getMetadata;
|
|
2361
|
+
declare const index_getPDFJS: typeof getPDFJS;
|
|
2362
|
+
declare const index_getPageCount: typeof getPageCount;
|
|
2363
|
+
declare const index_getPageInfo: typeof getPageInfo;
|
|
2364
|
+
declare const index_getVerbosityLevel: typeof getVerbosityLevel;
|
|
2365
|
+
declare const index_isBrowser: typeof isBrowser;
|
|
2366
|
+
declare const index_isNode: typeof isNode;
|
|
2367
|
+
declare const index_isPDF: typeof isPDF;
|
|
2368
|
+
declare const index_isPDFDocumentProxy: typeof isPDFDocumentProxy;
|
|
2369
|
+
declare const index_loadPDF: typeof loadPDF;
|
|
2370
|
+
declare const index_renderPage: typeof renderPage;
|
|
2371
|
+
declare const index_renderPageAsDataURL: typeof renderPageAsDataURL;
|
|
2372
|
+
declare const index_renderPageToBase64: typeof renderPageToBase64;
|
|
2373
|
+
declare const index_renderPageToDataURL: typeof renderPageToDataURL;
|
|
2374
|
+
declare const index_renderPages: typeof renderPages;
|
|
2375
|
+
declare const index_validatePageNumber: typeof validatePageNumber;
|
|
2376
|
+
declare namespace index {
|
|
2377
|
+
export { type index_ImageExtractionOptions as ImageExtractionOptions, type index_ImageExtractionResult as ImageExtractionResult, type index_ImageFormat as ImageFormat, type index_LinkExtractionResult as LinkExtractionResult, type index_MetadataOptions as MetadataOptions, index_PDFDocumentProxy as PDFDocumentProxy, type index_PDFInput as PDFInput, type index_PDFLoadOptions as PDFLoadOptions, type index_PDFMetadata as PDFMetadata, index_PDFPageProxy as PDFPageProxy, type index_PDFSource as PDFSource, type index_PDFTextItem as PDFTextItem, type index_PageInfo as PageInfo, type index_RenderDataURLResult as RenderDataURLResult, type index_RenderOptions as RenderOptions, type index_RenderResult as RenderResult, type index_TextExtractionMeta as TextExtractionMeta, type index_TextExtractionOptions as TextExtractionOptions, type index_TextExtractionProgress as TextExtractionProgress, type index_TextExtractionResult as TextExtractionResult, type index_TextItemsExtractionResult as TextItemsExtractionResult, index_extractFullText as extractFullText, extractImages$1 as extractImages, index_extractLinks as extractLinks, index_extractPageText as extractPageText, index_extractPageTextItems as extractPageTextItems, extractText$1 as extractText, index_extractTextItems as extractTextItems, index_getAllPagesInfo as getAllPagesInfo, index_getDocumentProxy as getDocumentProxy, index_getImageCount as getImageCount, index_getMetadata as getMetadata, index_getPDFJS as getPDFJS, index_getPageCount as getPageCount, index_getPageInfo as getPageInfo, index_getVerbosityLevel as getVerbosityLevel, index_isBrowser as isBrowser, index_isNode as isNode, index_isPDF as isPDF, index_isPDFDocumentProxy as isPDFDocumentProxy, index_loadPDF as loadPDF, index_renderPage as renderPage, index_renderPageAsDataURL as renderPageAsDataURL, index_renderPageToBase64 as renderPageToBase64, index_renderPageToDataURL as renderPageToDataURL, index_renderPages as renderPages, index_validatePageNumber as 
validatePageNumber };
|
|
2378
|
+
}
|
|
2379
|
+
|
|
1551
2380
|
/**
|
|
1552
2381
|
* Validate extractor configuration
|
|
1553
2382
|
*/
|
|
@@ -1658,6 +2487,32 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
|
|
|
1658
2487
|
* ```
|
|
1659
2488
|
*/
|
|
1660
2489
|
declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
2490
|
+
/**
|
|
2491
|
+
* Generate page images from a PDF (render pages to image files)
|
|
2492
|
+
*
|
|
2493
|
+
* This is a convenience function to render PDF pages to images without
|
|
2494
|
+
* extracting embedded images or text. Perfect for creating page previews
|
|
2495
|
+
* or thumbnails.
|
|
2496
|
+
*
|
|
2497
|
+
* @param pdfPath - Path to the PDF file
|
|
2498
|
+
* @param outputDir - Directory to save page images
|
|
2499
|
+
* @param options - Page rendering options
|
|
2500
|
+
* @returns Promise resolving to array of generated image file paths
|
|
2501
|
+
*
|
|
2502
|
+
* @example
|
|
2503
|
+
* ```typescript
|
|
2504
|
+
* import { generatePageImages } from 'pdf-plus';
|
|
2505
|
+
*
|
|
2506
|
+
* const imagePaths = await generatePageImages('document.pdf', './page-images', {
|
|
2507
|
+
* pageImageFormat: 'jpg',
|
|
2508
|
+
* pageImageDpi: 150,
|
|
2509
|
+
* pageRenderEngine: 'poppler'
|
|
2510
|
+
* });
|
|
2511
|
+
*
|
|
2512
|
+
* console.log(`Generated ${imagePaths.length} page images`);
|
|
2513
|
+
* ```
|
|
2514
|
+
*/
|
|
2515
|
+
declare function generatePageImages(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
|
|
1661
2516
|
/**
|
|
1662
2517
|
* Extract PDF content in streaming mode (Phase 4 - NEW!)
|
|
1663
2518
|
*
|
|
@@ -1697,10 +2552,46 @@ declare function extractImageFiles(pdfPath: string, outputDir?: string, options?
|
|
|
1697
2552
|
* ```
|
|
1698
2553
|
*/
|
|
1699
2554
|
declare function extractPdfStream(pdfPath: string, options?: Partial<ExtractionOptions>): StreamingExtractionResult;
|
|
2555
|
+
/**
|
|
2556
|
+
* Extract tables from a PDF file (convenience function)
|
|
2557
|
+
*
|
|
2558
|
+
* Detects and extracts tables from a PDF document using text positioning data.
|
|
2559
|
+
* Tables are detected through spatial clustering of text items.
|
|
2560
|
+
*
|
|
2561
|
+
* @param pdfPath - Path to the PDF file
|
|
2562
|
+
* @param options - Table extraction options
|
|
2563
|
+
* @returns Promise resolving to table extraction result
|
|
2564
|
+
*
|
|
2565
|
+
* @example
|
|
2566
|
+
* ```typescript
|
|
2567
|
+
* import { extractTables, TableExtractor } from 'pdf-plus';
|
|
2568
|
+
*
|
|
2569
|
+
* // Using convenience function
|
|
2570
|
+
* const result = await extractTables('document.pdf', {
|
|
2571
|
+
* pages: [1, 2, 3],
|
|
2572
|
+
* detectHeaders: true,
|
|
2573
|
+
* minRows: 2,
|
|
2574
|
+
* minColumns: 2
|
|
2575
|
+
* });
|
|
2576
|
+
*
|
|
2577
|
+
* console.log(`Found ${result.tableCount} tables`);
|
|
2578
|
+
*
|
|
2579
|
+
* // Access table data
|
|
2580
|
+
* for (const table of result.tables) {
|
|
2581
|
+
* console.log(`Table on page ${table.page}: ${table.rowCount}x${table.columnCount}`);
|
|
2582
|
+
*
|
|
2583
|
+
* // Convert to different formats
|
|
2584
|
+
* const extractor = new TableExtractor();
|
|
2585
|
+
* console.log(extractor.tableToMarkdown(table));
|
|
2586
|
+
* console.log(extractor.tableToCSV(table));
|
|
2587
|
+
* }
|
|
2588
|
+
* ```
|
|
2589
|
+
*/
|
|
2590
|
+
declare function extractTables(pdfPath: string, options?: TableExtractionOptions): Promise<TableExtractionResult>;
|
|
1700
2591
|
/**
|
|
1701
2592
|
* Library version
|
|
1702
2593
|
*/
|
|
1703
|
-
declare const version = "
|
|
2594
|
+
declare const version = "2.0.0";
|
|
1704
2595
|
/**
|
|
1705
2596
|
* Default export containing all public APIs
|
|
1706
2597
|
* Useful for CommonJS: const pdfPlus = require('pdf-plus');
|
|
@@ -1711,17 +2602,20 @@ declare const _default: {
|
|
|
1711
2602
|
StreamingPDFExtractor: typeof StreamingPDFExtractor;
|
|
1712
2603
|
TextExtractor: typeof TextExtractor;
|
|
1713
2604
|
ImageExtractor: typeof ImageExtractor;
|
|
2605
|
+
TableExtractor: typeof TableExtractor;
|
|
1714
2606
|
ImageOptimizer: typeof ImageOptimizer;
|
|
1715
2607
|
FormatProcessor: typeof FormatProcessor;
|
|
1716
2608
|
extractPdfContent: typeof extractPdfContent;
|
|
1717
2609
|
extractText: typeof extractText;
|
|
1718
2610
|
extractImages: typeof extractImages;
|
|
1719
2611
|
extractImageFiles: typeof extractImageFiles;
|
|
2612
|
+
generatePageImages: typeof generatePageImages;
|
|
1720
2613
|
extractPdfStream: typeof extractPdfStream;
|
|
2614
|
+
extractTables: typeof extractTables;
|
|
1721
2615
|
validateConfig: typeof validateConfig;
|
|
1722
2616
|
validateImageRefFormat: typeof validateImageRefFormat;
|
|
1723
2617
|
validateFilePath: typeof validateFilePath;
|
|
1724
2618
|
version: string;
|
|
1725
2619
|
};
|
|
1726
2620
|
|
|
1727
|
-
export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult,
|
|
2621
|
+
export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo$1 as PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type Table, type TableCell, type TableColumn, type TableExtractionOptions, type TableExtractionResult, TableExtractor, type TableRow, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractTables, extractText, generatePageImages, pdfExtractor, index as pdfUtils, validateConfig, validateFilePath, validateImageRefFormat, version };
|