pdf-plus 1.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,3 +1,128 @@
1
+ import * as pdfjs_dist_legacy_build_pdf_mjs from 'pdfjs-dist/legacy/build/pdf.mjs';
2
+ import { PDFDocumentProxy, PDFPageProxy } from 'pdfjs-dist/legacy/build/pdf.mjs';
3
+
4
+ /**
5
+ * Table Extraction Types
6
+ *
7
+ * Type definitions for automatic table detection and extraction from PDFs.
8
+ */
9
+
10
+ /**
11
+ * A detected table in the PDF
12
+ */
13
+ interface Table {
14
+ /** Unique identifier for the table */
15
+ id: string;
16
+ /** Page number where the table is located (1-based) */
17
+ page: number;
18
+ /** Bounding box position of the table */
19
+ position: Position;
20
+ /** Array of rows in the table */
21
+ rows: TableRow[];
22
+ /** Array of column definitions */
23
+ columns: TableColumn[];
24
+ /** Confidence score for table detection (0-1) */
25
+ confidence: number;
26
+ /** Whether the table has a detected header row */
27
+ hasHeader: boolean;
28
+ /** Number of rows in the table */
29
+ rowCount: number;
30
+ /** Number of columns in the table */
31
+ columnCount: number;
32
+ }
33
+ /**
34
+ * A row in a table
35
+ */
36
+ interface TableRow {
37
+ /** Row index (0-based) */
38
+ index: number;
39
+ /** Y position of the row */
40
+ y: number;
41
+ /** Height of the row */
42
+ height: number;
43
+ /** Cells in this row */
44
+ cells: TableCell[];
45
+ /** Whether this row is a header row */
46
+ isHeader: boolean;
47
+ }
48
+ /**
49
+ * A column in a table
50
+ */
51
+ interface TableColumn {
52
+ /** Column index (0-based) */
53
+ index: number;
54
+ /** X position of the column */
55
+ x: number;
56
+ /** Width of the column */
57
+ width: number;
58
+ /** Column header text (if detected) */
59
+ header?: string;
60
+ }
61
+ /**
62
+ * A cell in a table
63
+ */
64
+ interface TableCell {
65
+ /** Row index (0-based) */
66
+ row: number;
67
+ /** Column index (0-based) */
68
+ column: number;
69
+ /** Text content of the cell */
70
+ content: string;
71
+ /** Position of the cell */
72
+ position: Position;
73
+ /** Whether this cell is in a header row */
74
+ isHeader: boolean;
75
+ /** Row span (for merged cells, default: 1) */
76
+ rowSpan?: number;
77
+ /** Column span (for merged cells, default: 1) */
78
+ colSpan?: number;
79
+ }
80
+ /**
81
+ * Options for table extraction
82
+ */
83
+ interface TableExtractionOptions {
84
+ /** Specific pages to extract tables from (1-based). If not specified, all pages are processed. */
85
+ pages?: number[];
86
+ /** Y-position tolerance for grouping text items into rows (default: 3) */
87
+ rowTolerance?: number;
88
+ /** X-position tolerance for grouping text items into columns (default: 5) */
89
+ columnTolerance?: number;
90
+ /** Minimum number of columns to consider a valid table (default: 2) */
91
+ minColumns?: number;
92
+ /** Minimum number of rows to consider a valid table (default: 2) */
93
+ minRows?: number;
94
+ /** Minimum grid density (filled cells / total cells) to consider valid (default: 0.6) */
95
+ minGridDensity?: number;
96
+ /** Whether to detect header rows (default: true) */
97
+ detectHeaders?: boolean;
98
+ /** Enable verbose logging (default: false) */
99
+ verbose?: boolean;
100
+ }
101
+ /**
102
+ * Result of table extraction
103
+ */
104
+ interface TableExtractionResult {
105
+ /** Array of detected tables */
106
+ tables: Table[];
107
+ /** Total number of pages processed */
108
+ pagesProcessed: number;
109
+ /** Total number of tables found */
110
+ tableCount: number;
111
+ /** Extraction metadata */
112
+ metadata: TableExtractionMetadata;
113
+ }
114
+ /**
115
+ * Metadata about the extraction process
116
+ */
117
+ interface TableExtractionMetadata {
118
+ /** Time taken to extract tables (in milliseconds) */
119
+ extractionTimeMs: number;
120
+ /** Options used for extraction */
121
+ options: TableExtractionOptions;
122
+ /** Number of candidate tables that were filtered out */
123
+ filteredCandidates: number;
124
+ }
125
+
1
126
  /**
2
127
  * Types for streaming PDF extraction
3
128
  */
@@ -30,7 +155,7 @@ interface PageEvent extends StreamEvent {
30
155
  totalPages: number;
31
156
  textLength: number;
32
157
  imageCount: number;
33
- pageInfo?: PageInfo;
158
+ pageInfo?: PageInfo$1;
34
159
  }
35
160
  /**
36
161
  * Image event - emitted when an image is extracted
@@ -240,7 +365,7 @@ interface ImageItem {
240
365
  filePath?: string;
241
366
  data?: Uint8Array;
242
367
  }
243
- interface PageInfo {
368
+ interface PageInfo$1 {
244
369
  number: number;
245
370
  width: number;
246
371
  height: number;
@@ -259,7 +384,7 @@ interface DocumentMetadata {
259
384
  }
260
385
  interface ExtractionResult {
261
386
  document: DocumentMetadata;
262
- pages: PageInfo[];
387
+ pages: PageInfo$1[];
263
388
  images: ImageItem[];
264
389
  textItems: TextItem[];
265
390
  text: string;
@@ -267,7 +392,10 @@ interface ExtractionResult {
267
392
  cleanText: string;
268
393
  summary?: DocumentSummary;
269
394
  structuredData?: StructuredPageData;
395
+ /** Detected tables (only populated when extractTables: true) */
396
+ tables?: Table[];
270
397
  }
398
+
271
399
  interface DocumentSummary {
272
400
  totalPages: number;
273
401
  totalTextItems: number;
@@ -378,39 +506,6 @@ interface ExtractionOptions {
378
506
  * (default: false - convert to JPG)
379
507
  */
380
508
  preserveJp2?: boolean;
381
- /**
382
- * Use Sharp library for ALL image processing operations (better quality & performance).
383
- *
384
- * When enabled, Sharp is used as the global image processing engine for:
385
- * - JP2 to JPG conversion
386
- * - Image optimization
387
- * - Image resizing
388
- * - Format conversions
389
- *
390
- * Sharp is an OPTIONAL dependency. Install it for better performance:
391
- * ```bash
392
- * npm install sharp
393
- * ```
394
- *
395
- * If Sharp is not installed, the library will automatically fall back to pure JavaScript (Jimp).
396
- *
397
- * (default: false - use pure JS Jimp)
398
- */
399
- useSharp?: boolean;
400
- /**
401
- * Use Poppler's pdfimages as fallback when standard extraction finds no images.
402
- * Poppler can extract images that are embedded in non-standard ways (Form XObjects, inline images, etc.)
403
- * that the standard XObject-based extraction might miss.
404
- *
405
- * Requires poppler-utils to be installed on the system.
406
- *
407
- * Installation:
408
- * - Linux: sudo apt-get install poppler-utils
409
- * - macOS: brew install poppler
410
- *
411
- * (default: false)
412
- */
413
- usePopplerFallback?: boolean;
414
509
  /** Enable parallel processing for better performance (default: true) */
415
510
  parallelProcessing?: boolean;
416
511
  /** Maximum number of pages to process in parallel (default: 10) */
@@ -480,15 +575,25 @@ interface ExtractionOptions {
480
575
  /** Quality for JPG page images (default: 90) */
481
576
  pageImageQuality?: number;
482
577
  /**
483
- * Page rendering engine: 'pdfjs' | 'poppler' (default: 'pdfjs')
484
- * - pdfjs: Pure JavaScript, no dependencies, but NO JP2 support
485
- * - poppler: Requires system poppler-utils, but HAS full JP2 support
578
+ * Page rendering engine (default: 'pdfjs')
579
+ *
580
+ * Note: Poppler support has been removed. Only 'pdfjs' is now supported.
581
+ * This option is kept for backwards compatibility but is ignored.
582
+ *
583
+ * @deprecated Poppler support removed - pdfjs is now the only engine
486
584
  */
487
- pageRenderEngine?: "pdfjs" | "poppler";
585
+ pageRenderEngine?: "pdfjs";
488
586
  /** Thumbnail width (default: 200) */
489
587
  thumbnailWidth?: number;
490
588
  /** Thumbnail quality for JPG (default: 80) */
491
589
  thumbnailQuality?: number;
590
+ /**
591
+ * Enable table extraction (default: false)
592
+ * When enabled, tables will be detected and included in the result
593
+ */
594
+ extractTables?: boolean;
595
+ /** Options for table extraction */
596
+ tableOptions?: TableExtractionOptions;
492
597
  }
493
598
  interface ProgressInfo {
494
599
  currentPage: number;
@@ -593,7 +698,6 @@ declare class PDFExtractor {
593
698
  private textExtractor;
594
699
  private imageExtractor;
595
700
  private pageToImageConverter;
596
- private popplerConverter;
597
701
  private formatProcessor;
598
702
  private structuredDataGenerator;
599
703
  private cacheManager;
@@ -818,13 +922,8 @@ declare class StreamingPDFExtractor implements StreamingExtractionResult {
818
922
  * ```
819
923
  */
820
924
  declare class TextExtractor {
821
- constructor();
822
- /**
823
- * Initialize pdf.js worker
824
- */
825
- private initializePdfjs;
826
925
  /**
827
- * Load PDF document
926
+ * Load PDF document using internal pdf utils
828
927
  */
829
928
  private loadDocument;
830
929
  /**
@@ -936,11 +1035,6 @@ declare class StructuredTextExtractor {
936
1035
  private pdfLibDoc;
937
1036
  private pdfLibPages;
938
1037
  private textData;
939
- constructor();
940
- /**
941
- * Initialize pdf.js worker
942
- */
943
- private initializePdfjs;
944
1038
  /**
945
1039
  * Process PDF with accurate page-by-page extraction
946
1040
  */
@@ -1074,9 +1168,10 @@ declare class ImageExtractor {
1074
1168
  */
1075
1169
  private extractImageData;
1076
1170
  /**
1077
- * Detect image format from binary data (from NestJS implementation)
1171
+ * Detect image format from binary data
1172
+ * Uses centralized image format detection utility
1078
1173
  */
1079
- private detectImageFormat;
1174
+ private detectImageFormatLocal;
1080
1175
  /**
1081
1176
  * Create a PNG file from raw pixel data using actual PDF metadata
1082
1177
  */
@@ -1298,23 +1393,12 @@ interface ThumbnailOptions extends SinglePageOptions {
1298
1393
  * dpi: 150
1299
1394
  * });
1300
1395
  * ```
1396
+ *
1397
+ * NOTE: pdf.js does not support JPEG2000 (JP2) images by default.
1398
+ * Pages with JP2 images will have blank spaces where the images should be.
1399
+ * The embedded images are still extracted correctly via extractImages option.
1301
1400
  */
1302
1401
  declare class PageToImageConverter {
1303
- private pdfjs;
1304
- /**
1305
- * Get or load pdf.js module with proper worker configuration
1306
- * Based on pdf-to-img library approach
1307
- *
1308
- * NOTE: pdf.js does not support JPEG2000 (JP2) images by default.
1309
- * Pages with JP2 images will have blank spaces where the images should be.
1310
- * The embedded images are still extracted correctly via extractImages option.
1311
- *
1312
- * For complete page rendering with JP2 support, consider using:
1313
- * - Poppler (pdf-poppler npm package) - requires system dependency
1314
- * - ImageMagick - requires system dependency
1315
- * - Ghostscript - requires system dependency
1316
- */
1317
- private getPdfjs;
1318
1402
  /**
1319
1403
  * Convert all pages of a PDF to images
1320
1404
  *
@@ -1363,12 +1447,13 @@ declare class PageToImageConverter {
1363
1447
  /**
1364
1448
  * Render a PDF page to image buffer
1365
1449
  *
1366
- * Based on pdf-to-img library approach - let pdf.js handle canvas creation
1367
- * @see https://github.com/k-yle/pdf-to-img
1450
+ * Uses @napi-rs/canvas via custom canvas factory for high-performance rendering
1368
1451
  */
1369
1452
  private renderPageToBuffer;
1370
1453
  /**
1371
1454
  * Convert canvas to image buffer
1455
+ *
1456
+ * Uses @napi-rs/canvas async encode() for JPEG/WebP quality control
1372
1457
  */
1373
1458
  private canvasToBuffer;
1374
1459
  /**
@@ -1390,47 +1475,67 @@ declare class PageToImageConverter {
1390
1475
  }
1391
1476
 
1392
1477
  /**
1393
- * Poppler-based PDF Page to Image Converter
1478
+ * Table Extractor
1394
1479
  *
1395
- * Uses Poppler's pdfToCairo for high-quality rendering with full JPEG2000 support.
1396
- * Requires poppler-utils to be installed on the system.
1397
- *
1398
- * Installation:
1399
- * - Linux: sudo apt-get install poppler-utils
1400
- * - macOS: brew install poppler
1401
- * - Windows: Download from https://blog.alivate.com.au/poppler-windows/
1480
+ * Main class for detecting and extracting tables from PDF documents.
1481
+ * Uses text positioning data to identify table structures.
1402
1482
  */
1403
1483
 
1404
- declare class PopplerConverter {
1405
- private poppler;
1484
+ /**
1485
+ * TableExtractor class for detecting and extracting tables from PDFs
1486
+ *
1487
+ * @example
1488
+ * ```typescript
1489
+ * const extractor = new TableExtractor();
1490
+ * const result = await extractor.extract('document.pdf', {
1491
+ * detectHeaders: true,
1492
+ * minRows: 3
1493
+ * });
1494
+ *
1495
+ * for (const table of result.tables) {
1496
+ * console.log(extractor.tableToMarkdown(table));
1497
+ * }
1498
+ * ```
1499
+ */
1500
+ declare class TableExtractor {
1406
1501
  /**
1407
- * Get or initialize Poppler instance
1502
+ * Extract tables from a PDF file
1503
+ *
1504
+ * @param pdfPath - Path to the PDF file
1505
+ * @param options - Extraction options
1506
+ * @returns Promise resolving to extraction result
1408
1507
  */
1409
- private getPoppler;
1508
+ extract(pdfPath: string, options?: TableExtractionOptions): Promise<TableExtractionResult>;
1410
1509
  /**
1411
- * Convert PDF pages to images using Poppler
1412
- *
1413
- * @param pdfPath - Path to PDF file
1414
- * @param options - Conversion options
1415
- * @returns Conversion result with image paths
1510
+ * Detects tables on a single page
1416
1511
  */
1417
- convertToImages(pdfPath: string, options: PageToImageOptions): Promise<PageToImageResult>;
1512
+ private detectTablesOnPage;
1418
1513
  /**
1419
- * Get PDF information using pdfinfo
1514
+ * Builds a Table object from a validated candidate
1420
1515
  */
1421
- private getPdfInfo;
1516
+ private buildTable;
1422
1517
  /**
1423
- * Get image dimensions
1518
+ * Converts a table to a 2D array of strings
1424
1519
  */
1425
- private getImageDimensions;
1520
+ tableToArray(table: Table, includeHeaders?: boolean): string[][];
1426
1521
  /**
1427
- * Format filename pattern
1522
+ * Converts a table to CSV format
1428
1523
  */
1429
- private formatFilename;
1524
+ tableToCSV(table: Table, delimiter?: string): string;
1430
1525
  /**
1431
- * Format bytes to human-readable string
1526
+ * Converts a table to Markdown format
1432
1527
  */
1433
- private formatBytes;
1528
+ tableToMarkdown(table: Table): string;
1529
+ /**
1530
+ * Converts a table to HTML format
1531
+ */
1532
+ tableToHTML(table: Table, options?: {
1533
+ tableClass?: string;
1534
+ }): string;
1535
+ /**
1536
+ * Converts a table to an array of objects (using headers as keys)
1537
+ */
1538
+ tableToObjects(table: Table): Array<Record<string, string>>;
1434
1539
  }
1435
1540
 
1436
1541
  /**
@@ -1442,7 +1547,7 @@ interface OptimizationResult {
1442
1547
  optimizedSize: number;
1443
1548
  savedBytes: number;
1444
1549
  savedPercent: number;
1445
- engine: "jimp" | "sharp" | "none";
1550
+ engine: "canvas" | "none";
1446
1551
  error?: string;
1447
1552
  }
1448
1553
  /**
@@ -1451,19 +1556,17 @@ interface OptimizationResult {
1451
1556
  interface OptimizationOptions {
1452
1557
  quality?: number;
1453
1558
  verbose?: boolean;
1454
- useSharp?: boolean;
1455
1559
  }
1456
1560
  /**
1457
- * Image optimizer using Jimp (pure JavaScript)
1561
+ * Image optimizer using @napi-rs/canvas
1458
1562
  *
1459
- * This class provides image optimization capabilities using Jimp, a pure JavaScript
1460
- * image processing library with no native dependencies. It supports JPEG and PNG
1563
+ * This class provides image optimization capabilities using @napi-rs/canvas,
1564
+ * a high-performance Skia-based canvas library. It supports JPEG, PNG, and WebP
1461
1565
  * optimization with quality control.
1462
1566
  *
1463
1567
  * @example
1464
1568
  * ```typescript
1465
1569
  * const result = await ImageOptimizer.optimizeFile('image.jpg', {
1466
- * engine: 'auto',
1467
1570
  * quality: 80
1468
1571
  * });
1469
1572
  *
@@ -1483,22 +1586,16 @@ declare class ImageOptimizer {
1483
1586
  */
1484
1587
  static optimizeFile(filePath: string, options?: OptimizationOptions): Promise<OptimizationResult>;
1485
1588
  /**
1486
- * Optimize using Sharp (optional dependency)
1487
- */
1488
- private static optimizeWithSharp;
1489
- /**
1490
- * Optimize using Jimp (pure JavaScript)
1589
+ * Optimize using @napi-rs/canvas (Skia-based)
1491
1590
  */
1492
- private static optimizeWithJimp;
1591
+ private static optimizeWithCanvas;
1493
1592
  /**
1494
1593
  * Convert JPEG 2000 formats (jp2, jpx, j2c, jpm) to JPG
1495
1594
  *
1496
1595
  * JPEG 2000 files are not widely supported by browsers and image tools.
1497
1596
  * This method converts them to standard JPG format for better compatibility.
1498
1597
  *
1499
- * Supports two conversion engines:
1500
- * - Jimp (default): Pure JavaScript, works everywhere
1501
- * - Sharp (optional): Better color preservation, requires native compilation
1598
+ * Uses @napi-rs/canvas with OpenJPEG WASM decoder for high-performance conversion.
1502
1599
  *
1503
1600
  * @param jp2Path - Path to the JPEG 2000 file (jp2, jpx, j2c, or jpm)
1504
1601
  * @param options - Conversion options
@@ -1507,7 +1604,6 @@ declare class ImageOptimizer {
1507
1604
  static convertJp2ToJpg(jp2Path: string, options?: {
1508
1605
  quality?: number;
1509
1606
  verbose?: boolean;
1510
- useSharp?: boolean;
1511
1607
  }): Promise<{
1512
1608
  success: boolean;
1513
1609
  newPath?: string;
@@ -1571,6 +1667,716 @@ declare class FormatProcessor {
1571
1667
  formatDuration(milliseconds: number): string;
1572
1668
  }
1573
1669
 
1670
+ /**
1671
+ * Type definitions for the internal PDF utilities library
1672
+ *
1673
+ * Provides clean interfaces for PDF operations inspired by unpdf patterns.
1674
+ */
1675
+
1676
+ /**
1677
+ * Source for loading a PDF - either a file path or raw bytes
1678
+ */
1679
+ type PDFSource = string | Uint8Array | Buffer;
1680
+ /**
1681
+ * Input type for PDF operations - accepts either raw data or an already loaded document
1682
+ */
1683
+ type PDFInput = PDFSource | PDFDocumentProxy;
1684
+ /**
1685
+ * Supported image formats for rendering
1686
+ */
1687
+ type ImageFormat = "png" | "jpeg" | "webp";
1688
+ /**
1689
+ * Options for loading a PDF document
1690
+ */
1691
+ interface PDFLoadOptions {
1692
+ /** Password for encrypted PDFs */
1693
+ password?: string;
1694
+ /** Verbosity level for pdfjs logging */
1695
+ verbosity?: number;
1696
+ }
1697
+ /**
1698
+ * Text item with full positioning information
1699
+ */
1700
+ interface PDFTextItem {
1701
+ /** The text string */
1702
+ str: string;
1703
+ /** X position (from transform matrix) */
1704
+ x: number;
1705
+ /** Y position (from transform matrix) */
1706
+ y: number;
1707
+ /** Width of the text item */
1708
+ width: number;
1709
+ /** Height of the text item */
1710
+ height: number;
1711
+ /** Font name */
1712
+ fontName: string;
1713
+ /** Font size (derived from transform) */
1714
+ fontSize: number;
1715
+ /** Full transform matrix [a, b, c, d, e, f] */
1716
+ transform: number[];
1717
+ /** Whether this item ends with EOL */
1718
+ hasEOL: boolean;
1719
+ /** Text direction (ltr, rtl, ttb, or btt) */
1720
+ dir: "ltr" | "rtl" | "ttb" | "btt";
1721
+ }
1722
+ /**
1723
+ * Progress information for text extraction
1724
+ */
1725
+ interface TextExtractionProgress {
1726
+ /** Number of pages processed so far */
1727
+ processedPages: number;
1728
+ /** Total number of pages to process */
1729
+ totalPages: number;
1730
+ /** Percentage complete (0-100) */
1731
+ percentage: number;
1732
+ /** Current page being processed (1-based) */
1733
+ currentPage?: number;
1734
+ }
1735
+ /**
1736
+ * Performance metadata for text extraction
1737
+ */
1738
+ interface TextExtractionMeta {
1739
+ /** Duration in milliseconds */
1740
+ duration: number;
1741
+ /** Number of pages processed */
1742
+ pagesProcessed: number;
1743
+ /** Processing method used */
1744
+ method: "parallel" | "sequential" | "chunked";
1745
+ }
1746
+ /**
1747
+ * Options for text extraction
1748
+ */
1749
+ interface TextExtractionOptions {
1750
+ /** First page to extract (1-based, default: 1) */
1751
+ firstPage?: number;
1752
+ /** Last page to extract (1-based, default: all pages) */
1753
+ lastPage?: number;
1754
+ /** Include marked content in extraction */
1755
+ includeMarkedContent?: boolean;
1756
+ /** Disable text normalization */
1757
+ disableNormalization?: boolean;
1758
+ /** Merge all pages into a single string (default: false) */
1759
+ mergePages?: boolean;
1760
+ /** Maximum concurrent page extractions (default: 10) */
1761
+ maxConcurrency?: number;
1762
+ /** Progress callback called after each page is processed */
1763
+ onProgress?: (progress: TextExtractionProgress) => void;
1764
+ /** Chunk size for processing very large PDFs (default: undefined = no chunking) */
1765
+ chunkSize?: number;
1766
+ /** Callback called after each chunk is processed (when chunkSize is set) */
1767
+ onChunkComplete?: (info: {
1768
+ chunkIndex: number;
1769
+ totalChunks: number;
1770
+ pagesProcessed: number;
1771
+ }) => void;
1772
+ }
1773
+ /**
1774
+ * Result of text extraction
1775
+ */
1776
+ interface TextExtractionResult<T extends string | string[]> {
1777
+ /** Total number of pages in the document */
1778
+ totalPages: number;
1779
+ /** Extracted text - string[] when mergePages is false, string when true */
1780
+ text: T;
1781
+ /** Performance metadata (available when extraction completes) */
1782
+ _meta?: TextExtractionMeta;
1783
+ }
1784
+ /**
1785
+ * Result of text items extraction
1786
+ */
1787
+ interface TextItemsExtractionResult {
1788
+ /** Total number of pages in the document */
1789
+ totalPages: number;
1790
+ /** Text items per page */
1791
+ items: PDFTextItem[][];
1792
+ /** Performance metadata (available when extraction completes) */
1793
+ _meta?: TextExtractionMeta;
1794
+ }
1795
+ /**
1796
+ * Options for metadata extraction
1797
+ */
1798
+ interface MetadataOptions {
1799
+ /** Parse date strings (CreationDate, ModDate) into Date objects (default: false) */
1800
+ parseDates?: boolean;
1801
+ }
1802
+ /**
1803
+ * Result of link extraction
1804
+ */
1805
+ interface LinkExtractionResult {
1806
+ /** Total number of pages in the document */
1807
+ totalPages: number;
1808
+ /** Extracted URLs from the document */
1809
+ links: string[];
1810
+ }
1811
+ /**
1812
+ * Options for page rendering
1813
+ */
1814
+ interface RenderOptions {
1815
+ /** Scale factor (default: 1). Ignored if width or height is set. */
1816
+ scale?: number;
1817
+ /** DPI for rendering (default: 72, affects scale) */
1818
+ dpi?: number;
1819
+ /** Target width in pixels. Auto-calculates scale to fit. */
1820
+ width?: number;
1821
+ /** Target height in pixels. Auto-calculates scale to fit. */
1822
+ height?: number;
1823
+ /** Output format (default: 'png') */
1824
+ format?: ImageFormat;
1825
+ /** Quality for JPEG/WebP (0-100, default: 90) */
1826
+ quality?: number;
1827
+ /** Background color (default: '#FFFFFF') */
1828
+ backgroundColor?: string;
1829
+ /** Transparent background (default: false) */
1830
+ transparent?: boolean;
1831
+ }
1832
+ /**
1833
+ * Result of rendering a page
1834
+ */
1835
+ interface RenderResult {
1836
+ /** Image buffer */
1837
+ buffer: Buffer;
1838
+ /** Image width in pixels */
1839
+ width: number;
1840
+ /** Image height in pixels */
1841
+ height: number;
1842
+ /** Output format */
1843
+ format: ImageFormat;
1844
+ }
1845
+ /**
1846
+ * Result of rendering a page as data URL
1847
+ */
1848
+ interface RenderDataURLResult {
1849
+ /** Data URL string (e.g., "data:image/png;base64,...") */
1850
+ dataURL: string;
1851
+ /** Image width in pixels */
1852
+ width: number;
1853
+ /** Image height in pixels */
1854
+ height: number;
1855
+ /** Output format */
1856
+ format: ImageFormat;
1857
+ }
1858
+ /**
1859
+ * PDF document metadata
1860
+ */
1861
+ interface PDFMetadata {
1862
+ /** Number of pages */
1863
+ numPages: number;
1864
+ /** PDF info dictionary */
1865
+ info: Record<string, unknown>;
1866
+ /** PDF metadata (XMP) */
1867
+ metadata: Record<string, unknown> | null;
1868
+ /** PDF format version */
1869
+ version: string;
1870
+ /** Whether the PDF is encrypted */
1871
+ isEncrypted: boolean;
1872
+ /** Whether the PDF is linearized (fast web view) */
1873
+ isLinearized: boolean;
1874
+ }
1875
+ /**
1876
+ * Page dimensions and properties
1877
+ */
1878
+ interface PageInfo {
1879
+ /** Page number (1-based) */
1880
+ pageNumber: number;
1881
+ /** Page width in points */
1882
+ width: number;
1883
+ /** Page height in points */
1884
+ height: number;
1885
+ /** Page rotation in degrees */
1886
+ rotation: number;
1887
+ /** Viewport at scale 1 */
1888
+ viewport: {
1889
+ width: number;
1890
+ height: number;
1891
+ scale: number;
1892
+ };
1893
+ }
1894
+
1895
+ /**
1896
+ * Check if running in Node.js environment
1897
+ */
1898
+ declare const isNode: boolean;
1899
+ /**
1900
+ * Check if running in browser environment
1901
+ */
1902
+ declare const isBrowser: boolean;
1903
+ /**
1904
+ * Check if a value is a PDFDocumentProxy instance
1905
+ *
1906
+ * Uses internal pdfjs property for reliable detection.
1907
+ *
1908
+ * @param data - Value to check
1909
+ * @returns True if the value is a PDFDocumentProxy
1910
+ *
1911
+ * @example
1912
+ * ```typescript
1913
+ * if (isPDFDocumentProxy(input)) {
1914
+ * // input is typed as PDFDocumentProxy
1915
+ * console.log(input.numPages);
1916
+ * }
1917
+ * ```
1918
+ */
1919
+ declare function isPDFDocumentProxy(data: unknown): data is PDFDocumentProxy;
1920
+ /**
1921
+ * Get the pdf.js module, initializing it lazily
1922
+ *
1923
+ * This ensures pdf.js is only loaded when needed and worker
1924
+ * configuration happens exactly once.
1925
+ */
1926
+ declare function getPDFJS(): Promise<typeof pdfjs_dist_legacy_build_pdf_mjs>;
1927
+ /**
1928
+ * Get the pdf.js verbosity level enum
1929
+ */
1930
+ declare function getVerbosityLevel(): Promise<typeof pdfjs_dist_legacy_build_pdf_mjs.VerbosityLevel>;
1931
+ /**
1932
+ * Load a PDF document from a file path or buffer
1933
+ *
1934
+ * Applies sensible defaults:
1935
+ * - `isEvalSupported: false` (security)
1936
+ * - `useSystemFonts: true` (better font rendering)
1937
+ *
1938
+ * @param source - File path string or Uint8Array/Buffer of PDF data
1939
+ * @param options - Loading options
1940
+ * @returns PDFDocumentProxy
1941
+ *
1942
+ * @example
1943
+ * ```typescript
1944
+ * // Load from file path
1945
+ * const doc = await loadPDF('document.pdf');
1946
+ *
1947
+ * // Load from buffer
1948
+ * const buffer = fs.readFileSync('document.pdf');
1949
+ * const doc = await loadPDF(buffer);
1950
+ *
1951
+ * // With password
1952
+ * const doc = await loadPDF('encrypted.pdf', { password: 'secret' });
1953
+ * ```
1954
+ */
1955
+ declare function loadPDF(source: PDFSource, options?: PDFLoadOptions): Promise<PDFDocumentProxy>;
1956
+ /**
1957
+ * Get a PDFDocumentProxy from input (loads if necessary)
1958
+ *
1959
+ * This is a convenience function that handles both raw data and
1960
+ * already-loaded documents uniformly.
1961
+ *
1962
+ * @param input - PDF source or already loaded document
1963
+ * @param options - Loading options (only used if input is not already a document)
1964
+ * @returns PDFDocumentProxy
1965
+ *
1966
+ * @example
1967
+ * ```typescript
1968
+ * // Works with file path
1969
+ * const doc1 = await getDocumentProxy('document.pdf');
1970
+ *
1971
+ * // Works with already loaded document (returns as-is)
1972
+ * const doc2 = await getDocumentProxy(existingDoc);
1973
+ * ```
1974
+ */
1975
+ declare function getDocumentProxy(input: PDFInput, options?: PDFLoadOptions): Promise<PDFDocumentProxy>;
1976
+ /**
1977
+ * Load a PDF and get the number of pages quickly
1978
+ *
1979
+ * Useful for determining if streaming should be enabled.
1980
+ *
1981
+ * @param source - File path or buffer
1982
+ * @returns Number of pages
1983
+ */
1984
+ declare function getPageCount(source: PDFSource): Promise<number>;
1985
+ /**
1986
+ * Check whether a source (file path or buffer) appears to be a valid PDF
1987
+ *
1988
+ * @param source - File path or buffer
1989
+ * @returns True if the source appears to be a valid PDF
1990
+ */
1991
+ declare function isPDF(source: PDFSource): Promise<boolean>;
1992
+ /**
1993
+ * Validate page number against document bounds
1994
+ *
1995
+ * @param pageNum - Page number to validate (1-based)
1996
+ * @param totalPages - Total pages in document
1997
+ * @throws Error if page number is invalid
1998
+ */
1999
+ declare function validatePageNumber(pageNum: number, totalPages: number): void;
2000
+
2001
+ /**
2002
+ * PDF Text Extraction Utilities
2003
+ *
2004
+ * Provides text extraction with full positioning support.
2005
+ * This is our value-add over unpdf - we include positions!
2006
+ */
2007
+
2008
+ /**
2009
+ * Extract text from all pages
2010
+ *
2011
+ * @param input - PDF document, file path, or buffer
2012
+ * @param options - Extraction options
2013
+ * @returns Object with totalPages and text array
2014
+ *
2015
+ * @example
2016
+ * ```typescript
2017
+ * // Get text as array of pages
2018
+ * const result = await extractText('document.pdf');
2019
+ * console.log(`Page 1: ${result.text[0]}`);
2020
+ *
2021
+ * // Get text as single merged string
2022
+ * const merged = await extractText('document.pdf', { mergePages: true });
2023
+ * console.log(merged.text); // string
2024
+ * ```
2025
+ */
2026
+ declare function extractText$1(input: PDFInput, options?: TextExtractionOptions & {
2027
+ mergePages?: false;
2028
+ }): Promise<TextExtractionResult<string[]>>;
2029
+ declare function extractText$1(input: PDFInput, options: TextExtractionOptions & {
2030
+ mergePages: true;
2031
+ }): Promise<TextExtractionResult<string>>;
2032
+ /**
2033
+ * Extract text with full positioning information
2034
+ *
2035
+ * This is the main value-add function - provides detailed text items
2036
+ * with x, y, width, height, font info, etc.
2037
+ *
2038
+ * @param input - PDF document, file path, or buffer
2039
+ * @param options - Extraction options
2040
+ * @returns Object with totalPages and items array per page
2041
+ *
2042
+ * @example
2043
+ * ```typescript
2044
+ * const result = await extractTextItems('document.pdf');
2045
+ * for (const item of result.items[0]) {
2046
+ * console.log(`"${item.str}" at (${item.x}, ${item.y})`);
2047
+ * }
2048
+ * ```
2049
+ */
2050
+ declare function extractTextItems(input: PDFInput, options?: Omit<TextExtractionOptions, "mergePages">): Promise<TextItemsExtractionResult>;
2051
+ /**
2052
+ * Extract text from a single page
2053
+ *
2054
+ * @param input - PDF document, file path, or buffer
2055
+ * @param pageNum - Page number (1-based)
2056
+ * @param options - Extraction options
2057
+ * @returns Text string for the page
2058
+ */
2059
+ declare function extractPageText(input: PDFInput, pageNum: number, options?: Omit<TextExtractionOptions, "firstPage" | "lastPage" | "mergePages">): Promise<string>;
2060
+ /**
2061
+ * Extract text items from a single page
2062
+ *
2063
+ * @param input - PDF document, file path, or buffer
2064
+ * @param pageNum - Page number (1-based)
2065
+ * @param options - Extraction options
2066
+ * @returns Array of text items
2067
+ */
2068
+ declare function extractPageTextItems(input: PDFInput, pageNum: number, options?: Omit<TextExtractionOptions, "firstPage" | "lastPage" | "mergePages">): Promise<PDFTextItem[]>;
2069
+ /**
2070
+ * Extract all text as a single string
2071
+ *
2072
+ * @param input - PDF document, file path, or buffer
2073
+ * @param options - Extraction options (optional); `mergePages` is excluded from the accepted type
2074
+ * @param pageSeparator - Optional string used to join pages (default: "\n\n")
2075
+ * @returns Promise resolving to the combined text from all pages
2076
+ *
2077
+ * @deprecated Use extractText with { mergePages: true } instead
2078
+ */
2079
+ declare function extractFullText(input: PDFInput, options?: Omit<TextExtractionOptions, "mergePages">, pageSeparator?: string): Promise<string>;
2080
+
2081
+ /**
2082
+ * PDF Metadata Extraction Utilities
2083
+ *
2084
+ * Provides access to PDF document metadata.
2085
+ */
2086
+
2087
+ /**
2088
+ * Extract metadata from a PDF document
2089
+ *
2090
+ * @param input - PDF document, file path, or buffer
2091
+ * @param options - Metadata extraction options (optional), e.g. `parseDates` as in the example below
2092
+ * @returns Promise resolving to the PDF metadata
2093
+ *
2094
+ * @example
2095
+ * ```typescript
2096
+ * const meta = await getMetadata('document.pdf');
2097
+ * console.log(`${meta.numPages} pages, version ${meta.version}`);
2098
+ *
2099
+ * // With date parsing
2100
+ * const metaDates = await getMetadata('document.pdf', { parseDates: true });
2101
+ * if (metaDates.info.CreationDate instanceof Date) {
2102
+ * console.log('Created:', metaDates.info.CreationDate.toISOString());
2103
+ * }
2104
+ * ```
2105
+ */
2106
+ declare function getMetadata(input: PDFInput, options?: MetadataOptions): Promise<PDFMetadata>;
2107
+ /**
2108
+ * Get information about a specific page
2109
+ *
2110
+ * @param input - PDF document, file path, or buffer
2111
+ * @param pageNum - Page number (1-based)
2112
+ * @returns Promise resolving to the page information (this `PageInfo` is the one re-exported through `pdfUtils`; the package's top-level `PageInfo` export aliases a different declaration, `PageInfo$1`)
2113
+ */
2114
+ declare function getPageInfo(input: PDFInput, pageNum: number): Promise<PageInfo>;
2115
+ /**
2116
+ * Get information about all pages
2117
+ *
2118
+ * @param input - PDF document, file path, or buffer
2119
+ * @returns Promise resolving to an array of `PageInfo` entries (the `pdfUtils` type, not the top-level `PageInfo` export, which aliases `PageInfo$1`)
2120
+ */
2121
+ declare function getAllPagesInfo(input: PDFInput): Promise<PageInfo[]>;
2122
+
2123
+ /**
2124
+ * PDF Link Extraction Utilities
2125
+ *
2126
+ * Extracts URLs from PDF annotations (hyperlinks).
2127
+ */
2128
+
2129
+ /**
2130
+ * Extract all links (URLs) from a PDF document
2131
+ *
2132
+ * Extracts hyperlinks from PDF annotations across all pages.
2133
+ *
2134
+ * @param input - PDF document, file path, or buffer
2135
+ * @returns Promise resolving to an object with totalPages and a unique links array
2136
+ *
2137
+ * @example
2138
+ * ```typescript
2139
+ * const result = await extractLinks('document.pdf');
2140
+ * console.log(`Found ${result.links.length} links in ${result.totalPages} pages`);
2141
+ * for (const url of result.links) {
2142
+ * console.log(url);
2143
+ * }
2144
+ * ```
2145
+ */
2146
+ declare function extractLinks(input: PDFInput): Promise<LinkExtractionResult>;
2147
+
2148
+ /**
2149
+ * PDF Page Rendering Utilities
2150
+ *
2151
+ * Renders PDF pages to images using @napi-rs/canvas.
2152
+ */
2153
+
2154
+ /**
2155
+ * Render a PDF page to an image buffer
2156
+ *
2157
+ * @param input - PDF document, file path, or buffer
2158
+ * @param pageNum - Page number (1-based)
2159
+ * @param options - Render options (optional)
2160
+ * @returns Promise resolving to the render result with buffer and dimensions
2161
+ *
2162
+ * @example
2163
+ * ```typescript
2164
+ * // Using scale
2165
+ * const scaled = await renderPage('document.pdf', 1, { scale: 2 });
2166
+ *
2167
+ * // Using target width (auto-calculates scale)
2168
+ * const byWidth = await renderPage('document.pdf', 1, { width: 800 });
2169
+ *
2170
+ * // Using target height (auto-calculates scale)
2171
+ * const byHeight = await renderPage('document.pdf', 1, { height: 600 });
2172
+ *
2173
+ * fs.writeFileSync('page1.png', scaled.buffer);
2174
+ * ```
2175
+ */
2176
+ declare function renderPage(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<RenderResult>;
2177
+ /**
2178
+ * Render a PDF page directly to a data URL
2179
+ *
2180
+ * @param input - PDF document, file path, or buffer
2181
+ * @param pageNum - Page number (1-based)
2182
+ * @param options - Render options (optional)
2183
+ * @returns Promise resolving to the render result with data URL and dimensions
2184
+ *
2185
+ * @example
2186
+ * ```typescript
2187
+ * const result = await renderPageAsDataURL('document.pdf', 1, { width: 800 });
2188
+ * // result.dataURL = "data:image/png;base64,..."
2189
+ * ```
2190
+ */
2191
+ declare function renderPageAsDataURL(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<RenderDataURLResult>;
2192
+ /**
2193
+ * Render multiple pages to image buffers
2194
+ *
2195
+ * @param input - PDF document, file path, or buffer
2196
+ * @param pageNums - Optional array of page numbers (1-based); omit or pass undefined for all pages
2197
+ * @param options - Render options (optional)
2198
+ * @returns Promise resolving to an array of render results
2199
+ */
2200
+ declare function renderPages(input: PDFInput, pageNums?: number[], options?: RenderOptions): Promise<RenderResult[]>;
2201
+ /**
2202
+ * Render a page and return as base64 string
2203
+ *
2204
+ * @param input - PDF document, file path, or buffer
2205
+ * @param pageNum - Page number (1-based)
2206
+ * @param options - Render options (optional)
2207
+ * @returns Promise resolving to the base64-encoded image string
2208
+ */
2209
+ declare function renderPageToBase64(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<string>;
2210
+ /**
2211
+ * Render a page as a data URL string (legacy function, use renderPageAsDataURL instead)
2212
+ *
2213
+ * @param input - PDF document, file path, or buffer
2214
+ * @param pageNum - Page number (1-based)
2215
+ * @param options - Render options (optional)
2216
+ * @returns Promise resolving to the data URL string
2217
+ *
2218
+ * @deprecated Use renderPageAsDataURL which returns more info
2219
+ */
2220
+ declare function renderPageToDataURL(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<string>;
2221
+
2222
+ /**
2223
+ * PDF Image Extraction Utilities
2224
+ *
2225
+ * Provides access to embedded images in PDF documents.
2226
+ * This is a thin wrapper around the existing ImageExtractor for consistency.
2227
+ */
2228
+
2229
+ /**
2230
+ * Options for image extraction (every field is optional)
2231
+ */
2232
+ interface ImageExtractionOptions {
2233
+ /** Extract image files to disk (default: false) */
2234
+ extractFiles?: boolean;
2235
+ /** Output directory for extracted images (NOTE(review): presumably only relevant when extractFiles is true - confirm) */
2236
+ outputDir?: string;
2237
+ /** Convert JPEG2000 to JPG (default: true) */
2238
+ convertJp2ToJpg?: boolean;
2239
+ /** Optimize extracted images (default: false) */
2240
+ optimize?: boolean;
2241
+ /** Optimization quality (0-100, default: 80; NOTE(review): presumably only used when optimize is true - confirm) */
2242
+ quality?: number;
2243
+ /** Enable verbose logging */
2244
+ verbose?: boolean;
2245
+ }
2246
+ /**
2247
+ * Result of image extraction
2248
+ */
2249
+ interface ImageExtractionResult {
2250
+ /** Array of extracted images */
2251
+ images: ImageItem[];
2252
+ /** Total number of images found (NOTE(review): presumably equals images.length - confirm) */
2253
+ count: number;
2254
+ /** Output directory (optional; present if files were extracted) */
2255
+ outputDir?: string;
2256
+ }
2257
+ /**
2258
+ * Extract images from a PDF document (exposed as `pdfUtils.extractImages`; distinct from the package's top-level `extractImages` export)
2259
+ *
2260
+ * @param source - File path or buffer
2261
+ * @param options - Extraction options (optional)
2262
+ * @returns Promise resolving to the extraction result with images
2263
+ *
2264
+ * @example
2265
+ * ```typescript
2266
+ * // Get image metadata only
2267
+ * const result = await extractImages('document.pdf');
2268
+ * console.log(`Found ${result.count} images`);
2269
+ *
2270
+ * // Extract to files
2271
+ * const extracted = await extractImages('document.pdf', {
2272
+ * extractFiles: true,
2273
+ * outputDir: './images'
2274
+ * });
2275
+ * ```
2276
+ */
2277
+ declare function extractImages$1(source: PDFSource, options?: ImageExtractionOptions): Promise<ImageExtractionResult>;
2278
+ /**
2279
+ * Get image count from a PDF without full extraction
2280
+ *
2281
+ * @param source - File path (this signature accepts only a string, not a buffer)
2282
+ * @returns Promise resolving to the number of images
2283
+ */
2284
+ declare function getImageCount(source: string): Promise<number>;
2285
+
2286
+ /**
2287
+ * Internal PDF Utilities Library
2288
+ *
2289
+ * A clean, internal library for PDF operations inspired by unpdf patterns.
2290
+ * Provides unified PDF loading, text extraction with positioning, metadata access,
2291
+ * page rendering, and image extraction.
2292
+ *
2293
+ * Key features:
2294
+ * - Single source of truth for pdf.js configuration
2295
+ * - Lazy loading of pdf.js for better startup performance
2296
+ * - Full text positioning support (our value-add over unpdf)
2297
+ * - Clean, simple API with full TypeScript support
2298
+ *
2299
+ * @example
2300
+ * ```typescript
2301
+ * import { pdfUtils } from 'pdf-plus';
2302
+ *
2303
+ * // Load and work with a PDF
2304
+ * const doc = await pdfUtils.loadPDF('document.pdf');
2305
+ *
2306
+ * // Extract text (simple)
2307
+ * const result = await pdfUtils.extractText(doc);
2308
+ * console.log(result.totalPages, result.text);
2309
+ *
2310
+ * // Extract text with positions (our value-add)
2311
+ * const items = await pdfUtils.extractTextItems(doc);
2312
+ * for (const item of items.items[0]) {
2313
+ * console.log(`"${item.str}" at (${item.x}, ${item.y})`);
2314
+ * }
2315
+ *
2316
+ * // Render page to image with target width
2317
+ * const render = await pdfUtils.renderPage(doc, 1, { width: 800 });
2318
+ * fs.writeFileSync('page1.png', render.buffer);
2319
+ *
2320
+ * // Get metadata with date parsing
2321
+ * const meta = await pdfUtils.getMetadata(doc, { parseDates: true });
2322
+ * console.log(`${meta.numPages} pages`);
2323
+ *
2324
+ * // Clean up
2325
+ * await doc.destroy();
2326
+ * ```
2327
+ *
2328
+ * @packageDocumentation
2329
+ */
2330
+
2331
+ type index_ImageExtractionOptions = ImageExtractionOptions;
2332
+ type index_ImageExtractionResult = ImageExtractionResult;
2333
+ type index_ImageFormat = ImageFormat;
2334
+ type index_LinkExtractionResult = LinkExtractionResult;
2335
+ type index_MetadataOptions = MetadataOptions;
2336
+ declare const index_PDFDocumentProxy: typeof PDFDocumentProxy;
2337
+ type index_PDFInput = PDFInput;
2338
+ type index_PDFLoadOptions = PDFLoadOptions;
2339
+ type index_PDFMetadata = PDFMetadata;
2340
+ declare const index_PDFPageProxy: typeof PDFPageProxy;
2341
+ type index_PDFSource = PDFSource;
2342
+ type index_PDFTextItem = PDFTextItem;
2343
+ type index_PageInfo = PageInfo;
2344
+ type index_RenderDataURLResult = RenderDataURLResult;
2345
+ type index_RenderOptions = RenderOptions;
2346
+ type index_RenderResult = RenderResult;
2347
+ type index_TextExtractionMeta = TextExtractionMeta;
2348
+ type index_TextExtractionOptions = TextExtractionOptions;
2349
+ type index_TextExtractionProgress = TextExtractionProgress;
2350
+ type index_TextExtractionResult<T extends string | string[]> = TextExtractionResult<T>;
2351
+ type index_TextItemsExtractionResult = TextItemsExtractionResult;
2352
+ declare const index_extractFullText: typeof extractFullText;
2353
+ declare const index_extractLinks: typeof extractLinks;
2354
+ declare const index_extractPageText: typeof extractPageText;
2355
+ declare const index_extractPageTextItems: typeof extractPageTextItems;
2356
+ declare const index_extractTextItems: typeof extractTextItems;
2357
+ declare const index_getAllPagesInfo: typeof getAllPagesInfo;
2358
+ declare const index_getDocumentProxy: typeof getDocumentProxy;
2359
+ declare const index_getImageCount: typeof getImageCount;
2360
+ declare const index_getMetadata: typeof getMetadata;
2361
+ declare const index_getPDFJS: typeof getPDFJS;
2362
+ declare const index_getPageCount: typeof getPageCount;
2363
+ declare const index_getPageInfo: typeof getPageInfo;
2364
+ declare const index_getVerbosityLevel: typeof getVerbosityLevel;
2365
+ declare const index_isBrowser: typeof isBrowser;
2366
+ declare const index_isNode: typeof isNode;
2367
+ declare const index_isPDF: typeof isPDF;
2368
+ declare const index_isPDFDocumentProxy: typeof isPDFDocumentProxy;
2369
+ declare const index_loadPDF: typeof loadPDF;
2370
+ declare const index_renderPage: typeof renderPage;
2371
+ declare const index_renderPageAsDataURL: typeof renderPageAsDataURL;
2372
+ declare const index_renderPageToBase64: typeof renderPageToBase64;
2373
+ declare const index_renderPageToDataURL: typeof renderPageToDataURL;
2374
+ declare const index_renderPages: typeof renderPages;
2375
+ declare const index_validatePageNumber: typeof validatePageNumber;
2376
+ declare namespace index {
2377
+ export { type index_ImageExtractionOptions as ImageExtractionOptions, type index_ImageExtractionResult as ImageExtractionResult, type index_ImageFormat as ImageFormat, type index_LinkExtractionResult as LinkExtractionResult, type index_MetadataOptions as MetadataOptions, index_PDFDocumentProxy as PDFDocumentProxy, type index_PDFInput as PDFInput, type index_PDFLoadOptions as PDFLoadOptions, type index_PDFMetadata as PDFMetadata, index_PDFPageProxy as PDFPageProxy, type index_PDFSource as PDFSource, type index_PDFTextItem as PDFTextItem, type index_PageInfo as PageInfo, type index_RenderDataURLResult as RenderDataURLResult, type index_RenderOptions as RenderOptions, type index_RenderResult as RenderResult, type index_TextExtractionMeta as TextExtractionMeta, type index_TextExtractionOptions as TextExtractionOptions, type index_TextExtractionProgress as TextExtractionProgress, type index_TextExtractionResult as TextExtractionResult, type index_TextItemsExtractionResult as TextItemsExtractionResult, index_extractFullText as extractFullText, extractImages$1 as extractImages, index_extractLinks as extractLinks, index_extractPageText as extractPageText, index_extractPageTextItems as extractPageTextItems, extractText$1 as extractText, index_extractTextItems as extractTextItems, index_getAllPagesInfo as getAllPagesInfo, index_getDocumentProxy as getDocumentProxy, index_getImageCount as getImageCount, index_getMetadata as getMetadata, index_getPDFJS as getPDFJS, index_getPageCount as getPageCount, index_getPageInfo as getPageInfo, index_getVerbosityLevel as getVerbosityLevel, index_isBrowser as isBrowser, index_isNode as isNode, index_isPDF as isPDF, index_isPDFDocumentProxy as isPDFDocumentProxy, index_loadPDF as loadPDF, index_renderPage as renderPage, index_renderPageAsDataURL as renderPageAsDataURL, index_renderPageToBase64 as renderPageToBase64, index_renderPageToDataURL as renderPageToDataURL, index_renderPages as renderPages, index_validatePageNumber as 
validatePageNumber };
2378
+ }
2379
+
1574
2380
  /**
1575
2381
  * Validate extractor configuration
1576
2382
  */
@@ -1746,10 +2552,46 @@ declare function generatePageImages(pdfPath: string, outputDir?: string, options
1746
2552
  * ```
1747
2553
  */
1748
2554
  declare function extractPdfStream(pdfPath: string, options?: Partial<ExtractionOptions>): StreamingExtractionResult;
2555
+ /**
2556
+ * Extract tables from a PDF file (convenience function)
2557
+ *
2558
+ * Detects and extracts tables from a PDF document using text positioning data.
2559
+ * Tables are detected through spatial clustering of text items.
2560
+ *
2561
+ * @param pdfPath - Path to the PDF file
2562
+ * @param options - Table extraction options (optional)
2563
+ * @returns Promise resolving to table extraction result
2564
+ *
2565
+ * @example
2566
+ * ```typescript
2567
+ * import { extractTables, TableExtractor } from 'pdf-plus';
2568
+ *
2569
+ * // Using convenience function
2570
+ * const result = await extractTables('document.pdf', {
2571
+ * pages: [1, 2, 3],
2572
+ * detectHeaders: true,
2573
+ * minRows: 2,
2574
+ * minColumns: 2
2575
+ * });
2576
+ *
2577
+ * console.log(`Found ${result.tableCount} tables`);
2578
+ *
2579
+ * // Access table data (one extractor instance is reused for every table)
2580
+ * const extractor = new TableExtractor();
2581
+ * for (const table of result.tables) {
2582
+ * console.log(`Table on page ${table.page}: ${table.rowCount}x${table.columnCount}`);
2583
+ *
2584
+ * // Convert to different formats
2585
+ * console.log(extractor.tableToMarkdown(table));
2586
+ * console.log(extractor.tableToCSV(table));
2587
+ * }
2588
+ * ```
2589
+ */
2590
+ declare function extractTables(pdfPath: string, options?: TableExtractionOptions): Promise<TableExtractionResult>;
1749
2591
  /**
1750
2592
  * Library version
1751
2593
  */
1752
- declare const version = "1.0.3";
2594
+ declare const version = "2.0.0";
1753
2595
  /**
1754
2596
  * Default export containing all public APIs
1755
2597
  * Useful for CommonJS: const pdfPlus = require('pdf-plus');
@@ -1760,6 +2602,7 @@ declare const _default: {
1760
2602
  StreamingPDFExtractor: typeof StreamingPDFExtractor;
1761
2603
  TextExtractor: typeof TextExtractor;
1762
2604
  ImageExtractor: typeof ImageExtractor;
2605
+ TableExtractor: typeof TableExtractor;
1763
2606
  ImageOptimizer: typeof ImageOptimizer;
1764
2607
  FormatProcessor: typeof FormatProcessor;
1765
2608
  extractPdfContent: typeof extractPdfContent;
@@ -1768,10 +2611,11 @@ declare const _default: {
1768
2611
  extractImageFiles: typeof extractImageFiles;
1769
2612
  generatePageImages: typeof generatePageImages;
1770
2613
  extractPdfStream: typeof extractPdfStream;
2614
+ extractTables: typeof extractTables;
1771
2615
  validateConfig: typeof validateConfig;
1772
2616
  validateImageRefFormat: typeof validateImageRefFormat;
1773
2617
  validateFilePath: typeof validateFilePath;
1774
2618
  version: string;
1775
2619
  };
1776
2620
 
1777
- export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, PopplerConverter, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, generatePageImages, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
2621
+ export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo$1 as PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type Table, type TableCell, type TableColumn, type TableExtractionOptions, type TableExtractionResult, TableExtractor, type TableRow, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractTables, extractText, generatePageImages, pdfExtractor, index as pdfUtils, validateConfig, validateFilePath, validateImageRefFormat, version };