npm - pdf-plus - Versions diffs - 1.3.0 → 2.0.0 - Mend

pdf-plus 1.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.d.mts +959 -115
package/dist/index.d.ts +959 -115
package/dist/index.js +39 -38
package/dist/index.mjs +39 -38
package/dist/workers/jp2-converter.worker.js +1 -1
package/dist/workers/jp2-converter.worker.mjs +1 -1
package/package.json +2 -6

package/dist/index.d.mts CHANGED

@@ -1,3 +1,128 @@
+import * as pdfjs_dist_legacy_build_pdf_mjs from 'pdfjs-dist/legacy/build/pdf.mjs';
+import { PDFDocumentProxy, PDFPageProxy } from 'pdfjs-dist/legacy/build/pdf.mjs';
+/**
+ * Table Extraction Types
+ *
+ * Type definitions for automatic table detection and extraction from PDFs.
+ */
+/**
+ * A detected table in the PDF
+ */
+interface Table {
+    /** Unique identifier for the table */
+    id: string;
+    /** Page number where the table is located (1-based) */
+    page: number;
+    /** Bounding box position of the table */
+    position: Position;
+    /** Array of rows in the table */
+    rows: TableRow[];
+    /** Array of column definitions */
+    columns: TableColumn[];
+    /** Confidence score for table detection (0-1) */
+    confidence: number;
+    /** Whether the table has a detected header row */
+    hasHeader: boolean;
+    /** Number of rows in the table */
+    rowCount: number;
+    /** Number of columns in the table */
+    columnCount: number;
+}
+/**
+ * A row in a table
+ */
+interface TableRow {
+    /** Row index (0-based) */
+    index: number;
+    /** Y position of the row */
+    y: number;
+    /** Height of the row */
+    height: number;
+    /** Cells in this row */
+    cells: TableCell[];
+    /** Whether this row is a header row */
+    isHeader: boolean;
+}
+/**
+ * A column in a table
+ */
+interface TableColumn {
+    /** Column index (0-based) */
+    index: number;
+    /** X position of the column */
+    x: number;
+    /** Width of the column */
+    width: number;
+    /** Column header text (if detected) */
+    header?: string;
+}
+/**
+ * A cell in a table
+ */
+interface TableCell {
+    /** Row index (0-based) */
+    row: number;
+    /** Column index (0-based) */
+    column: number;
+    /** Text content of the cell */
+    content: string;
+    /** Position of the cell */
+    position: Position;
+    /** Whether this cell is in a header row */
+    isHeader: boolean;
+    /** Row span (for merged cells, default: 1) */
+    rowSpan?: number;
+    /** Column span (for merged cells, default: 1) */
+    colSpan?: number;
+}
+/**
+ * Options for table extraction
+ */
+interface TableExtractionOptions {
+    /** Specific pages to extract tables from (1-based). If not specified, all pages are processed. */
+    pages?: number[];
+    /** Y-position tolerance for grouping text items into rows (default: 3) */
+    rowTolerance?: number;
+    /** X-position tolerance for grouping text items into columns (default: 5) */
+    columnTolerance?: number;
+    /** Minimum number of columns to consider a valid table (default: 2) */
+    minColumns?: number;
+    /** Minimum number of rows to consider a valid table (default: 2) */
+    minRows?: number;
+    /** Minimum grid density (filled cells / total cells) to consider valid (default: 0.6) */
+    minGridDensity?: number;
+    /** Whether to detect header rows (default: true) */
+    detectHeaders?: boolean;
+    /** Enable verbose logging (default: false) */
+    verbose?: boolean;
+}
+/**
+ * Result of table extraction
+ */
+interface TableExtractionResult {
+    /** Array of detected tables */
+    tables: Table[];
+    /** Total number of pages processed */
+    pagesProcessed: number;
+    /** Total number of tables found */
+    tableCount: number;
+    /** Extraction metadata */
+    metadata: TableExtractionMetadata;
+}
+/**
+ * Metadata about the extraction process
+ */
+interface TableExtractionMetadata {
+    /** Time taken to extract tables (in milliseconds) */
+    extractionTimeMs: number;
+    /** Options used for extraction */
+    options: TableExtractionOptions;
+    /** Number of candidate tables that were filtered out */
+    filteredCandidates: number;
+}
 /**
  * Types for streaming PDF extraction
  */
@@ -30,7 +155,7 @@ interface PageEvent extends StreamEvent {
     totalPages: number;
     textLength: number;
     imageCount: number;
-    pageInfo?: PageInfo;
+    pageInfo?: PageInfo$1;
 }
 /**
  * Image event - emitted when an image is extracted
@@ -240,7 +365,7 @@ interface ImageItem {
     filePath?: string;
     data?: Uint8Array;
 }
-interface PageInfo {
+interface PageInfo$1 {
     number: number;
     width: number;
     height: number;
@@ -259,7 +384,7 @@ interface DocumentMetadata {
 }
 interface ExtractionResult {
     document: DocumentMetadata;
-    pages: PageInfo[];
+    pages: PageInfo$1[];
     images: ImageItem[];
     textItems: TextItem[];
     text: string;
@@ -267,7 +392,10 @@ interface ExtractionResult {
     cleanText: string;
     summary?: DocumentSummary;
     structuredData?: StructuredPageData;
+    /** Detected tables (only populated when extractTables: true) */
+    tables?: Table[];
 }
 interface DocumentSummary {
     totalPages: number;
     totalTextItems: number;
@@ -378,39 +506,6 @@ interface ExtractionOptions {
      * (default: false - convert to JPG)
      */
     preserveJp2?: boolean;
-    /**
-     * Use Sharp library for ALL image processing operations (better quality & performance).
-     *
-     * When enabled, Sharp is used as the global image processing engine for:
-     * - JP2 to JPG conversion
-     * - Image optimization
-     * - Image resizing
-     * - Format conversions
-     *
-     * Sharp is an OPTIONAL dependency. Install it for better performance:
-     * ```bash
-     * npm install sharp
-     * ```
-     *
-     * If Sharp is not installed, the library will automatically fall back to pure JavaScript (Jimp).
-     *
-     * (default: false - use pure JS Jimp)
-     */
-    useSharp?: boolean;
-    /**
-     * Use Poppler's pdfimages as fallback when standard extraction finds no images.
-     * Poppler can extract images that are embedded in non-standard ways (Form XObjects, inline images, etc.)
-     * that the standard XObject-based extraction might miss.
-     *
-     * Requires poppler-utils to be installed on the system.
-     *
-     * Installation:
-     * - Linux: sudo apt-get install poppler-utils
-     * - macOS: brew install poppler
-     *
-     * (default: false)
-     */
-    usePopplerFallback?: boolean;
     /** Enable parallel processing for better performance (default: true) */
     parallelProcessing?: boolean;
     /** Maximum number of pages to process in parallel (default: 10) */
@@ -480,15 +575,25 @@ interface ExtractionOptions {
     /** Quality for JPG page images (default: 90) */
     pageImageQuality?: number;
     /**
-     * Page rendering engine: 'pdfjs' | 'poppler' (default: 'pdfjs')
-     * - pdfjs: Pure JavaScript, no dependencies, but NO JP2 support
-     * - poppler: Requires system poppler-utils, but HAS full JP2 support
+     * Page rendering engine (default: 'pdfjs')
+     *
+     * Note: Poppler support has been removed. Only 'pdfjs' is now supported.
+     * This option is kept for backwards compatibility but is ignored.
+     *
+     * @deprecated Poppler support removed - pdfjs is now the only engine
      */
-    pageRenderEngine?: "pdfjs" | "poppler";
+    pageRenderEngine?: "pdfjs";
     /** Thumbnail width (default: 200) */
     thumbnailWidth?: number;
     /** Thumbnail quality for JPG (default: 80) */
     thumbnailQuality?: number;
+    /**
+     * Enable table extraction (default: false)
+     * When enabled, tables will be detected and included in the result
+     */
+    extractTables?: boolean;
+    /** Options for table extraction */
+    tableOptions?: TableExtractionOptions;
 }
 interface ProgressInfo {
     currentPage: number;
@@ -593,7 +698,6 @@ declare class PDFExtractor {
     private textExtractor;
     private imageExtractor;
     private pageToImageConverter;
-    private popplerConverter;
     private formatProcessor;
     private structuredDataGenerator;
     private cacheManager;
@@ -818,13 +922,8 @@ declare class StreamingPDFExtractor implements StreamingExtractionResult {
  * ```
  */
 declare class TextExtractor {
-    constructor();
-    /**
-     * Initialize pdf.js worker
-     */
-    private initializePdfjs;
     /**
-     * Load PDF document
+     * Load PDF document using internal pdf utils
      */
     private loadDocument;
     /**
@@ -936,11 +1035,6 @@ declare class StructuredTextExtractor {
     private pdfLibDoc;
     private pdfLibPages;
     private textData;
-    constructor();
-    /**
-     * Initialize pdf.js worker
-     */
-    private initializePdfjs;
     /**
      * Process PDF with accurate page-by-page extraction
      */
@@ -1074,9 +1168,10 @@ declare class ImageExtractor {
      */
     private extractImageData;
     /**
-     * Detect image format from binary data (from NestJS implementation)
+     * Detect image format from binary data
+     * Uses centralized image format detection utility
      */
-    private detectImageFormat;
+    private detectImageFormatLocal;
     /**
      * Create a PNG file from raw pixel data using actual PDF metadata
      */
@@ -1298,23 +1393,12 @@ interface ThumbnailOptions extends SinglePageOptions {
  *   dpi: 150
  * });
  * ```
+ *
+ * NOTE: pdf.js does not support JPEG2000 (JP2) images by default.
+ * Pages with JP2 images will have blank spaces where the images should be.
+ * The embedded images are still extracted correctly via extractImages option.
  */
 declare class PageToImageConverter {
-    private pdfjs;
-    /**
-     * Get or load pdf.js module with proper worker configuration
-     * Based on pdf-to-img library approach
-     *
-     * NOTE: pdf.js does not support JPEG2000 (JP2) images by default.
-     * Pages with JP2 images will have blank spaces where the images should be.
-     * The embedded images are still extracted correctly via extractImages option.
-     *
-     * For complete page rendering with JP2 support, consider using:
-     * - Poppler (pdf-poppler npm package) - requires system dependency
-     * - ImageMagick - requires system dependency
-     * - Ghostscript - requires system dependency
-     */
-    private getPdfjs;
     /**
      * Convert all pages of a PDF to images
      *
@@ -1363,12 +1447,13 @@ declare class PageToImageConverter {
     /**
      * Render a PDF page to image buffer
      *
-     * Based on pdf-to-img library approach - let pdf.js handle canvas creation
-     * @see https://github.com/k-yle/pdf-to-img
+     * Uses @napi-rs/canvas via custom canvas factory for high-performance rendering
      */
     private renderPageToBuffer;
     /**
      * Convert canvas to image buffer
+     *
+     * Uses @napi-rs/canvas async encode() for JPEG/WebP quality control
      */
     private canvasToBuffer;
     /**
@@ -1390,47 +1475,67 @@ declare class PageToImageConverter {
 }
 /**
- * Poppler-based PDF Page to Image Converter
+ * Table Extractor
  *
- * Uses Poppler's pdfToCairo for high-quality rendering with full JPEG2000 support.
- * Requires poppler-utils to be installed on the system.
- *
- * Installation:
- * - Linux: sudo apt-get install poppler-utils
- * - macOS: brew install poppler
- * - Windows: Download from https://blog.alivate.com.au/poppler-windows/
+ * Main class for detecting and extracting tables from PDF documents.
+ * Uses text positioning data to identify table structures.
  */
-declare class PopplerConverter {
-    private poppler;
+/**
+ * TableExtractor class for detecting and extracting tables from PDFs
+ *
+ * @example
+ * ```typescript
+ * const extractor = new TableExtractor();
+ * const result = await extractor.extract('document.pdf', {
+ *   detectHeaders: true,
+ *   minRows: 3
+ * });
+ *
+ * for (const table of result.tables) {
+ *   console.log(extractor.tableToMarkdown(table));
+ * }
+ * ```
+ */
+declare class TableExtractor {
     /**
-     * Get or initialize Poppler instance
+     * Extract tables from a PDF file
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param options - Extraction options
+     * @returns Promise resolving to extraction result
      */
-    private getPoppler;
+    extract(pdfPath: string, options?: TableExtractionOptions): Promise<TableExtractionResult>;
     /**
-     * Convert PDF pages to images using Poppler
-     *
-     * @param pdfPath - Path to PDF file
-     * @param options - Conversion options
-     * @returns Conversion result with image paths
+     * Detects tables on a single page
      */
-    convertToImages(pdfPath: string, options: PageToImageOptions): Promise<PageToImageResult>;
+    private detectTablesOnPage;
     /**
-     * Get PDF information using pdfinfo
+     * Builds a Table object from a validated candidate
      */
-    private getPdfInfo;
+    private buildTable;
     /**
-     * Get image dimensions
+     * Converts a table to a 2D array of strings
      */
-    private getImageDimensions;
+    tableToArray(table: Table, includeHeaders?: boolean): string[][];
     /**
-     * Format filename pattern
+     * Converts a table to CSV format
      */
-    private formatFilename;
+    tableToCSV(table: Table, delimiter?: string): string;
     /**
-     * Format bytes to human-readable string
+     * Converts a table to Markdown format
      */
-    private formatBytes;
+    tableToMarkdown(table: Table): string;
+    /**
+     * Converts a table to HTML format
+     */
+    tableToHTML(table: Table, options?: {
+        tableClass?: string;
+    }): string;
+    /**
+     * Converts a table to an array of objects (using headers as keys)
+     */
+    tableToObjects(table: Table): Array<Record<string, string>>;
 }
 /**
@@ -1442,7 +1547,7 @@ interface OptimizationResult {
     optimizedSize: number;
     savedBytes: number;
     savedPercent: number;
-    engine: "jimp" | "sharp" | "none";
+    engine: "canvas" | "none";
     error?: string;
 }
 /**
@@ -1451,19 +1556,17 @@ interface OptimizationResult {
 interface OptimizationOptions {
     quality?: number;
     verbose?: boolean;
-    useSharp?: boolean;
 }
 /**
- * Image optimizer using Jimp (pure JavaScript)
+ * Image optimizer using @napi-rs/canvas
  *
- * This class provides image optimization capabilities using Jimp, a pure JavaScript
- * image processing library with no native dependencies. It supports JPEG and PNG
+ * This class provides image optimization capabilities using @napi-rs/canvas,
+ * a high-performance Skia-based canvas library. It supports JPEG, PNG, and WebP
  * optimization with quality control.
  *
  * @example
  * ```typescript
  * const result = await ImageOptimizer.optimizeFile('image.jpg', {
- *   engine: 'auto',
  *   quality: 80
  * });
  *
@@ -1483,22 +1586,16 @@ declare class ImageOptimizer {
      */
     static optimizeFile(filePath: string, options?: OptimizationOptions): Promise<OptimizationResult>;
     /**
-     * Optimize using Sharp (optional dependency)
-     */
-    private static optimizeWithSharp;
-    /**
-     * Optimize using Jimp (pure JavaScript)
+     * Optimize using @napi-rs/canvas (Skia-based)
      */
-    private static optimizeWithJimp;
+    private static optimizeWithCanvas;
     /**
      * Convert JPEG 2000 formats (jp2, jpx, j2c, jpm) to JPG
      *
      * JPEG 2000 files are not widely supported by browsers and image tools.
      * This method converts them to standard JPG format for better compatibility.
      *
-     * Supports two conversion engines:
-     * - Jimp (default): Pure JavaScript, works everywhere
-     * - Sharp (optional): Better color preservation, requires native compilation
+     * Uses @napi-rs/canvas with OpenJPEG WASM decoder for high-performance conversion.
      *
      * @param jp2Path - Path to the JPEG 2000 file (jp2, jpx, j2c, or jpm)
      * @param options - Conversion options
@@ -1507,7 +1604,6 @@ declare class ImageOptimizer {
     static convertJp2ToJpg(jp2Path: string, options?: {
         quality?: number;
         verbose?: boolean;
-        useSharp?: boolean;
     }): Promise<{
         success: boolean;
         newPath?: string;
@@ -1571,6 +1667,716 @@ declare class FormatProcessor {
     formatDuration(milliseconds: number): string;
 }
+/**
+ * Type definitions for the internal PDF utilities library
+ *
+ * Provides clean interfaces for PDF operations inspired by unpdf patterns.
+ */
+/**
+ * Source for loading a PDF - either a file path or raw bytes
+ */
+type PDFSource = string | Uint8Array | Buffer;
+/**
+ * Input type for PDF operations - accepts either raw data or an already loaded document
+ */
+type PDFInput = PDFSource | PDFDocumentProxy;
+/**
+ * Supported image formats for rendering
+ */
+type ImageFormat = "png" | "jpeg" | "webp";
+/**
+ * Options for loading a PDF document
+ */
+interface PDFLoadOptions {
+    /** Password for encrypted PDFs */
+    password?: string;
+    /** Verbosity level for pdfjs logging */
+    verbosity?: number;
+}
+/**
+ * Text item with full positioning information
+ */
+interface PDFTextItem {
+    /** The text string */
+    str: string;
+    /** X position (from transform matrix) */
+    x: number;
+    /** Y position (from transform matrix) */
+    y: number;
+    /** Width of the text item */
+    width: number;
+    /** Height of the text item */
+    height: number;
+    /** Font name */
+    fontName: string;
+    /** Font size (derived from transform) */
+    fontSize: number;
+    /** Full transform matrix [a, b, c, d, e, f] */
+    transform: number[];
+    /** Whether this item ends with EOL */
+    hasEOL: boolean;
+    /** Text direction ("ltr", "rtl", "ttb", or "btt") */
+    dir: "ltr" | "rtl" | "ttb" | "btt";
+}
+/**
+ * Progress information for text extraction
+ */
+interface TextExtractionProgress {
+    /** Number of pages processed so far */
+    processedPages: number;
+    /** Total number of pages to process */
+    totalPages: number;
+    /** Percentage complete (0-100) */
+    percentage: number;
+    /** Current page being processed (1-based) */
+    currentPage?: number;
+}
+/**
+ * Performance metadata for text extraction
+ */
+interface TextExtractionMeta {
+    /** Duration in milliseconds */
+    duration: number;
+    /** Number of pages processed */
+    pagesProcessed: number;
+    /** Processing method used */
+    method: "parallel" | "sequential" | "chunked";
+}
+/**
+ * Options for text extraction
+ */
+interface TextExtractionOptions {
+    /** First page to extract (1-based, default: 1) */
+    firstPage?: number;
+    /** Last page to extract (1-based, default: all pages) */
+    lastPage?: number;
+    /** Include marked content in extraction */
+    includeMarkedContent?: boolean;
+    /** Disable text normalization */
+    disableNormalization?: boolean;
+    /** Merge all pages into a single string (default: false) */
+    mergePages?: boolean;
+    /** Maximum concurrent page extractions (default: 10) */
+    maxConcurrency?: number;
+    /** Progress callback called after each page is processed */
+    onProgress?: (progress: TextExtractionProgress) => void;
+    /** Chunk size for processing very large PDFs (default: undefined = no chunking) */
+    chunkSize?: number;
+    /** Callback called after each chunk is processed (when chunkSize is set) */
+    onChunkComplete?: (info: {
+        chunkIndex: number;
+        totalChunks: number;
+        pagesProcessed: number;
+    }) => void;
+}
+/**
+ * Result of text extraction
+ */
+interface TextExtractionResult<T extends string | string[]> {
+    /** Total number of pages in the document */
+    totalPages: number;
+    /** Extracted text - string[] when mergePages is false, string when true */
+    text: T;
+    /** Performance metadata (available when extraction completes) */
+    _meta?: TextExtractionMeta;
+}
+/**
+ * Result of text items extraction
+ */
+interface TextItemsExtractionResult {
+    /** Total number of pages in the document */
+    totalPages: number;
+    /** Text items per page */
+    items: PDFTextItem[][];
+    /** Performance metadata (available when extraction completes) */
+    _meta?: TextExtractionMeta;
+}
+/**
+ * Options for metadata extraction
+ */
+interface MetadataOptions {
+    /** Parse date strings (CreationDate, ModDate) into Date objects (default: false) */
+    parseDates?: boolean;
+}
+/**
+ * Result of link extraction
+ */
+interface LinkExtractionResult {
+    /** Total number of pages in the document */
+    totalPages: number;
+    /** Extracted URLs from the document */
+    links: string[];
+}
+/**
+ * Options for page rendering
+ */
+interface RenderOptions {
+    /** Scale factor (default: 1). Ignored if width or height is set. */
+    scale?: number;
+    /** DPI for rendering (default: 72, affects scale) */
+    dpi?: number;
+    /** Target width in pixels. Auto-calculates scale to fit. */
+    width?: number;
+    /** Target height in pixels. Auto-calculates scale to fit. */
+    height?: number;
+    /** Output format (default: 'png') */
+    format?: ImageFormat;
+    /** Quality for JPEG/WebP (0-100, default: 90) */
+    quality?: number;
+    /** Background color (default: '#FFFFFF') */
+    backgroundColor?: string;
+    /** Transparent background (default: false) */
+    transparent?: boolean;
+}
+/**
+ * Result of rendering a page
+ */
+interface RenderResult {
+    /** Image buffer */
+    buffer: Buffer;
+    /** Image width in pixels */
+    width: number;
+    /** Image height in pixels */
+    height: number;
+    /** Output format */
+    format: ImageFormat;
+}
+/**
+ * Result of rendering a page as data URL
+ */
+interface RenderDataURLResult {
+    /** Data URL string (e.g., "data:image/png;base64,...") */
+    dataURL: string;
+    /** Image width in pixels */
+    width: number;
+    /** Image height in pixels */
+    height: number;
+    /** Output format */
+    format: ImageFormat;
+}
+/**
+ * PDF document metadata
+ */
+interface PDFMetadata {
+    /** Number of pages */
+    numPages: number;
+    /** PDF info dictionary */
+    info: Record<string, unknown>;
+    /** PDF metadata (XMP) */
+    metadata: Record<string, unknown> | null;
+    /** PDF format version */
+    version: string;
+    /** Whether the PDF is encrypted */
+    isEncrypted: boolean;
+    /** Whether the PDF is linearized (fast web view) */
+    isLinearized: boolean;
+}
+/**
+ * Page dimensions and properties
+ */
+interface PageInfo {
+    /** Page number (1-based) */
+    pageNumber: number;
+    /** Page width in points */
+    width: number;
+    /** Page height in points */
+    height: number;
+    /** Page rotation in degrees */
+    rotation: number;
+    /** Viewport at scale 1 */
+    viewport: {
+        width: number;
+        height: number;
+        scale: number;
+    };
+}
+/**
+ * Check if running in Node.js environment
+ */
+declare const isNode: boolean;
+/**
+ * Check if running in browser environment
+ */
+declare const isBrowser: boolean;
+/**
+ * Check if a value is a PDFDocumentProxy instance
+ *
+ * Uses internal pdfjs property for reliable detection.
+ *
+ * @param data - Value to check
+ * @returns True if the value is a PDFDocumentProxy
+ *
+ * @example
+ * ```typescript
+ * if (isPDFDocumentProxy(input)) {
+ *   // input is typed as PDFDocumentProxy
+ *   console.log(input.numPages);
+ * }
+ * ```
+ */
+declare function isPDFDocumentProxy(data: unknown): data is PDFDocumentProxy;
+/**
+ * Get the pdf.js module, initializing it lazily
+ *
+ * This ensures pdf.js is only loaded when needed and worker
+ * configuration happens exactly once.
+ */
+declare function getPDFJS(): Promise<typeof pdfjs_dist_legacy_build_pdf_mjs>;
+/**
+ * Get the pdf.js verbosity level enum
+ */
+declare function getVerbosityLevel(): Promise<typeof pdfjs_dist_legacy_build_pdf_mjs.VerbosityLevel>;
+/**
+ * Load a PDF document from a file path or buffer
+ *
+ * Applies sensible defaults:
+ * - `isEvalSupported: false` (security)
+ * - `useSystemFonts: true` (better font rendering)
+ *
+ * @param source - File path string or Uint8Array/Buffer of PDF data
+ * @param options - Loading options
+ * @returns PDFDocumentProxy
+ *
+ * @example
+ * ```typescript
+ * // Load from file path
+ * const doc = await loadPDF('document.pdf');
+ *
+ * // Load from buffer
+ * const buffer = fs.readFileSync('document.pdf');
+ * const doc = await loadPDF(buffer);
+ *
+ * // With password
+ * const doc = await loadPDF('encrypted.pdf', { password: 'secret' });
+ * ```
+ */
+declare function loadPDF(source: PDFSource, options?: PDFLoadOptions): Promise<PDFDocumentProxy>;
+/**
+ * Get a PDFDocumentProxy from input (loads if necessary)
+ *
+ * This is a convenience function that handles both raw data and
+ * already-loaded documents uniformly.
+ *
+ * @param input - PDF source or already loaded document
+ * @param options - Loading options (only used if input is not already a document)
+ * @returns PDFDocumentProxy
+ *
+ * @example
+ * ```typescript
+ * // Works with file path
+ * const doc1 = await getDocumentProxy('document.pdf');
+ *
+ * // Works with already loaded document (returns as-is)
+ * const doc2 = await getDocumentProxy(existingDoc);
+ * ```
+ */
+declare function getDocumentProxy(input: PDFInput, options?: PDFLoadOptions): Promise<PDFDocumentProxy>;
+/**
+ * Load a PDF and get the number of pages quickly
+ *
+ * Useful for determining if streaming should be enabled.
+ *
+ * @param source - File path or buffer
+ * @returns Number of pages
+ */
+declare function getPageCount(source: PDFSource): Promise<number>;
+/**
+ * Check if a file is a valid PDF
+ *
+ * @param source - File path or buffer
+ * @returns True if the source appears to be a valid PDF
+ */
+declare function isPDF(source: PDFSource): Promise<boolean>;
+/**
+ * Validate page number against document bounds
+ *
+ * @param pageNum - Page number to validate (1-based)
+ * @param totalPages - Total pages in document
+ * @throws Error if page number is invalid
+ */
+declare function validatePageNumber(pageNum: number, totalPages: number): void;
+/**
+ * PDF Text Extraction Utilities
+ *
+ * Provides text extraction with full positioning support.
+ * This is our value-add over unpdf - we include positions!
+ */
+/**
+ * Extract text from all pages
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param options - Extraction options
+ * @returns Object with totalPages and text (one string per page, or a single merged string when mergePages is true)
+ *
+ * @example
+ * ```typescript
+ * // Get text as array of pages
+ * const result = await extractText('document.pdf');
+ * console.log(`Page 1: ${result.text[0]}`);
+ *
+ * // Get text as single merged string
+ * const merged = await extractText('document.pdf', { mergePages: true });
+ * console.log(merged.text); // string
+ * ```
+ */
+declare function extractText$1(input: PDFInput, options?: TextExtractionOptions & {
+    mergePages?: false;
+}): Promise<TextExtractionResult<string[]>>;
+declare function extractText$1(input: PDFInput, options: TextExtractionOptions & {
+    mergePages: true;
+}): Promise<TextExtractionResult<string>>;
+/**
+ * Extract text with full positioning information
+ *
+ * This is the main value-add function - provides detailed text items
+ * with x, y, width, height, font info, etc.
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param options - Extraction options
+ * @returns Object with totalPages and items array per page
+ *
+ * @example
+ * ```typescript
+ * const result = await extractTextItems('document.pdf');
+ * for (const item of result.items[0]) {
+ *   console.log(`"${item.str}" at (${item.x}, ${item.y})`);
+ * }
+ * ```
+ */
+declare function extractTextItems(input: PDFInput, options?: Omit<TextExtractionOptions, "mergePages">): Promise<TextItemsExtractionResult>;
+/**
+ * Extract text from a single page
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param pageNum - Page number (1-based)
+ * @param options - Extraction options
+ * @returns Text string for the page
+ */
+declare function extractPageText(input: PDFInput, pageNum: number, options?: Omit<TextExtractionOptions, "firstPage" | "lastPage" | "mergePages">): Promise<string>;
+/**
+ * Extract text items from a single page
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param pageNum - Page number (1-based)
+ * @param options - Extraction options
+ * @returns Array of text items
+ */
+declare function extractPageTextItems(input: PDFInput, pageNum: number, options?: Omit<TextExtractionOptions, "firstPage" | "lastPage" | "mergePages">): Promise<PDFTextItem[]>;
+/**
+ * Extract all text as a single string
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param options - Extraction options
+ * @param pageSeparator - String to join pages (default: "\n\n")
+ * @returns Combined text from all pages
+ *
+ * @deprecated Use extractText with { mergePages: true } instead
+ */
+declare function extractFullText(input: PDFInput, options?: Omit<TextExtractionOptions, "mergePages">, pageSeparator?: string): Promise<string>;
+/**
+ * PDF Metadata Extraction Utilities
+ *
+ * Provides access to PDF document metadata.
+ */
+/**
+ * Extract metadata from a PDF document
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param options - Metadata extraction options
+ * @returns PDF metadata
+ *
+ * @example
+ * ```typescript
+ * const meta = await getMetadata('document.pdf');
+ * console.log(`${meta.numPages} pages, version ${meta.version}`);
+ *
+ * // With date parsing
+ * const metaDates = await getMetadata('document.pdf', { parseDates: true });
+ * if (metaDates.info.CreationDate instanceof Date) {
+ *   console.log('Created:', metaDates.info.CreationDate.toISOString());
+ * }
+ * ```
+ */
+declare function getMetadata(input: PDFInput, options?: MetadataOptions): Promise<PDFMetadata>;
+/**
+ * Get information about a specific page
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param pageNum - Page number (1-based)
+ * @returns Page information
+ */
+declare function getPageInfo(input: PDFInput, pageNum: number): Promise<PageInfo>;
+/**
+ * Get information about all pages
+ *
+ * @param input - PDF document, file path, or buffer
+ * @returns Array of page information
+ */
+declare function getAllPagesInfo(input: PDFInput): Promise<PageInfo[]>;
+/**
+ * PDF Link Extraction Utilities
+ *
+ * Extracts URLs from PDF annotations (hyperlinks).
+ */
+/**
+ * Extract all links (URLs) from a PDF document
+ *
+ * Extracts hyperlinks from PDF annotations across all pages.
+ *
+ * @param input - PDF document, file path, or buffer
+ * @returns Object with totalPages and unique links array
+ *
+ * @example
+ * ```typescript
+ * const result = await extractLinks('document.pdf');
+ * console.log(`Found ${result.links.length} links in ${result.totalPages} pages`);
+ * for (const url of result.links) {
+ *   console.log(url);
+ * }
+ * ```
+ */
+declare function extractLinks(input: PDFInput): Promise<LinkExtractionResult>;
+/**
+ * PDF Page Rendering Utilities
+ *
+ * Renders PDF pages to images using @napi-rs/canvas.
+ */
+/**
+ * Render a PDF page to an image buffer
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param pageNum - Page number (1-based)
+ * @param options - Render options (the example below uses `scale`, `width`, and `height`)
+ * @returns Promise resolving to the render result with buffer and dimensions
+ *
+ * @example
+ * ```typescript
+ * // Using scale
+ * const scaled = await renderPage('document.pdf', 1, { scale: 2 });
+ *
+ * // Using target width (auto-calculates scale)
+ * const byWidth = await renderPage('document.pdf', 1, { width: 800 });
+ *
+ * // Using target height (auto-calculates scale)
+ * const byHeight = await renderPage('document.pdf', 1, { height: 600 });
+ *
+ * // Each call yields its own result; write whichever one you need
+ * fs.writeFileSync('page1.png', byHeight.buffer);
+ * ```
+ */
+declare function renderPage(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<RenderResult>;
+/**
+ * Render a PDF page directly to a data URL
+ *
+ * This is the replacement for the deprecated `renderPageToDataURL`, which
+ * resolves to a bare string instead of a result object.
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param pageNum - Page number (1-based)
+ * @param options - Render options
+ * @returns Promise resolving to the render result with data URL and dimensions
+ *
+ * @example
+ * ```typescript
+ * const result = await renderPageAsDataURL('document.pdf', 1, { width: 800 });
+ * // result.dataURL = "data:image/png;base64,..."
+ * ```
+ */
+declare function renderPageAsDataURL(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<RenderDataURLResult>;
+/**
+ * Render multiple pages to image buffers
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param pageNums - Array of page numbers (1-based); omit (or pass undefined) to render all pages
+ * @param options - Render options
+ * @returns Promise resolving to an array of render results
+ */
+declare function renderPages(input: PDFInput, pageNums?: number[], options?: RenderOptions): Promise<RenderResult[]>;
+/**
+ * Render a page and return it as a base64 string
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param pageNum - Page number (1-based)
+ * @param options - Render options
+ * @returns Promise resolving to the base64-encoded image string
+ */
+declare function renderPageToBase64(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<string>;
+/**
+ * Render a page as a data URL (legacy function, use renderPageAsDataURL instead)
+ *
+ * @param input - PDF document, file path, or buffer
+ * @param pageNum - Page number (1-based)
+ * @param options - Render options
+ * @returns Promise resolving to the data URL string
+ *
+ * @deprecated Use {@link renderPageAsDataURL}, which returns more info
+ * (a result object rather than a bare string).
+ */
+declare function renderPageToDataURL(input: PDFInput, pageNum: number, options?: RenderOptions): Promise<string>;
+/**
+ * PDF Image Extraction Utilities
+ *
+ * Provides access to embedded images in PDF documents.
+ * This is a thin wrapper around the existing ImageExtractor for consistency.
+ */
+/**
+ * Options for image extraction
+ *
+ * Re-exported as `pdfUtils.ImageExtractionOptions`. Every field is optional.
+ */
+interface ImageExtractionOptions {
+    /** Extract image files to disk (default: false) */
+    extractFiles?: boolean;
+    /** Output directory for extracted images */
+    outputDir?: string;
+    /** Convert JPEG2000 to JPG (default: true) */
+    convertJp2ToJpg?: boolean;
+    /** Optimize extracted images (default: false) */
+    optimize?: boolean;
+    /** Optimization quality (0-100, default: 80) */
+    quality?: number;
+    /** Enable verbose logging */
+    verbose?: boolean;
+}
+/**
+ * Result of image extraction
+ *
+ * Re-exported as `pdfUtils.ImageExtractionResult`.
+ */
+interface ImageExtractionResult {
+    /** Array of extracted images */
+    images: ImageItem[];
+    /** Total number of images found */
+    count: number;
+    /** Output directory (if files were extracted) */
+    outputDir?: string;
+}
+/**
+ * Extract images from a PDF document
+ *
+ * Note: in this bundled declaration file the function is named
+ * `extractImages$1` and is re-exported as `pdfUtils.extractImages`. It is a
+ * separate declaration from the package-root `extractImages` export.
+ *
+ * @param source - File path or buffer
+ * @param options - Extraction options
+ * @returns Promise resolving to the extraction result with images
+ *
+ * @example
+ * ```typescript
+ * // Get image metadata only
+ * const result = await extractImages('document.pdf');
+ * console.log(`Found ${result.count} images`);
+ *
+ * // Extract to files
+ * const extracted = await extractImages('document.pdf', {
+ *   extractFiles: true,
+ *   outputDir: './images'
+ * });
+ * ```
+ */
+declare function extractImages$1(source: PDFSource, options?: ImageExtractionOptions): Promise<ImageExtractionResult>;
+/**
+ * Get image count from a PDF without full extraction
+ *
+ * Unlike the image extraction function declared alongside it (which takes a
+ * `PDFSource`), this accepts only a file path string.
+ *
+ * @param source - File path
+ * @returns Promise resolving to the number of images
+ */
+declare function getImageCount(source: string): Promise<number>;
+/**
+ * Internal PDF Utilities Library
+ *
+ * A clean, internal library for PDF operations inspired by unpdf patterns.
+ * Provides unified PDF loading, text extraction with positioning, metadata access,
+ * page rendering, and image extraction.
+ *
+ * Key features:
+ * - Single source of truth for pdf.js configuration
+ * - Lazy loading of pdf.js for better startup performance
+ * - Full text positioning support (our value-add over unpdf)
+ * - Clean, simple API with full TypeScript support
+ *
+ * @example
+ * ```typescript
+ * import { pdfUtils } from 'pdf-plus';
+ *
+ * // Load and work with a PDF
+ * const doc = await pdfUtils.loadPDF('document.pdf');
+ *
+ * // Extract text (simple)
+ * const result = await pdfUtils.extractText(doc);
+ * console.log(result.totalPages, result.text);
+ *
+ * // Extract text with positions (our value-add)
+ * const items = await pdfUtils.extractTextItems(doc);
+ * for (const item of items.items[0]) {
+ *   console.log(`"${item.str}" at (${item.x}, ${item.y})`);
+ * }
+ *
+ * // Render page to image with target width
+ * const render = await pdfUtils.renderPage(doc, 1, { width: 800 });
+ * fs.writeFileSync('page1.png', render.buffer);
+ *
+ * // Get metadata with date parsing
+ * const meta = await pdfUtils.getMetadata(doc, { parseDates: true });
+ * console.log(`${meta.numPages} pages`);
+ *
+ * // Clean up
+ * await doc.destroy();
+ * ```
+ *
+ * @packageDocumentation
+ */
+type index_ImageExtractionOptions = ImageExtractionOptions;
+type index_ImageExtractionResult = ImageExtractionResult;
+type index_ImageFormat = ImageFormat;
+type index_LinkExtractionResult = LinkExtractionResult;
+type index_MetadataOptions = MetadataOptions;
+declare const index_PDFDocumentProxy: typeof PDFDocumentProxy;
+type index_PDFInput = PDFInput;
+type index_PDFLoadOptions = PDFLoadOptions;
+type index_PDFMetadata = PDFMetadata;
+declare const index_PDFPageProxy: typeof PDFPageProxy;
+type index_PDFSource = PDFSource;
+type index_PDFTextItem = PDFTextItem;
+type index_PageInfo = PageInfo;
+type index_RenderDataURLResult = RenderDataURLResult;
+type index_RenderOptions = RenderOptions;
+type index_RenderResult = RenderResult;
+type index_TextExtractionMeta = TextExtractionMeta;
+type index_TextExtractionOptions = TextExtractionOptions;
+type index_TextExtractionProgress = TextExtractionProgress;
+type index_TextExtractionResult<T extends string | string[]> = TextExtractionResult<T>;
+type index_TextItemsExtractionResult = TextItemsExtractionResult;
+declare const index_extractFullText: typeof extractFullText;
+declare const index_extractLinks: typeof extractLinks;
+declare const index_extractPageText: typeof extractPageText;
+declare const index_extractPageTextItems: typeof extractPageTextItems;
+declare const index_extractTextItems: typeof extractTextItems;
+declare const index_getAllPagesInfo: typeof getAllPagesInfo;
+declare const index_getDocumentProxy: typeof getDocumentProxy;
+declare const index_getImageCount: typeof getImageCount;
+declare const index_getMetadata: typeof getMetadata;
+declare const index_getPDFJS: typeof getPDFJS;
+declare const index_getPageCount: typeof getPageCount;
+declare const index_getPageInfo: typeof getPageInfo;
+declare const index_getVerbosityLevel: typeof getVerbosityLevel;
+declare const index_isBrowser: typeof isBrowser;
+declare const index_isNode: typeof isNode;
+declare const index_isPDF: typeof isPDF;
+declare const index_isPDFDocumentProxy: typeof isPDFDocumentProxy;
+declare const index_loadPDF: typeof loadPDF;
+declare const index_renderPage: typeof renderPage;
+declare const index_renderPageAsDataURL: typeof renderPageAsDataURL;
+declare const index_renderPageToBase64: typeof renderPageToBase64;
+declare const index_renderPageToDataURL: typeof renderPageToDataURL;
+declare const index_renderPages: typeof renderPages;
+declare const index_validatePageNumber: typeof validatePageNumber;
+declare namespace index {
+  export { type index_ImageExtractionOptions as ImageExtractionOptions, type index_ImageExtractionResult as ImageExtractionResult, type index_ImageFormat as ImageFormat, type index_LinkExtractionResult as LinkExtractionResult, type index_MetadataOptions as MetadataOptions, index_PDFDocumentProxy as PDFDocumentProxy, type index_PDFInput as PDFInput, type index_PDFLoadOptions as PDFLoadOptions, type index_PDFMetadata as PDFMetadata, index_PDFPageProxy as PDFPageProxy, type index_PDFSource as PDFSource, type index_PDFTextItem as PDFTextItem, type index_PageInfo as PageInfo, type index_RenderDataURLResult as RenderDataURLResult, type index_RenderOptions as RenderOptions, type index_RenderResult as RenderResult, type index_TextExtractionMeta as TextExtractionMeta, type index_TextExtractionOptions as TextExtractionOptions, type index_TextExtractionProgress as TextExtractionProgress, type index_TextExtractionResult as TextExtractionResult, type index_TextItemsExtractionResult as TextItemsExtractionResult, index_extractFullText as extractFullText, extractImages$1 as extractImages, index_extractLinks as extractLinks, index_extractPageText as extractPageText, index_extractPageTextItems as extractPageTextItems, extractText$1 as extractText, index_extractTextItems as extractTextItems, index_getAllPagesInfo as getAllPagesInfo, index_getDocumentProxy as getDocumentProxy, index_getImageCount as getImageCount, index_getMetadata as getMetadata, index_getPDFJS as getPDFJS, index_getPageCount as getPageCount, index_getPageInfo as getPageInfo, index_getVerbosityLevel as getVerbosityLevel, index_isBrowser as isBrowser, index_isNode as isNode, index_isPDF as isPDF, index_isPDFDocumentProxy as isPDFDocumentProxy, index_loadPDF as loadPDF, index_renderPage as renderPage, index_renderPageAsDataURL as renderPageAsDataURL, index_renderPageToBase64 as renderPageToBase64, index_renderPageToDataURL as renderPageToDataURL, index_renderPages as renderPages, index_validatePageNumber as 
validatePageNumber };
+}
 /**
  * Validate extractor configuration
  */
@@ -1746,10 +2552,46 @@ declare function generatePageImages(pdfPath: string, outputDir?: string, options
  * ```
  */
 declare function extractPdfStream(pdfPath: string, options?: Partial<ExtractionOptions>): StreamingExtractionResult;
+/**
+ * Extract tables from a PDF file (convenience function)
+ *
+ * Detects and extracts tables from a PDF document using text positioning data.
+ * Tables are detected through spatial clustering of text items.
+ *
+ * @param pdfPath - Path to the PDF file
+ * @param options - Table extraction options
+ * @returns Promise resolving to table extraction result
+ *
+ * @example
+ * ```typescript
+ * import { extractTables, TableExtractor } from 'pdf-plus';
+ *
+ * // Using convenience function
+ * const result = await extractTables('document.pdf', {
+ *   pages: [1, 2, 3],
+ *   detectHeaders: true,
+ *   minRows: 2,
+ *   minColumns: 2
+ * });
+ *
+ * console.log(`Found ${result.tableCount} tables`);
+ *
+ * // One extractor instance is enough for converting every table
+ * const extractor = new TableExtractor();
+ *
+ * // Access table data
+ * for (const table of result.tables) {
+ *   console.log(`Table on page ${table.page}: ${table.rowCount}x${table.columnCount}`);
+ *
+ *   // Convert to different formats
+ *   console.log(extractor.tableToMarkdown(table));
+ *   console.log(extractor.tableToCSV(table));
+ * }
+ * ```
+ */
+declare function extractTables(pdfPath: string, options?: TableExtractionOptions): Promise<TableExtractionResult>;
 /**
  * Library version
  */
-declare const version = "1.0.3";
+declare const version = "2.0.0";
 /**
  * Default export containing all public APIs
  * Useful for CommonJS: const pdfPlus = require('pdf-plus');
@@ -1760,6 +2602,7 @@ declare const _default: {
     StreamingPDFExtractor: typeof StreamingPDFExtractor;
     TextExtractor: typeof TextExtractor;
     ImageExtractor: typeof ImageExtractor;
+    TableExtractor: typeof TableExtractor;
     ImageOptimizer: typeof ImageOptimizer;
     FormatProcessor: typeof FormatProcessor;
     extractPdfContent: typeof extractPdfContent;
@@ -1768,10 +2611,11 @@ declare const _default: {
     extractImageFiles: typeof extractImageFiles;
     generatePageImages: typeof generatePageImages;
     extractPdfStream: typeof extractPdfStream;
+    extractTables: typeof extractTables;
     validateConfig: typeof validateConfig;
     validateImageRefFormat: typeof validateImageRefFormat;
     validateFilePath: typeof validateFilePath;
     version: string;
 };
-export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, PopplerConverter, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, generatePageImages, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
+export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo$1 as PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type Table, type TableCell, type TableColumn, type TableExtractionOptions, type TableExtractionResult, TableExtractor, type TableRow, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractTables, extractText, generatePageImages, pdfExtractor, index as pdfUtils, validateConfig, validateFilePath, validateImageRefFormat, version };