npm - pdf-plus - Versions diffs - 1.0.1 → 1.0.3 - Mend

pdf-plus 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/README.md +303 -2
package/dist/index.d.mts +974 -21
package/dist/index.d.ts +974 -21
package/dist/index.js +36 -35
package/dist/index.mjs +36 -35
package/dist/workers/image-decoder.worker.d.mts +2 -0
package/dist/workers/image-decoder.worker.d.ts +2 -0
package/dist/workers/image-decoder.worker.js +2 -0
package/dist/workers/image-decoder.worker.mjs +2 -0
package/dist/workers/jp2-converter.worker.d.mts +2 -0
package/dist/workers/jp2-converter.worker.d.ts +2 -0
package/dist/workers/jp2-converter.worker.js +2 -0
package/dist/workers/jp2-converter.worker.mjs +2 -0
package/package.json +22 -7

package/dist/index.d.mts CHANGED

@@ -1,3 +1,202 @@
+/**
+ * Types for streaming PDF extraction
+ */
+/**
+ * Event types emitted during streaming extraction
+ */
+type StreamEventType = "start" | "page" | "image" | "progress" | "complete" | "error";
+/**
+ * Base event structure
+ */
+interface StreamEvent {
+    type: StreamEventType;
+    timestamp: number;
+}
+/**
+ * Start event - emitted when extraction begins
+ */
+interface StartEvent extends StreamEvent {
+    type: "start";
+    totalPages: number;
+    pdfPath: string;
+}
+/**
+ * Page event - emitted when a page is processed
+ */
+interface PageEvent extends StreamEvent {
+    type: "page";
+    pageNumber: number;
+    totalPages: number;
+    textLength: number;
+    imageCount: number;
+    pageInfo?: PageInfo;
+}
+/**
+ * Image event - emitted when an image is extracted
+ */
+interface ImageEvent extends StreamEvent {
+    type: "image";
+    image: ImageItem;
+    pageNumber: number;
+    imageIndex: number;
+    totalImages: number;
+}
+/**
+ * Progress event - emitted periodically during extraction
+ */
+interface ProgressEvent extends StreamEvent {
+    type: "progress";
+    pagesProcessed: number;
+    totalPages: number;
+    imagesExtracted: number;
+    percentComplete: number;
+    estimatedTimeRemaining?: number;
+}
+/**
+ * Complete event - emitted when extraction finishes
+ */
+interface CompleteEvent extends StreamEvent {
+    type: "complete";
+    totalPages: number;
+    totalImages: number;
+    totalTextLength: number;
+    duration: number;
+}
+/**
+ * Error event - emitted when an error occurs
+ */
+interface ErrorEvent extends StreamEvent {
+    type: "error";
+    error: Error;
+    pageNumber?: number;
+    recoverable: boolean;
+}
+/**
+ * Union type of all stream events
+ */
+type StreamEventUnion = StartEvent | PageEvent | ImageEvent | ProgressEvent | CompleteEvent | ErrorEvent;
+/**
+ * Streaming extraction options
+ */
+interface StreamingOptions$1 {
+    /**
+     * Enable streaming mode
+     * @default false
+     */
+    streamMode?: boolean;
+    /**
+     * Automatically enable streaming for PDFs with more than this many pages
+     * @default 100
+     */
+    autoStreamThreshold?: number;
+    /**
+     * Enable backpressure handling (pause extraction if consumer is slow)
+     * @default true
+     */
+    enableBackpressure?: boolean;
+    /**
+     * Maximum number of pages to buffer before pausing (backpressure)
+     * @default 10
+     */
+    maxBufferedPages?: number;
+    /**
+     * Emit progress events every N pages
+     * @default 5
+     */
+    progressInterval?: number;
+    /**
+     * Enable event callbacks (in addition to async iterator)
+     * @default false
+     */
+    enableEventCallbacks?: boolean;
+}
+/**
+ * Event callback function type
+ */
+type StreamEventCallback = (event: StreamEventUnion) => void | Promise<void>;
+/**
+ * Event callbacks map
+ */
+interface StreamEventCallbacks {
+    onStart?: (event: StartEvent) => void | Promise<void>;
+    onPage?: (event: PageEvent) => void | Promise<void>;
+    onImage?: (event: ImageEvent) => void | Promise<void>;
+    onProgress?: (event: ProgressEvent) => void | Promise<void>;
+    onComplete?: (event: CompleteEvent) => void | Promise<void>;
+    onError?: (event: ErrorEvent) => void | Promise<void>;
+    onAny?: StreamEventCallback;
+}
+/**
+ * Streaming extraction result (async iterator)
+ */
+interface StreamingExtractionResult {
+    /**
+     * Async iterator for streaming events
+     */
+    [Symbol.asyncIterator](): AsyncIterator<StreamEventUnion>;
+    /**
+     * Register event callbacks
+     */
+    on(event: "start", callback: (event: StartEvent) => void | Promise<void>): this;
+    on(event: "page", callback: (event: PageEvent) => void | Promise<void>): this;
+    on(event: "image", callback: (event: ImageEvent) => void | Promise<void>): this;
+    on(event: "progress", callback: (event: ProgressEvent) => void | Promise<void>): this;
+    on(event: "complete", callback: (event: CompleteEvent) => void | Promise<void>): this;
+    on(event: "error", callback: (event: ErrorEvent) => void | Promise<void>): this;
+    on(event: "any", callback: StreamEventCallback): this;
+    /**
+     * Cancel the streaming extraction
+     */
+    cancel(): Promise<void>;
+    /**
+     * Pause the streaming extraction (backpressure)
+     */
+    pause(): void;
+    /**
+     * Resume the streaming extraction
+     */
+    resume(): void;
+    /**
+     * Get current streaming statistics
+     */
+    getStats(): StreamingStats;
+}
+/**
+ * Streaming statistics
+ */
+interface StreamingStats {
+    pagesProcessed: number;
+    totalPages: number;
+    imagesExtracted: number;
+    bytesProcessed: number;
+    startTime: number;
+    elapsedTime: number;
+    isPaused: boolean;
+    isCancelled: boolean;
+    isComplete: boolean;
+    averagePageTime: number;
+    estimatedTimeRemaining: number;
+}
+/**
+ * Internal streaming state
+ */
+interface StreamingState {
+    totalPages: number;
+    pagesProcessed: number;
+    imagesExtracted: number;
+    totalTextLength: number;
+    bytesProcessed: number;
+    startTime: number;
+    lastProgressTime: number;
+    isPaused: boolean;
+    isCancelled: boolean;
+    isComplete: boolean;
+    bufferedPages: number;
+    eventQueue: StreamEventUnion[];
+    callbacks: StreamEventCallbacks;
+}
 /**
  * Core types for PDF content extraction
  */
@@ -63,6 +262,7 @@ interface ExtractionResult {
     pages: PageInfo[];
     images: ImageItem[];
     textItems: TextItem[];
+    text: string;
     textWithRefs: string;
     cleanText: string;
     summary?: DocumentSummary;
@@ -85,9 +285,9 @@ interface StructuredPageData {
         totalImages: number;
         extractionOptions: ExtractionOptions;
     };
-    pages: PageData[];
+    pages: PageData$1[];
 }
-interface PageData {
+interface PageData$1 {
     pageNumber: number;
     text: {
         content: string;
@@ -97,6 +297,32 @@ interface PageData {
     };
     images: PageImageData[];
     imageCount: number;
+    pageImage?: {
+        path: string;
+        format: string;
+        width: number;
+        height: number;
+        size: number;
+        dpi?: number;
+        quality?: number;
+    };
+    thumbnail?: {
+        path: string;
+        format: string;
+        width: number;
+        height: number;
+        size: number;
+        quality?: number;
+    };
+    pageImageVariants?: Array<{
+        path: string;
+        format: string;
+        width: number;
+        height: number;
+        size: number;
+        quality: number;
+        dpi?: number;
+    }>;
 }
 interface PageImageData {
     id: string;
@@ -111,8 +337,10 @@ interface PageImageData {
     };
     format: string;
     size?: number;
+    width?: number;
+    height?: number;
+    mimeType?: string;
 }
-type ImageExtractionEngine = "pdf-lib" | "poppler" | "auto";
 interface ExtractionOptions {
     extractText?: boolean;
     extractImages?: boolean;
@@ -131,14 +359,116 @@ interface ExtractionOptions {
     extractTextItems?: boolean;
     specificPages?: number[];
     useCache?: boolean;
-    /** Image extraction engine to use */
-    imageEngine?: ImageExtractionEngine;
+    /** Enable image optimization after extraction (uses Jimp - pure JavaScript, default: false) */
+    optimizeImages?: boolean;
+    /** Image quality for optimization (0-100, default: 80) */
+    imageQuality?: number;
+    /**
+     * Convert JPEG 2000 images to JPG format for better compatibility.
+     * (default: true - convert JP2 to JPG)
+     */
+    convertJp2ToJpg?: boolean;
+    /**
+     * Preserve JPEG 2000 images in their original format.
+     * By default (false), JPEG 2000 images (jp2, jpx, j2c, jpm) are converted to JPG for better compatibility.
+     * Set to true to keep JPEG 2000 files in their original format.
+     *
+     * Note: JP2 images from PDFs are automatically decoded by PDF.js during extraction.
+     * This option only affects standalone JP2 files.
+     * (default: false - convert to JPG)
+     */
+    preserveJp2?: boolean;
+    /**
+     * Use Sharp library for ALL image processing operations (better quality & performance).
+     *
+     * When enabled, Sharp is used as the global image processing engine for:
+     * - JP2 to JPG conversion
+     * - Image optimization
+     * - Image resizing
+     * - Format conversions
+     *
+     * Sharp is an OPTIONAL dependency. Install it for better performance:
+     * ```bash
+     * npm install sharp
+     * ```
+     *
+     * If Sharp is not installed, the library will automatically fall back to pure JavaScript (Jimp).
+     *
+     * (default: false - use pure JS Jimp)
+     */
+    useSharp?: boolean;
+    /** Enable parallel processing for better performance (default: true) */
+    parallelProcessing?: boolean;
+    /** Maximum number of pages to process in parallel (default: 10) */
+    maxConcurrentPages?: number;
+    /** Maximum number of images per page to extract in parallel (default: 20) */
+    maxConcurrentImages?: number;
+    /** Maximum number of JP2 to JPG conversions in parallel (default: 5) */
+    maxConcurrentConversions?: number;
+    /** Maximum number of image optimizations in parallel (default: 5) */
+    maxConcurrentOptimizations?: number;
+    /** Enable worker threads for CPU-intensive operations (default: false) */
+    useWorkerThreads?: boolean;
+    /** Auto-scale workers based on system resources (default: true) */
+    autoScaleWorkers?: boolean;
+    /** Maximum number of worker threads (default: CPU cores - 1) */
+    maxWorkerThreads?: number;
+    /** Minimum number of worker threads to keep alive (default: 1) */
+    minWorkerThreads?: number;
+    /** Memory threshold, as a fraction from 0 to 1, for scaling down workers (default: 0.8) */
+    memoryThreshold?: number;
+    /** CPU threshold, as a fraction from 0 to 1, for scaling up workers (default: 0.9) */
+    cpuThreshold?: number;
+    /** Worker task timeout in milliseconds (default: 30000) */
+    workerTaskTimeout?: number;
+    /** Worker idle timeout in milliseconds (default: 60000) */
+    workerIdleTimeout?: number;
+    /** Memory limit per worker in MB (default: 512) */
+    workerMemoryLimit?: number;
+    /** Use workers for JP2 conversion (default: true) */
+    enableWorkerForConversion?: boolean;
+    /** Use workers for image optimization (default: true) */
+    enableWorkerForOptimization?: boolean;
+    /** Use workers for image decoding (default: true) */
+    enableWorkerForDecoding?: boolean;
+    /** Enable streaming mode for large PDFs (default: false) */
+    streamMode?: boolean;
+    /** Automatically enable streaming for PDFs with more than this many pages (default: 100) */
+    autoStreamThreshold?: number;
+    /** Enable backpressure handling (pause extraction if consumer is slow) (default: true) */
+    enableBackpressure?: boolean;
+    /** Maximum number of pages to buffer before pausing (default: 10) */
+    maxBufferedPages?: number;
+    /** Emit progress events every N pages (default: 5) */
+    progressInterval?: number;
+    /** Enable event callbacks in addition to async iterator (default: false) */
+    enableEventCallbacks?: boolean;
     cacheDir?: string;
     baseName?: string;
     verbose?: boolean;
     memoryLimit?: string;
     batchSize?: number;
     progressCallback?: (progress: ProgressInfo) => void;
+    /** Generate page images (default: false) */
+    generatePageImages?: boolean;
+    /** Generate thumbnails for pages (default: false) */
+    generateThumbnails?: boolean;
+    /** Include page images in structured output (default: false) */
+    includePageImagesInStructuredData?: boolean;
+    /** Page numbers to generate images for (default: all pages) */
+    pageNumbers?: number[];
+    /** Generate multiple quality variants of page images */
+    pageImageQualities?: number[];
+    /** DPI for page images (default: 150) */
+    pageImageDpi?: number;
+    /** Format for page images: 'png' | 'jpg' (default: 'png') */
+    pageImageFormat?: "png" | "jpg";
+    /** Quality for JPG page images (default: 90) */
+    pageImageQuality?: number;
+    /** Thumbnail width (default: 200) */
+    thumbnailWidth?: number;
+    /** Thumbnail quality for JPG (default: 80) */
+    thumbnailQuality?: number;
 }
 interface ProgressInfo {
     currentPage: number;
@@ -242,6 +572,7 @@ interface TemplateOptions {
 declare class PDFExtractor {
     private textExtractor;
     private imageExtractor;
+    private pageToImageConverter;
     private formatProcessor;
     private structuredDataGenerator;
     private cacheManager;
@@ -355,6 +686,14 @@ declare class PDFExtractor {
         totalCacheSize: number;
         cacheDir: string;
     };
+    /**
+     * Generate page images with multiple quality variants
+     */
+    private generatePageImagesWithVariants;
+    /**
+     * Generate thumbnails for pages
+     */
+    private generatePageThumbnails;
     private reportProgress;
     private createValidationError;
     private createExtractionError;
@@ -362,10 +701,70 @@ declare class PDFExtractor {
 declare const pdfExtractor: PDFExtractor;
 /**
- * Text extraction from PDF files
+ * Streaming PDF extractor for large PDFs
+ * Provides async iterator and event-based APIs
+ */
+/**
+ * Streaming PDF extractor implementation
+ */
+declare class StreamingPDFExtractor implements StreamingExtractionResult {
+    private state;
+    private options;
+    private pdfPath;
+    private extractor;
+    private eventQueue;
+    private resolveNext;
+    private extractionPromise;
+    constructor(pdfPath: string, options?: ExtractionOptions & StreamingOptions$1);
+    /**
+     * Async iterator implementation
+     */
+    [Symbol.asyncIterator](): AsyncIterator<StreamEventUnion>;
+    /**
+     * Register event callbacks
+     */
+    on(event: "start", callback: (event: StartEvent) => void): this;
+    on(event: "page", callback: (event: PageEvent) => void): this;
+    on(event: "image", callback: (event: ImageEvent) => void): this;
+    on(event: "progress", callback: (event: ProgressEvent) => void): this;
+    on(event: "complete", callback: (event: CompleteEvent) => void): this;
+    on(event: "error", callback: (event: ErrorEvent) => void): this;
+    on(event: "any", callback: (event: StreamEventUnion) => void): this;
+    /**
+     * Cancel extraction
+     */
+    cancel(): Promise<void>;
+    /**
+     * Pause extraction (backpressure)
+     */
+    pause(): void;
+    /**
+     * Resume extraction
+     */
+    resume(): void;
+    /**
+     * Get streaming statistics
+     */
+    getStats(): StreamingStats;
+    /**
+     * Emit an event
+     */
+    private emitEvent;
+    /**
+     * Start the extraction process
+     */
+    private startExtraction;
+}
+/**
+ * Text extraction from PDF files using pdf.js
  *
- * Handles text extraction using pdf-parse library with support for
- * page-by-page extraction and metadata retrieval.
+ * Direct pdf.js-based text extraction with support for:
+ * - Page-by-page extraction with accurate boundaries
+ * - Text positioning and font information
+ * - Metadata retrieval
+ * - No external dependencies (uses pdf.js directly)
  *
  * @example
  * ```typescript
@@ -375,6 +774,19 @@ declare const pdfExtractor: PDFExtractor;
  * ```
  */
 declare class TextExtractor {
+    constructor();
+    /**
+     * Initialize pdf.js worker
+     */
+    private initializePdfjs;
+    /**
+     * Load PDF document
+     */
+    private loadDocument;
+    /**
+     * Extract text from a single page
+     */
+    private getPageText;
     /**
      * Extract text content from PDF
      *
@@ -383,6 +795,17 @@ declare class TextExtractor {
      * @throws {Error} When PDF extraction fails
      */
     extract(pdfPath: string): Promise<any>;
+    /**
+     * Extract text with metadata
+     *
+     * @param pdfPath - Path to the PDF file
+     * @returns Promise resolving to extraction result with text and metadata
+     * @throws {Error} When PDF extraction fails
+     */
+    extractWithMetadata(pdfPath: string): Promise<{
+        text: string;
+        metadata: any;
+    }>;
     /**
      * Extract text with page information
      *
@@ -392,11 +815,7 @@ declare class TextExtractor {
      */
     extractWithPages(pdfPath: string): Promise<any>;
     /**
-     * Split text into approximate pages
-     */
-    private splitTextIntoPages;
-    /**
-     * Extract text items with position and metadata
+     * Extract text items with position and metadata using pdf.js
      */
     extractTextItems(pdfPath: string, options?: ExtractionOptions): Promise<TextItem[]>;
     /**
@@ -438,21 +857,112 @@ declare class TextExtractor {
         pageOffset?: number;
         includeImageRefs?: boolean;
         imageRefFormat?: string;
-        imageEngine?: ImageExtractionEngine;
     }): Promise<{
         text: string;
-        pages: PageData[];
+        pages: PageData$1[];
     }>;
     /**
      * Extract text with accurate page boundaries using pdf-lib + pdf-parse
      */
     extractWithAccuratePages(pdfPath: string): Promise<{
         fullText: string;
-        pages: PageData[];
+        pages: PageData$1[];
         totalPages: number;
     }>;
 }
+/**
+ * Structured text extractor using both pdf-lib and pdf.js for accurate page-by-page extraction
+ *
+ * Extracts text with rich metadata including page dimensions, rotation, word counts, and character counts.
+ * Uses pdf-lib for accurate page structure and pdf.js for text content.
+ */
+interface PageData {
+    pageNumber: number;
+    text: string;
+    width: number;
+    height: number;
+    rotation: number;
+    mediaBox: number[];
+    textItems?: any[];
+    wordCount: number;
+    characterCount: number;
+}
+declare class StructuredTextExtractor {
+    private pdfLibDoc;
+    private pdfLibPages;
+    private textData;
+    constructor();
+    /**
+     * Initialize pdf.js worker
+     */
+    private initializePdfjs;
+    /**
+     * Process PDF with accurate page-by-page extraction
+     */
+    processPDF(pdfPath: string): Promise<{
+        totalPages: number;
+        pages: PageData[];
+        fullText: string;
+    }>;
+    /**
+     * Process with pdf-lib to get accurate page structure
+     */
+    private processPDFLib;
+    /**
+     * Process with pdf.js to extract text page by page
+     */
+    private processPDFjs;
+    /**
+     * Combine results from both libraries
+     */
+    private combineResults;
+    /**
+     * Extract text with page markers using accurate page boundaries
+     */
+    extractWithPageMarkers(pdfPath: string, pageMarkerFormat?: string, options?: {
+        includeImageRefs?: boolean;
+        imageRefFormat?: string;
+    }): Promise<{
+        text: string;
+        cleanText: string;
+        numPages: number;
+        pages: PageData[];
+    }>;
+    /**
+     * Get specific page data
+     */
+    getPage(pageNumber: number): PageData | null;
+    /**
+     * Get detailed page information including text positioning
+     */
+    getDetailedPageInfo(pdfPath: string, pageNumber: number): Promise<{
+        pageNumber: number;
+        text: string;
+        textItems: Array<{
+            text: string;
+            x: number;
+            y: number;
+            width: number;
+            height: number;
+            fontName?: string;
+            fontSize?: number;
+        }>;
+        dimensions: {
+            width: number;
+            height: number;
+        };
+    } | null>;
+    /**
+     * Count words in text
+     */
+    private countWords;
+    /**
+     * Process single page (for streaming/batch processing)
+     */
+    processSinglePage(pdfPath: string, pageNumber: number): Promise<PageData | null>;
+}
 /**
  * Image extraction from PDF files using pdf-lib (clean implementation based on NestJS)
  *
@@ -529,6 +1039,381 @@ declare class ImageExtractor {
     private createPngFromPdfMetadata;
 }
+/**
+ * Types for PDF page to image conversion
+ */
+/**
+ * Image format for page conversion
+ */
+type PageImageFormat = "png" | "jpg" | "jpeg" | "webp";
+/**
+ * Options for converting PDF pages to images
+ */
+interface PageToImageOptions {
+    /**
+     * Output directory for image files
+     * @default './page-images'
+     */
+    outputDir?: string;
+    /**
+     * Image format
+     * @default 'png'
+     */
+    format?: PageImageFormat;
+    /**
+     * JPEG quality (1-100, only for JPG format)
+     * @default 90
+     */
+    quality?: number;
+    /**
+     * DPI (dots per inch) for rendering
+     * Higher DPI = better quality but larger files
+     * @default 72
+     */
+    dpi?: number;
+    /**
+     * Scale factor (multiplier for dimensions)
+     * @default 1
+     */
+    scale?: number;
+    /**
+     * Specific pages to convert (1-based)
+     * If not provided, converts all pages
+     * @example [1, 3, 5]
+     */
+    pages?: number[];
+    /**
+     * Page range to convert (e.g., "1-5", "1,3,5-10")
+     * If not provided, converts all pages
+     * @example "1-5"
+     */
+    pageRange?: string;
+    /**
+     * Filename pattern for output files
+     * Available placeholders: {page}, {total}, {name}
+     * @default 'page-{page}.{ext}'
+     */
+    filenamePattern?: string;
+    /**
+     * Background color for transparent PDFs
+     * @default '#FFFFFF'
+     */
+    backgroundColor?: string;
+    /**
+     * Enable transparent background (PNG only)
+     * @default false
+     */
+    transparent?: boolean;
+    /**
+     * Crop to content (remove white margins)
+     * @default false
+     */
+    cropToContent?: boolean;
+    /**
+     * Progress callback
+     */
+    onProgress?: (current: number, total: number, percentage: number) => void;
+    /**
+     * Callback when a page is converted
+     */
+    onPageComplete?: (pageNumber: number, filepath: string) => void;
+    /**
+     * Verbose logging
+     * @default false
+     */
+    verbose?: boolean;
+}
+/**
+ * Result of page to image conversion
+ */
+interface PageImageResult {
+    /**
+     * Page number (1-based)
+     */
+    page: number;
+    /**
+     * Output file path
+     */
+    filepath: string;
+    /**
+     * Image width in pixels
+     */
+    width: number;
+    /**
+     * Image height in pixels
+     */
+    height: number;
+    /**
+     * File size in bytes
+     */
+    fileSize: number;
+    /**
+     * Image format
+     */
+    format: PageImageFormat;
+}
+/**
+ * Result of converting all pages
+ */
+interface PageToImageResult {
+    /**
+     * Array of converted page images
+     */
+    images: PageImageResult[];
+    /**
+     * Total number of pages converted
+     */
+    totalPages: number;
+    /**
+     * Output directory
+     */
+    outputDir: string;
+    /**
+     * Total size of all images in bytes
+     */
+    totalSize: number;
+}
+/**
+ * Options for converting a single page
+ */
+interface SinglePageOptions {
+    /**
+     * Image format
+     * @default 'png'
+     */
+    format?: PageImageFormat;
+    /**
+     * JPEG quality (1-100)
+     * @default 90
+     */
+    quality?: number;
+    /**
+     * DPI for rendering
+     * @default 72
+     */
+    dpi?: number;
+    /**
+     * Scale factor
+     * @default 1
+     */
+    scale?: number;
+    /**
+     * Background color
+     * @default '#FFFFFF'
+     */
+    backgroundColor?: string;
+    /**
+     * Transparent background (PNG only)
+     * @default false
+     */
+    transparent?: boolean;
+}
+/**
+ * Thumbnail generation options
+ */
+interface ThumbnailOptions extends SinglePageOptions {
+    /**
+     * Maximum width in pixels
+     * @default 200
+     */
+    maxWidth?: number;
+    /**
+     * Maximum height in pixels
+     * @default 200
+     */
+    maxHeight?: number;
+    /**
+     * Maintain aspect ratio
+     * @default true
+     */
+    maintainAspectRatio?: boolean;
+}
+/**
+ * PDF Page to Image Converter using pdf.js
+ *
+ * Converts PDF pages to image files (PNG, JPG, WebP) with customizable options.
+ * Uses Mozilla's pdf.js for high-quality rendering without external dependencies.
+ */
+/**
+ * Page to Image Converter
+ *
+ * @example
+ * ```typescript
+ * const converter = new PageToImageConverter();
+ * const result = await converter.convertToImages('document.pdf', {
+ *   outputDir: './pages',
+ *   format: 'png',
+ *   dpi: 150
+ * });
+ * ```
+ */
+declare class PageToImageConverter {
+    private pdfjs;
+    /**
+     * Get or load pdf.js module with proper worker configuration
+     * Based on pdf-to-img library approach
+     */
+    private getPdfjs;
+    /**
+     * Convert all pages of a PDF to images
+     *
+     * @param pdfPath - Path to PDF file
+     * @param options - Conversion options
+     * @returns Conversion result with image paths
+     */
+    convertToImages(pdfPath: string, options?: PageToImageOptions): Promise<PageToImageResult>;
+    /**
+     * Convert a single page to an image file
+     *
+     * @param pdfPath - Path to PDF file
+     * @param pageNumber - Page number (1-based)
+     * @param outputPath - Output file path
+     * @param options - Conversion options
+     */
+    convertPage(pdfPath: string, pageNumber: number, outputPath: string, options?: SinglePageOptions): Promise<PageImageResult>;
+    /**
+     * Convert a page to a buffer (no file write)
+     *
+     * @param pdfPath - Path to PDF file
+     * @param pageNumber - Page number (1-based)
+     * @param options - Conversion options
+     * @returns Image buffer
+     */
+    convertPageToBuffer(pdfPath: string, pageNumber: number, options?: SinglePageOptions): Promise<Buffer>;
+    /**
+     * Convert a page to base64 string
+     *
+     * @param pdfPath - Path to PDF file
+     * @param pageNumber - Page number (1-based)
+     * @param options - Conversion options
+     * @returns Base64 encoded image
+     */
+    convertPageToBase64(pdfPath: string, pageNumber: number, options?: SinglePageOptions): Promise<string>;
+    /**
+     * Generate thumbnails for all pages
+     *
+     * @param pdfPath - Path to PDF file
+     * @param options - Thumbnail options
+     * @returns Conversion result
+     */
+    generateThumbnails(pdfPath: string, options?: ThumbnailOptions & {
+        outputDir?: string;
+    }): Promise<PageToImageResult>;
+    /**
+     * Render a PDF page to image buffer
+     *
+     * Based on pdf-to-img library approach - let pdf.js handle canvas creation
+     * @see https://github.com/k-yle/pdf-to-img
+     */
+    private renderPageToBuffer;
+    /**
+     * Convert canvas to image buffer
+     */
+    private canvasToBuffer;
+    /**
+     * Get page numbers to convert based on options
+     */
+    private getPageNumbers;
+    /**
+     * Parse page range string (e.g., "1-5", "1,3,5-10")
+     */
+    private parsePageRange;
+    /**
+     * Generate filename from pattern
+     */
+    private generateFilename;
+    /**
+     * Format bytes to human-readable string
+     */
+    private formatBytes;
+}
+/**
+ * Result of image optimization
+ */
+interface OptimizationResult {
+    success: boolean;
+    originalSize: number;
+    optimizedSize: number;
+    savedBytes: number;
+    savedPercent: number;
+    engine: "jimp" | "sharp" | "none";
+    error?: string;
+}
+/**
+ * Options for image optimization
+ */
+interface OptimizationOptions {
+    quality?: number;
+    verbose?: boolean;
+    useSharp?: boolean;
+}
+/**
+ * Image optimizer using Jimp (pure JavaScript), with optional Sharp support
+ *
+ * This class provides image optimization capabilities using Jimp, a pure JavaScript
+ * image processing library with no native dependencies, or the optional Sharp
+ * dependency (see `useSharp`). It supports JPEG and PNG optimization with quality control.
+ *
+ * @example
+ * ```typescript
+ * const result = await ImageOptimizer.optimizeFile('image.jpg', {
+ *   useSharp: true,
+ *   quality: 80
+ * });
+ *
+ * console.log(`Saved ${result.savedPercent.toFixed(1)}% using ${result.engine}`);
+ * ```
+ */
+declare class ImageOptimizer {
+    /**
+     * Optimize an image file in-place
+     *
+     * The original file will be replaced with the optimized version.
+     * If optimization fails, the original file remains unchanged.
+     *
+     * @param filePath - Path to the image file to optimize
+     * @param options - Optimization options
+     * @returns Promise resolving to optimization result
+     */
+    static optimizeFile(filePath: string, options?: OptimizationOptions): Promise<OptimizationResult>;
+    /**
+     * Optimize using Sharp (optional dependency)
+     */
+    private static optimizeWithSharp;
+    /**
+     * Optimize using Jimp (pure JavaScript)
+     */
+    private static optimizeWithJimp;
+    /**
+     * Convert JPEG 2000 formats (jp2, jpx, j2c, jpm) to JPG
+     *
+     * JPEG 2000 files are not widely supported by browsers and image tools.
+     * This method converts them to standard JPG format for better compatibility.
+     *
+     * Supports two conversion engines:
+     * - Jimp (default): Pure JavaScript, works everywhere
+     * - Sharp (optional): Better color preservation, requires native compilation
+     *
+     * @param jp2Path - Path to the JPEG 2000 file (jp2, jpx, j2c, or jpm)
+     * @param options - Conversion options
+     * @returns Promise resolving to conversion result with new file path
+     */
+    static convertJp2ToJpg(jp2Path: string, options?: {
+        quality?: number;
+        verbose?: boolean;
+        useSharp?: boolean;
+    }): Promise<{
+        success: boolean;
+        newPath?: string;
+        originalSize?: number;
+        newSize?: number;
+        error?: string;
+    }>;
+}
 /**
  * Handles formatting of image references and text processing
  */
@@ -596,12 +1481,23 @@ declare function validateImageRefFormat(format: string): ValidationError[];
  */
 declare function validateFilePath(filePath: string, extensions?: string[]): ValidationError[];
+/**
+ * pdf-plus - A comprehensive PDF content extraction library
+ *
+ * Main entry point for the PDF content extraction library.
+ * Provides both high-level convenience functions and low-level access to extractors.
+ *
+ * @packageDocumentation
+ */
 /**
  * Extract content from a PDF file (convenience function)
  *
+ * Automatically switches to streaming mode for large PDFs if `autoStreamThreshold` is set.
+ *
  * @param pdfPath - Path to the PDF file
  * @param options - Extraction options
- * @returns Promise resolving to extraction result
+ * @returns Promise resolving to an `ExtractionResult`, or to a `StreamingExtractionResult` when streaming mode is used
  *
  * @example
  * ```typescript
@@ -615,8 +1511,17 @@ declare function validateFilePath(filePath: string, extensions?: string[]): Vali
  *
  * console.log(`Extracted ${result.images.length} images from ${result.document.pages} pages`);
  * ```
+ *
+ * @example
+ * ```typescript
+ * // Auto-streaming for large PDFs
+ * const result = await extractPdfContent('large-document.pdf', {
+ *   extractImageFiles: true,
+ *   autoStreamThreshold: 100, // Auto-stream if > 100 pages
+ * });
+ * ```
  */
-declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult>;
+declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult | StreamingExtractionResult>;
 /**
  * Extract only text content from a PDF (convenience function)
  *
@@ -673,22 +1578,70 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
  * ```
  */
 declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
-declare const version = "1.0.0";
+/**
+ * Extract PDF content in streaming mode
+ *
+ * For large PDFs, this provides a streaming API that processes pages one at a time,
+ * reducing memory usage and providing real-time progress updates.
+ *
+ * @param pdfPath - Path to the PDF file
+ * @param options - Extraction and streaming options
+ * @returns StreamingExtractionResult with async iterator and event callbacks
+ *
+ * @example
+ * ```typescript
+ * // Using async iterator
+ * const stream = extractPdfStream('large-document.pdf', {
+ *   extractImageFiles: true,
+ *   imageOutputDir: './images',
+ *   streamMode: true
+ * });
+ *
+ * for await (const event of stream) {
+ *   if (event.type === 'page') {
+ *     console.log(`Processed page ${event.pageNumber}/${event.totalPages}`);
+ *   } else if (event.type === 'progress') {
+ *     console.log(`Progress: ${event.percentComplete.toFixed(1)}%`);
+ *   }
+ * }
+ *
+ * // Using event callbacks
+ * const callbackStream = extractPdfStream('large-document.pdf', { streamMode: true })
+ *   .on('page', (event) => console.log(`Page ${event.pageNumber} done`))
+ *   .on('progress', (event) => console.log(`${event.percentComplete}% complete`))
+ *   .on('complete', (event) => console.log(`Done! ${event.totalImages} images`));
+ *
+ * for await (const event of callbackStream) {
+ *   // Events are also available via iterator
+ * }
+ * ```
+ */
+declare function extractPdfStream(pdfPath: string, options?: Partial<ExtractionOptions>): StreamingExtractionResult;
+/**
+ * Library version
+ */
+declare const version = "1.0.3";
+/**
+ * Default export containing all public APIs
+ * Useful for CommonJS: const pdfPlus = require('pdf-plus');
+ */
 declare const _default: {
     PDFExtractor: typeof PDFExtractor;
     pdfExtractor: PDFExtractor;
+    StreamingPDFExtractor: typeof StreamingPDFExtractor;
     TextExtractor: typeof TextExtractor;
     ImageExtractor: typeof ImageExtractor;
+    ImageOptimizer: typeof ImageOptimizer;
     FormatProcessor: typeof FormatProcessor;
     extractPdfContent: typeof extractPdfContent;
     extractText: typeof extractText;
     extractImages: typeof extractImages;
     extractImageFiles: typeof extractImageFiles;
+    extractPdfStream: typeof extractPdfStream;
     validateConfig: typeof validateConfig;
     validateImageRefFormat: typeof validateImageRefFormat;
     validateFilePath: typeof validateFilePath;
     version: string;
 };
-export { type AnalyticsData, type DocumentMetadata, type DocumentSummary, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageExtractionEngine, ImageExtractor, type ImageItem, type MemoryUsage, type OCROptions, PDFExtractor, type PageInfo, type Position, type ProcessingPhase, type ProgressInfo, type StreamingOptions, type TemplateOptions, TextExtractor, type TextItem, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };
+export { type AnalyticsData, type CompleteEvent, type DocumentMetadata, type DocumentSummary, type ErrorEvent, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageEvent, ImageExtractor, type ImageItem, ImageOptimizer, type MemoryUsage, type OCROptions, type OptimizationOptions, type OptimizationResult, PDFExtractor, type PageEvent, type PageImageFormat, type PageImageResult, type PageInfo, PageToImageConverter, type PageToImageOptions, type PageToImageResult, type Position, type ProcessingPhase, type ProgressEvent, type ProgressInfo, type SinglePageOptions, type StartEvent, type StreamEvent, type StreamEventCallbacks, type StreamEventType, type StreamingExtractionResult, type StreamingOptions, StreamingPDFExtractor, type StreamingState, type StreamingStats, StructuredTextExtractor, type TemplateOptions, TextExtractor, type TextItem, type ThumbnailOptions, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractPdfStream, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };