npm - pdf-plus - Versions diffs - 1.0.0 - Mend

pdf-plus 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.d.mts ADDED Viewed

@@ -0,0 +1,694 @@
+/**
+ * Core types for PDF content extraction
+ */
+interface Position { // Rectangle given as x/y plus width/height; units and origin are not stated in this file.
+    x: number;
+    y: number;
+    width: number;
+    height: number;
+}
+interface FontInfo { // Font attributes attached to a TextItem through its "font" field.
+    name: string;
+    size: number;
+    weight?: string;
+    style?: string;
+    color?: string; // Color format is not specified in this declaration.
+}
+interface TextItem { // One extracted text fragment; element type of PageInfo.textItems and ExtractionResult.textItems.
+    id: string;
+    content: string;
+    position: Position;
+    font: FontInfo;
+    page: number;
+    transform?: number[];
+    type: "text" | "heading" | "paragraph" | "caption";
+    fontSize?: number; // NOTE(review): overlaps with font.size — confirm which one consumers should read.
+    color?: string; // NOTE(review): overlaps with font.color — confirm which one takes precedence.
+}
+interface ImageItem { // One image found in a PDF; element type of PageInfo.images and ExtractionResult.images.
+    id: string;
+    name?: string;
+    filename?: string;
+    filepath?: string; // NOTE(review): differs from "filePath" below only by letter case — confirm which one is populated.
+    position: Position;
+    page: number;
+    transform?: number[];
+    width: number; // Declared in addition to position.width; relationship between the two is not visible here.
+    height: number; // Declared in addition to position.height; relationship between the two is not visible here.
+    format?: string;
+    mimeType?: string;
+    size?: number; // presumably a byte count — TODO confirm
+    filePath?: string; // NOTE(review): see "filepath" above; two near-identical optional fields.
+    data?: Uint8Array; // Optional binary payload; presence conditions are not visible in this file.
+}
+interface PageInfo { // Per-page content; element type of ExtractionResult.pages.
+    number: number; // Page number; whether it is 1-based or affected by pageOffset is not visible here.
+    width: number;
+    height: number;
+    textItems: TextItem[];
+    images: ImageItem[];
+    textCount: number; // presumably textItems.length — TODO confirm
+    imageCount: number; // presumably images.length — TODO confirm
+}
+interface DocumentMetadata { // Document-level information; type of ExtractionResult.document.
+    filename: string;
+    pages: number; // A count (number), not the page list; the page list lives on ExtractionResult.pages.
+    textLength: number;
+    extractedAt: string; // Timestamp as a string; the exact format is not specified in this declaration.
+    metadata: Record<string, unknown>; // Free-form key/value data; values must be narrowed before use.
+    options: ExtractionOptions;
+}
+interface ExtractionResult { // Resolved value of PDFExtractor.extract() and extractPdfContent().
+    document: DocumentMetadata;
+    pages: PageInfo[];
+    images: ImageItem[];
+    textItems: TextItem[];
+    textWithRefs: string; // presumably the text with image references inserted — TODO confirm
+    cleanText: string; // presumably the text without image references — TODO confirm
+    summary?: DocumentSummary;
+    structuredData?: StructuredPageData; // Optional; likely tied to the generateStructuredData option — confirm.
+}
+interface DocumentSummary { // Aggregate counts; type of the optional ExtractionResult.summary.
+    totalPages: number;
+    totalTextItems: number;
+    totalImages: number;
+    totalTextLength: number;
+    averageImagesPerPage: string; // NOTE(review): typed as string, not number — presumably pre-formatted; confirm.
+    pagesWithImages: number;
+}
+interface StructuredPageData { // Page-oriented view of a document; type of the optional ExtractionResult.structuredData.
+    metadata: {
+        filename: string;
+        extractedAt: string; // Timestamp as a string; format not specified here.
+        totalPages: number;
+        totalTextLength: number;
+        totalImages: number;
+        extractionOptions: ExtractionOptions;
+    };
+    pages: PageData[];
+}
+interface PageData { // One page inside StructuredPageData.pages; also returned in arrays by TextExtractor methods.
+    pageNumber: number;
+    text: {
+        content: string;
+        rawText: string; // How this differs from "content" is not visible in this declaration.
+        wordCount: number;
+        characterCount: number;
+    };
+    images: PageImageData[];
+    imageCount: number; // presumably images.length — TODO confirm
+}
+interface PageImageData { // Image entry inside PageData.images.
+    id: string;
+    name: string; // Required here, although ImageItem.name is optional.
+    filename?: string;
+    path?: string;
+    position: { // Inline object with the same four members as the Position interface.
+        x: number;
+        y: number;
+        width: number;
+        height: number;
+    };
+    format: string; // Required here, although ImageItem.format is optional.
+    size?: number; // presumably a byte count — TODO confirm
+}
+type ImageExtractionEngine = "pdf-lib" | "poppler" | "auto"; // Allowed values of ExtractionOptions.imageEngine.
+interface ExtractionOptions { // Every field is optional; accepted by the extractor classes and the convenience functions.
+    extractText?: boolean;
+    extractImages?: boolean;
+    extractImageFiles?: boolean;
+    useImagePaths?: boolean;
+    imageOutputDir?: string;
+    imageRefFormat?: string; // Format string; see validateImageRefFormat() for the validator declared in this file.
+    includeImageRefs?: boolean;
+    includePageMarkers?: boolean;
+    pageMarkerFormat?: string;
+    /** Page number offset to align with visual PDF pages (e.g., +1 if PDF has cover page) */
+    pageOffset?: number;
+    /** Use combined extractor for accurate page boundaries (recommended) */
+    useCombinedExtractor?: boolean;
+    generateStructuredData?: boolean;
+    extractTextItems?: boolean;
+    specificPages?: number[]; // Whether page numbers are 1-based is not visible here — TODO confirm.
+    useCache?: boolean;
+    /** Image extraction engine to use */
+    imageEngine?: ImageExtractionEngine;
+    cacheDir?: string;
+    baseName?: string;
+    verbose?: boolean;
+    memoryLimit?: string; // NOTE(review): string here, while StreamingOptions.memoryLimit is a number — confirm expected format.
+    batchSize?: number;
+    progressCallback?: (progress: ProgressInfo) => void; // Called with a ProgressInfo; return value is ignored (void).
+}
+interface ProgressInfo { // Argument passed to ExtractionOptions.progressCallback.
+    currentPage: number;
+    totalPages: number;
+    phase: "text" | "images" | "processing" | "complete";
+    message?: string;
+}
+interface ExtractorConfig { // Input accepted by validateConfig().
+    pdfPath: string;
+    outputDir?: string;
+    options: ExtractionOptions;
+}
+interface ValidationError { // Plain data record (not an Error subtype); returned in arrays by the validate* functions.
+    field: string;
+    message: string;
+    value?: unknown; // Optional offending value; typed unknown, so narrow before use.
+}
+interface PageExtractionResult { // Resolved value of PDFExtractor.getPage().
+    pageNumber: number;
+    text: string;
+    rawText: string; // How this differs from "text" is not visible in this declaration.
+    textItems: TextItem[];
+    images: ImageItem[];
+    metadata: {
+        wordCount: number;
+        characterCount: number;
+        imageCount: number;
+    };
+}
+interface ExtractionError extends Error { // Error shape extended with a code and optional diagnostic data.
+    code: string; // The set of possible codes is not declared in this file.
+    context?: Record<string, unknown>;
+    validationErrors?: ValidationError[]; // Optional list of ValidationError records attached to the error.
+}
+type FormatPlaceholder = "id" | "name" | "page" | "index" | "path"; // Same five names as the properties of FormatContext.
+interface FormatContext { // One property per FormatPlaceholder member; presumably the values substituted into a format string — confirm.
+    id: string;
+    name: string;
+    page: number;
+    index: number;
+    path: string;
+}
+interface ProcessingPhase { // Exported type; not referenced by any other signature in this declaration file.
+    name: string;
+    description: string;
+    status: "not_started" | "in_progress" | "complete" | "error";
+    progress?: number; // Scale (0-1 vs 0-100) is not specified here — TODO confirm.
+    error?: string;
+}
+interface MemoryUsage { // Exported type; not referenced by any other signature in this declaration file.
+    used: number; // Units are not specified here — TODO confirm.
+    total: number; // Units are not specified here — TODO confirm.
+    percentage: number;
+    timestamp: number; // presumably epoch milliseconds — TODO confirm
+}
+interface StreamingOptions { // Exported type; not referenced by any other signature in this declaration file.
+    batchSize: number;
+    memoryLimit: number; // NOTE(review): number here, while ExtractionOptions.memoryLimit is a string — confirm units.
+    enableCaching: boolean;
+    cacheSize?: number;
+}
+interface OCROptions { // Exported type; no method declared in this file accepts it.
+    enabled: boolean;
+    language?: string;
+    confidence?: number; // Scale is not specified here — TODO confirm.
+    engine?: "tesseract" | "cloud";
+}
+interface AnalyticsData { // Exported type; not referenced by any other signature in this declaration file.
+    processingTime: number; // Units are not specified here — TODO confirm.
+    memoryPeak: number; // Units are not specified here — TODO confirm.
+    pagesPerSecond: number;
+    errorCount: number;
+    qualityScore?: number;
+}
+interface TemplateOptions { // Exported type; no method declared in this file accepts it.
+    format: "markdown" | "html" | "xml" | "json" | "custom";
+    template?: string; // presumably used with format "custom" — TODO confirm
+    variables?: Record<string, unknown>;
+}
+/**
+ * Main PDF content extractor class
+ *
+ * Provides comprehensive PDF content extraction capabilities including:
+ * - Text extraction with positioning and formatting
+ * - Image detection and extraction
+ * - Structured data generation
+ * - Page-specific extraction
+ * - Caching for performance optimization
+ *
+ * The private members listed in the class body carry no type information in
+ * this declaration file; only the public methods form the usable API.
+ *
+ * @example
+ * ```typescript
+ * const extractor = new PDFExtractor();
+ * const result = await extractor.extract('document.pdf', {
+ *   extractText: true,
+ *   extractImages: true,
+ *   verbose: true
+ * });
+ * ```
+ */
+declare class PDFExtractor {
+    private textExtractor;
+    private imageExtractor;
+    private formatProcessor;
+    private structuredDataGenerator;
+    private cacheManager;
+    /**
+     * Create a new PDFExtractor instance
+     *
+     * @param cacheDir - Optional directory for caching extracted data
+     */
+    constructor(cacheDir?: string);
+    /**
+     * Extract content from a PDF file
+     *
+     * This is the main extraction method that can extract text, images, or both
+     * depending on the provided options. It supports various output formats and
+     * processing modes.
+     *
+     * NOTE(review): ValidationError is declared in this file as a plain
+     * interface (field/message/value), not an Error subtype, while
+     * ExtractionError carries an optional validationErrors array. Confirm
+     * what is actually thrown when the configuration is invalid.
+     *
+     * @param pdfPath - Path to the PDF file to extract content from
+     * @param options - Configuration options for extraction
+     * @returns Promise resolving to complete extraction results
+     *
+     * @throws {ValidationError} When configuration is invalid
+     * @throws {ExtractionError} When PDF processing fails
+     *
+     * @example
+     * ```typescript
+     * // Extract both text and images
+     * const result = await extractor.extract('document.pdf', {
+     *   extractText: true,
+     *   extractImages: true,
+     *   extractImageFiles: true,
+     *   imageOutputDir: './images',
+     *   verbose: true
+     * });
+     *
+     * console.log(`Extracted ${result.images.length} images`);
+     * console.log(`Text: ${result.cleanText.substring(0, 100)}...`);
+     * ```
+     */
+    extract(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult>;
+    /**
+     * Extract only text content (optimized)
+     *
+     * This method is optimized for text-only extraction and is faster than
+     * the full extract() method when you only need text content.
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param options - Partial extraction options (images will be disabled)
+     * @returns Promise resolving to extracted text content
+     *
+     * @example
+     * ```typescript
+     * const text = await extractor.extractText('document.pdf', {
+     *   verbose: true
+     * });
+     * console.log(`Extracted ${text.length} characters`);
+     * ```
+     */
+    extractText(pdfPath: string, options?: Partial<ExtractionOptions>): Promise<string>;
+    /**
+     * Extract only image references (optimized)
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param options - Partial extraction options
+     * @returns Promise resolving to the type of ExtractionResult["images"],
+     *   i.e. an array of ImageItem
+     */
+    extractImages(pdfPath: string, options?: Partial<ExtractionOptions>): Promise<ExtractionResult["images"]>;
+    /**
+     * Extract and save image files
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param outputDir - Optional output directory; how it interacts with
+     *   options.imageOutputDir is not visible in this declaration (TODO confirm)
+     * @param options - Partial extraction options
+     * @returns Promise resolving to an array of strings, presumably the
+     *   paths of the saved files (confirm against the implementation)
+     */
+    extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
+    private validateConfiguration;
+    private processResults;
+    /**
+     * Get text for a specific page
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param pageNumber - Page to read; whether it is 1-based and whether
+     *   options.pageOffset applies is not visible here (TODO confirm)
+     * @param options - Optional extraction options
+     * @returns Promise resolving to a string
+     */
+    getText(pdfPath: string, pageNumber: number, options?: ExtractionOptions): Promise<string>;
+    /**
+     * Get images for a specific page
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param pageNumber - Page to read (indexing base not visible here)
+     * @param options - Optional extraction options
+     * @returns Promise resolving to an array of ImageItem
+     */
+    getImages(pdfPath: string, pageNumber: number, options?: ExtractionOptions): Promise<ImageItem[]>;
+    /**
+     * Get text items for a specific page
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param pageNumber - Page to read (indexing base not visible here)
+     * @param options - Optional extraction options
+     * @returns Promise resolving to an array of TextItem
+     */
+    getTextItems(pdfPath: string, pageNumber: number, options?: ExtractionOptions): Promise<TextItem[]>;
+    /**
+     * Get raw text for a specific page (no page markers, image refs, just clean text)
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param pageNumber - Page to read (indexing base not visible here)
+     * @param options - Optional extraction options
+     * @returns Promise resolving to a string
+     */
+    getRawText(pdfPath: string, pageNumber: number, options?: ExtractionOptions): Promise<string>;
+    /**
+     * Get complete page data (text + images + text items)
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param pageNumber - Page to read (indexing base not visible here)
+     * @param options - Optional extraction options
+     * @returns Promise resolving to a PageExtractionResult
+     */
+    getPage(pdfPath: string, pageNumber: number, options?: ExtractionOptions): Promise<PageExtractionResult>;
+    /**
+     * Extract text for a specific page from full text
+     */
+    private extractPageText;
+    /**
+     * Count words in text
+     */
+    private countWords;
+    /**
+     * Extract raw text without page markers, image references, or formatting
+     */
+    private extractRawText;
+    /**
+     * Clear cache for a PDF
+     *
+     * @param pdfPath - Path of the PDF whose cache entry should be cleared
+     */
+    clearCache(pdfPath: string): void;
+    /**
+     * Get cache statistics
+     *
+     * @returns Object with three numeric totals and the cache directory;
+     *   the unit of totalCacheSize is not specified here (TODO confirm)
+     */
+    getCacheStats(): {
+        totalCachedPdfs: number;
+        totalCachedPages: number;
+        totalCacheSize: number;
+        cacheDir: string;
+    };
+    private reportProgress;
+    private createValidationError;
+    private createExtractionError;
+}
+declare const pdfExtractor: PDFExtractor; // Exported PDFExtractor instance; also a member of the default export object.
+/**
+ * Text extraction from PDF files
+ *
+ * Handles text extraction using pdf-parse library with support for
+ * page-by-page extraction and metadata retrieval.
+ *
+ * Several methods below resolve to `any`, so the compiler does not check
+ * how callers use those results; their shape is not described in this file.
+ *
+ * @example
+ * ```typescript
+ * const textExtractor = new TextExtractor();
+ * const result = await textExtractor.extract('document.pdf');
+ * console.log(result.text);
+ * ```
+ */
+declare class TextExtractor {
+    /**
+     * Extract text content from PDF
+     *
+     * @param pdfPath - Path to the PDF file
+     * @returns Promise resolving to extraction result with text and metadata
+     * @throws {Error} When PDF extraction fails
+     */
+    extract(pdfPath: string): Promise<any>;
+    /**
+     * Extract text with page information
+     *
+     * @param pdfPath - Path to the PDF file
+     * @returns Promise resolving to extraction result with page-separated text
+     * @throws {Error} When PDF extraction fails
+     */
+    extractWithPages(pdfPath: string): Promise<any>;
+    /**
+     * Split text into approximate pages
+     */
+    private splitTextIntoPages;
+    /**
+     * Extract text items with position and metadata
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param options - Optional extraction options
+     * @returns Promise resolving to an array of TextItem
+     */
+    extractTextItems(pdfPath: string, options?: ExtractionOptions): Promise<TextItem[]>;
+    /**
+     * Extract text statistics
+     *
+     * @param pdfPath - Path to the PDF file
+     * @returns Promise resolving to six numeric statistics; the unit of
+     *   readingTime is not specified here (TODO confirm)
+     */
+    extractStatistics(pdfPath: string): Promise<{
+        characterCount: number;
+        wordCount: number;
+        lineCount: number;
+        pageCount: number;
+        averageWordsPerPage: number;
+        readingTime: number;
+    }>;
+    /**
+     * Extract text with font information (requires PDF.js)
+     *
+     * @param pdfPath - Path to the PDF file
+     * @returns Promise typed as `any`; result shape not described here
+     */
+    extractWithFontInfo(pdfPath: string): Promise<any>;
+    /**
+     * Clean extracted text
+     *
+     * Synchronous string-to-string helper; the cleaning rules are not
+     * visible in this declaration.
+     */
+    cleanText(text: string): string;
+    /**
+     * Extract text from specific page range
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param startPage - First page of the range
+     * @param endPage - Last page of the range; whether the bounds are
+     *   inclusive or 1-based is not visible here (TODO confirm)
+     * @returns Promise resolving to a string
+     */
+    extractPageRange(pdfPath: string, startPage: number, endPage: number): Promise<string>;
+    /**
+     * Search for text in PDF
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param searchTerm - Text to look for
+     * @param caseSensitive - Optional flag; the default is not visible here
+     * @returns Promise resolving to a found flag, an occurrence count,
+     *   a list of page numbers and a list of context strings
+     */
+    searchText(pdfPath: string, searchTerm: string, caseSensitive?: boolean): Promise<{
+        found: boolean;
+        occurrences: number;
+        pages: number[];
+        context: string[];
+    }>;
+    /**
+     * Extract text with page markers
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param pageMarkerFormat - Optional marker format string; the default
+     *   is not visible here
+     * @param options - Optional subset of settings (pageOffset,
+     *   includeImageRefs, imageRefFormat, imageEngine)
+     * @returns Promise resolving to the text plus an array of PageData
+     */
+    extractWithPageMarkers(pdfPath: string, pageMarkerFormat?: string, options?: {
+        pageOffset?: number;
+        includeImageRefs?: boolean;
+        imageRefFormat?: string;
+        imageEngine?: ImageExtractionEngine;
+    }): Promise<{
+        text: string;
+        pages: PageData[];
+    }>;
+    /**
+     * Extract text with accurate page boundaries using pdf-lib + pdf-parse
+     *
+     * @param pdfPath - Path to the PDF file
+     * @returns Promise resolving to the full text, an array of PageData
+     *   and the total page count
+     */
+    extractWithAccuratePages(pdfPath: string): Promise<{
+        fullText: string;
+        pages: PageData[];
+        totalPages: number;
+    }>;
+}
+/**
+ * Image extraction from PDF files using pdf-lib (clean implementation based on NestJS)
+ *
+ * Supports multiple extraction engines including pdf-lib and poppler for
+ * maximum compatibility and performance. Can extract image metadata,
+ * save image files, and handle various image formats.
+ *
+ * The extract methods resolve to `any`, so the result shape is not
+ * described by this declaration file.
+ *
+ * @example
+ * ```typescript
+ * const imageExtractor = new ImageExtractor();
+ * const result = await imageExtractor.extract('document.pdf', {
+ *   extractImageFiles: true,
+ *   imageOutputDir: './images',
+ *   imageEngine: 'auto'
+ * });
+ * ```
+ */
+declare class ImageExtractor {
+    /**
+     * Extract images from PDF file using configurable engines
+     *
+     * @param pdfPath - Path to the PDF file
+     * @param options - Extraction options including engine selection and output settings
+     * @returns Promise resolving to extraction result with image metadata
+     * @throws {Error} When image extraction fails
+     */
+    extract(pdfPath: string, options?: ExtractionOptions): Promise<any>;
+    /**
+     * Get available image extraction engines
+     *
+     * Static and asynchronous.
+     *
+     * @returns Promise resolving to one entry per engine, each with a name,
+     *   description, availability flag and capability details
+     */
+    static getAvailableEngines(): Promise<{
+        name: string;
+        description: string;
+        available: boolean;
+        capabilities: {
+            formats: string[];
+            supportsMetadata: boolean;
+            supportsEmbeddedImages: boolean;
+            supportsVectorImages: boolean;
+        };
+    }[]>;
+    /**
+     * Get engine recommendations
+     *
+     * Static and synchronous. Each entry names either "pdf-lib" or
+     * "poppler"; "auto" does not appear in the declared return type.
+     */
+    static getEngineRecommendations(): ({
+        useCase: string;
+        engine: "pdf-lib";
+        reason: string;
+    } | {
+        useCase: string;
+        engine: "poppler";
+        reason: string;
+    })[];
+    /**
+     * Extract images using pdf-lib (based on working NestJS implementation)
+     * @deprecated Use extract() with imageEngine: 'pdf-lib' instead
+     */
+    extractWithPdfLib(pdfPath: string, options?: ExtractionOptions): Promise<any>;
+    /**
+     * Extract a single image from a PDF object using the working approach
+     */
+    private extractImageFromPdfObject;
+    /**
+     * Extract image data with proper decompression handling using actual PDF metadata
+     */
+    private extractImageData;
+    /**
+     * Detect image format from binary data (from NestJS implementation)
+     */
+    private detectImageFormat;
+    /**
+     * Create a PNG file from raw pixel data using actual PDF metadata
+     */
+    private createPngFromPdfMetadata;
+}
+/**
+ * Handles formatting of image references and text processing
+ *
+ * All public methods are synchronous and return strings, string arrays,
+ * booleans or numbers. The placeholder syntax accepted in "format"
+ * arguments is not described in this declaration file.
+ */
+declare class FormatProcessor {
+    /**
+     * Generate text with image references inserted
+     *
+     * @param text - Source text
+     * @param images - Images to reference
+     * @param format - Image reference format string
+     * @param totalPages - Page count of the document
+     * @returns The resulting string
+     */
+    generateTextWithImageRefs(text: string, images: ImageItem[], format: string, totalPages: number): string;
+    /**
+     * Generate image-only reference list
+     *
+     * @param images - Images to reference
+     * @param format - Image reference format string
+     * @returns The resulting string
+     */
+    generateImageOnlyRefs(images: ImageItem[], format: string): string;
+    /**
+     * Format a single image reference
+     *
+     * @param image - Image to reference
+     * @param format - Image reference format string
+     * @param globalIndex - Index of the image; indexing base not visible
+     *   here (TODO confirm)
+     * @returns The formatted reference
+     */
+    formatImageReference(image: ImageItem, format: string, globalIndex: number): string;
+    /**
+     * Replace placeholders in format string
+     */
+    private replacePlaceholders;
+    /**
+     * Extract placeholders from format string
+     *
+     * @returns Array of strings; typed string[] rather than
+     *   FormatPlaceholder[]
+     */
+    extractPlaceholders(format: string): string[];
+    /**
+     * Validate format string
+     *
+     * @returns A boolean; the validity criteria are not visible here
+     */
+    isValidFormat(format: string): boolean;
+    /**
+     * Get default format based on options
+     *
+     * @param useImagePaths - Optional flag that selects the default format
+     */
+    getDefaultFormat(useImagePaths?: boolean): string;
+    /**
+     * Clean text by removing image references
+     *
+     * @param textWithRefs - Text that contains image references
+     * @param format - Image reference format string
+     */
+    cleanTextFromImageRefs(textWithRefs: string, format: string): string;
+    /**
+     * Count image references in text
+     *
+     * @param text - Text to scan
+     * @param format - Image reference format string
+     */
+    countImageReferences(text: string, format: string): number;
+    /**
+     * Generate summary text
+     *
+     * @param processingTime - Optional; unit not specified here
+     *   (TODO confirm)
+     */
+    generateSummary(totalPages: number, totalTextItems: number, totalImages: number, totalTextLength: number, processingTime?: number): string;
+    /**
+     * Format file size
+     *
+     * @param bytes - Size in bytes
+     */
+    formatFileSize(bytes: number): string;
+    /**
+     * Format duration
+     *
+     * @param milliseconds - Duration in milliseconds
+     */
+    formatDuration(milliseconds: number): string;
+}
+/**
+ * Validate extractor configuration
+ *
+ * @param config - Configuration to check (PDF path, optional output
+ *   directory and extraction options)
+ * @returns Array of ValidationError records; presumably empty when the
+ *   configuration is valid (TODO confirm)
+ */
+declare function validateConfig(config: ExtractorConfig): ValidationError[];
+/**
+ * Validate image reference format
+ *
+ * @param format - Image reference format string to check
+ * @returns Array of ValidationError records; presumably empty when the
+ *   format is valid (TODO confirm)
+ */
+declare function validateImageRefFormat(format: string): ValidationError[];
+/**
+ * Validate file path
+ *
+ * @param filePath - Path to check
+ * @param extensions - Optional list of strings, presumably the accepted
+ *   file extensions; expected form (with or without dot) is not visible
+ *   here (TODO confirm)
+ * @returns Array of ValidationError records; presumably empty when the
+ *   path is valid (TODO confirm)
+ */
+declare function validateFilePath(filePath: string, extensions?: string[]): ValidationError[];
+/**
+ * Extract content from a PDF file (convenience function)
+ *
+ * @param pdfPath - Path to the PDF file
+ * @param options - Extraction options
+ * @returns Promise resolving to extraction result
+ *
+ * @example
+ * ```typescript
+ * import { extractPdfContent } from 'pdf-plus';
+ *
+ * const result = await extractPdfContent('document.pdf', {
+ *   extractText: true,
+ *   extractImages: true,
+ *   verbose: true
+ * });
+ *
+ * console.log(`Extracted ${result.images.length} images from ${result.document.pages} pages`);
+ * ```
+ */
+declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions): Promise<ExtractionResult>;
+/**
+ * Extract only text content from a PDF (convenience function)
+ *
+ * @param pdfPath - Path to the PDF file
+ * @param options - Extraction options
+ * @returns Promise resolving to text content
+ *
+ * @example
+ * ```typescript
+ * import { extractText } from 'pdf-plus';
+ *
+ * const text = await extractText('document.pdf');
+ * console.log(`Extracted ${text.length} characters`);
+ * ```
+ */
+declare function extractText(pdfPath: string, options?: Partial<ExtractionOptions>): Promise<string>;
+/**
+ * Extract only image references from a PDF (convenience function)
+ *
+ * @param pdfPath - Path to the PDF file
+ * @param options - Extraction options
+ * @returns Promise resolving to array of image items
+ *
+ * @example
+ * ```typescript
+ * import { extractImages } from 'pdf-plus';
+ *
+ * const images = await extractImages('document.pdf', {
+ *   extractImageFiles: true,
+ *   imageOutputDir: './my-images'
+ * });
+ *
+ * console.log(`Extracted ${images.length} images`);
+ * ```
+ */
+declare function extractImages(pdfPath: string, options?: Partial<ExtractionOptions>): Promise<ImageItem[]>;
+/**
+ * Extract and save image files from a PDF (convenience function)
+ *
+ * @param pdfPath - Path to the PDF file
+ * @param outputDir - Directory to save images
+ * @param options - Extraction options
+ * @returns Promise resolving to array of saved file paths
+ *
+ * @example
+ * ```typescript
+ * import { extractImageFiles } from 'pdf-plus';
+ *
+ * const filePaths = await extractImageFiles('document.pdf', './images', {
+ *   verbose: true
+ * });
+ *
+ * console.log(`Saved ${filePaths.length} image files`);
+ * ```
+ */
+declare function extractImageFiles(pdfPath: string, outputDir?: string, options?: Partial<ExtractionOptions>): Promise<string[]>;
+declare const version = "1.0.0"; // Typed as the string literal "1.0.0", matching the package version of this release.
+declare const _default: { // Shape of the module's default export: the classes, the shared instance, the functions and the version.
+    PDFExtractor: typeof PDFExtractor;
+    pdfExtractor: PDFExtractor;
+    TextExtractor: typeof TextExtractor;
+    ImageExtractor: typeof ImageExtractor;
+    FormatProcessor: typeof FormatProcessor;
+    extractPdfContent: typeof extractPdfContent;
+    extractText: typeof extractText;
+    extractImages: typeof extractImages;
+    extractImageFiles: typeof extractImageFiles;
+    validateConfig: typeof validateConfig;
+    validateImageRefFormat: typeof validateImageRefFormat;
+    validateFilePath: typeof validateFilePath;
+    version: string; // Widened to string here, whereas the named "version" export is the literal "1.0.0".
+};
+/** Public surface of the module. PageData, PageExtractionResult, PageImageData and StructuredPageData are exported as types because they appear in public signatures (PDFExtractor.getPage, ExtractionResult.structuredData, TextExtractor page results) and callers need to be able to name them. */
+export { type AnalyticsData, type DocumentMetadata, type DocumentSummary, type ExtractionError, type ExtractionOptions, type ExtractionResult, type ExtractorConfig, type FontInfo, type FormatContext, type FormatPlaceholder, FormatProcessor, type ImageExtractionEngine, ImageExtractor, type ImageItem, type MemoryUsage, type OCROptions, PDFExtractor, type PageData, type PageExtractionResult, type PageImageData, type PageInfo, type Position, type ProcessingPhase, type ProgressInfo, type StreamingOptions, type StructuredPageData, type TemplateOptions, TextExtractor, type TextItem, type ValidationError, _default as default, extractImageFiles, extractImages, extractPdfContent, extractText, pdfExtractor, validateConfig, validateFilePath, validateImageRefFormat, version };