npm - pdf-oxide-wasm - Versions diffs - 0.3.10 - Mend

pdf-oxide-wasm 0.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,47 @@
+# pdf-oxide-wasm
+High-performance PDF text extraction and manipulation via WebAssembly. Built on the [PDF Oxide](https://github.com/yfedoseev/pdf_oxide) Rust core.
+## Quick Start
+```javascript
+const { WasmPdfDocument } = require("pdf-oxide-wasm");
+const fs = require("fs");
+const bytes = new Uint8Array(fs.readFileSync("document.pdf"));
+const doc = new WasmPdfDocument(bytes);
+console.log(`Pages: ${doc.pageCount()}`);
+console.log(doc.extractText(0));
+doc.free();
+```
+### ESM
+```javascript
+import { WasmPdfDocument } from "pdf-oxide-wasm";
+import fs from "fs";
+const bytes = new Uint8Array(await fs.promises.readFile("document.pdf"));
+const doc = new WasmPdfDocument(bytes);
+const text = doc.extractText(0);
+doc.free();
+```
+## Features
+- Text extraction (plain text, Markdown, HTML)
+- Character-level and span-level extraction with positions
+- PDF creation from Markdown, HTML, text, and images
+- Form field extraction and filling
+- PDF editing (metadata, rotation, cropping, annotations)
+- Encryption (AES-256)
+- Search with regex support
+## Documentation
+Full API reference and examples: [Getting Started (WASM)](https://github.com/yfedoseev/pdf_oxide/blob/main/docs/getting-started-wasm.md)
+## License
+MIT OR Apache-2.0

package/package.json ADDED Viewed

@@ -0,0 +1,27 @@
+{
+  "name": "pdf-oxide-wasm",
+  "version": "0.3.10",
+  "description": "High-performance PDF text extraction and manipulation via WebAssembly",
+  "license": "MIT OR Apache-2.0",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/yfedoseev/pdf_oxide"
+  },
+  "homepage": "https://github.com/yfedoseev/pdf_oxide/blob/main/docs/getting-started-wasm.md",
+  "files": [
+    "pdf_oxide_bg.wasm",
+    "pdf_oxide.js",
+    "pdf_oxide.d.ts",
+    "pdf_oxide_bg.wasm.d.ts",
+    "README.md"
+  ],
+  "main": "pdf_oxide.js",
+  "types": "pdf_oxide.d.ts",
+  "keywords": [
+    "pdf",
+    "wasm",
+    "webassembly",
+    "text-extraction",
+    "pdf-parser"
+  ]
+}

package/pdf_oxide.d.ts ADDED Viewed

@@ -0,0 +1,404 @@
+/* tslint:disable */
+/* eslint-disable */
+/**
+ * Create new PDF documents from Markdown, HTML, plain text, or image bytes.
+ *
+ * The constructor is declared `private`, so instances are obtained only through the
+ * static `from*` factory methods below (`fromMarkdown`, `fromHtml`, `fromText`,
+ * `fromImageBytes`, `fromMultipleImageBytes`).
+ *
+ * ```javascript
+ * const pdf = WasmPdf.fromMarkdown("# Hello\n\nWorld");
+ * const bytes = pdf.toBytes(); // Uint8Array
+ * console.log(`PDF size: ${pdf.size} bytes`);
+ * ```
+ *
+ * NOTE(review): `free()` and `[Symbol.dispose]()` carry no description in this file;
+ * presumably they release the underlying WebAssembly memory, as the `WasmPdfDocument`
+ * comment states for its own `.free()` — confirm against the generated JS glue.
+ */
+export class WasmPdf {
+    private constructor();
+    free(): void;
+    [Symbol.dispose](): void;
+    /**
+     * Create a PDF from HTML content.
+     *
+     * @param content - HTML string
+     * @param title - Optional document title
+     * @param author - Optional document author
+     * @returns A new `WasmPdf` instance
+     */
+    static fromHtml(content: string, title?: string | null, author?: string | null): WasmPdf;
+    /**
+     * Create a PDF from image bytes (PNG, JPEG, etc.).
+     *
+     * @param data - Image file contents as a Uint8Array
+     * @returns A new `WasmPdf` instance
+     */
+    static fromImageBytes(data: Uint8Array): WasmPdf;
+    /**
+     * Create a PDF from Markdown content.
+     *
+     * @param content - Markdown string
+     * @param title - Optional document title
+     * @param author - Optional document author
+     * @returns A new `WasmPdf` instance
+     */
+    static fromMarkdown(content: string, title?: string | null, author?: string | null): WasmPdf;
+    /**
+     * Create a PDF from multiple image byte arrays.
+     *
+     * Each image becomes a separate page. Pass an array of Uint8Arrays.
+     *
+     * The parameter is declared as `any`, so the compiler does not check the argument;
+     * the expected value is the array of Uint8Arrays described here.
+     *
+     * @param images_array - Array of Uint8Arrays, each containing image file bytes (PNG/JPEG)
+     * @returns A new `WasmPdf` instance
+     */
+    static fromMultipleImageBytes(images_array: any): WasmPdf;
+    /**
+     * Create a PDF from plain text.
+     *
+     * @param content - Plain text string
+     * @param title - Optional document title
+     * @param author - Optional document author
+     * @returns A new `WasmPdf` instance
+     */
+    static fromText(content: string, title?: string | null, author?: string | null): WasmPdf;
+    /**
+     * Get the PDF as a Uint8Array.
+     *
+     * @returns The PDF file contents
+     */
+    toBytes(): Uint8Array;
+    /**
+     * Get the size of the PDF in bytes.
+     *
+     * Exposed as a read-only property, not a method (see the class-level example).
+     */
+    readonly size: number;
+}
+/**
+ * A PDF document loaded from bytes for use in WebAssembly.
+ *
+ * Create an instance by passing PDF file bytes to the constructor.
+ * Call `.free()` when done to release memory.
+ *
+ * Conventions visible in this declaration:
+ * - `page_index` is documented as a zero-based page number on several methods
+ *   (e.g. `extractText`); the page-level methods that do not say so presumably
+ *   follow the same convention — TODO confirm.
+ * - Many structured results are typed `any`; their shape is described only in the
+ *   comment of each method, so the compiler does not check property access on them.
+ *
+ * NOTE(review): `[Symbol.dispose]()` is declared next to `free()` without a
+ * description; presumably it releases the same memory — confirm against the
+ * generated JS glue.
+ */
+export class WasmPdfDocument {
+    free(): void;
+    [Symbol.dispose](): void;
+    /**
+     * Apply all redactions in the document.
+     */
+    applyAllRedactions(): void;
+    /**
+     * Apply redactions on a page (removes redacted content permanently).
+     */
+    applyPageRedactions(page_index: number): void;
+    /**
+     * Authenticate with a password to decrypt an encrypted PDF.
+     *
+     * @param password - The password string
+     * @returns true if authentication succeeded
+     */
+    authenticate(password: string): boolean;
+    /**
+     * Clear all pending erase operations for a page.
+     */
+    clearEraseRegions(page_index: number): void;
+    /**
+     * Crop margins from all pages.
+     *
+     * NOTE(review): the unit of the four margin values is not stated here;
+     * presumably PDF user-space units (points) — confirm.
+     */
+    cropMargins(left: number, right: number, top: number, bottom: number): void;
+    /**
+     * Embed a file into the PDF document.
+     *
+     * @param name - Display name for the embedded file
+     * @param data - File contents as a Uint8Array
+     */
+    embedFile(name: string, data: Uint8Array): void;
+    /**
+     * Erase (whiteout) a rectangular region on a page.
+     *
+     * The rectangle uses the same `llx, lly, urx, ury` ordering as `eraseRegions`
+     * and `pageMediaBox`; presumably lower-left and upper-right corners — confirm.
+     */
+    eraseRegion(page_index: number, llx: number, lly: number, urx: number, ury: number): void;
+    /**
+     * Erase multiple rectangular regions on a page.
+     *
+     * @param page_index - Zero-based page number
+     * @param rects - Flat array of coordinates [llx1,lly1,urx1,ury1, llx2,lly2,urx2,ury2, ...]
+     */
+    eraseRegions(page_index: number, rects: Float32Array): void;
+    /**
+     * Export form field data as FDF or XFDF bytes.
+     *
+     * @param format - "fdf" or "xfdf" (default: "fdf")
+     * @returns Uint8Array containing the exported form data
+     */
+    exportFormData(format?: string | null): Uint8Array;
+    /**
+     * Extract plain text from all pages, separated by form feed characters.
+     */
+    extractAllText(): string;
+    /**
+     * Extract character-level data from a page.
+     *
+     * Returns an array of objects with: char, bbox {x, y, width, height},
+     * font_name, font_size, font_weight, is_italic, color {r, g, b}, etc.
+     *
+     * The return type is declared as `any`; the shape above is not compiler-checked.
+     */
+    extractChars(page_index: number): any;
+    /**
+     * Extract image bytes from a page as PNG data.
+     *
+     * Returns an array of objects with: width, height, data (Uint8Array of PNG bytes), format ("png").
+     *
+     * For metadata without the image bytes, see `extractImages`.
+     */
+    extractImageBytes(page_index: number): any;
+    /**
+     * Extract image metadata from a page.
+     *
+     * Returns an array of objects with: width, height, color_space,
+     * bits_per_component, bbox (if available). Does NOT return raw image bytes.
+     *
+     * For the image bytes themselves, see `extractImageBytes`.
+     */
+    extractImages(page_index: number): any;
+    /**
+     * Extract vector paths (lines, curves, shapes) from a page.
+     *
+     * @param page_index - Zero-based page number
+     * @returns Array of path objects with bbox, stroke_color, fill_color, etc.
+     */
+    extractPaths(page_index: number): any;
+    /**
+     * Extract span-level data from a page.
+     *
+     * Returns an array of objects with: text, bbox, font_name, font_size,
+     * font_weight, is_italic, color, etc.
+     *
+     * The return type is declared as `any`; the shape above is not compiler-checked.
+     */
+    extractSpans(page_index: number): any;
+    /**
+     * Extract plain text from a single page.
+     *
+     * @param page_index - Zero-based page number
+     */
+    extractText(page_index: number): string;
+    /**
+     * Flatten all annotations in the document into page content.
+     */
+    flattenAllAnnotations(): void;
+    /**
+     * Flatten all form fields into page content.
+     *
+     * After flattening, form field values become static text and are no longer editable.
+     */
+    flattenForms(): void;
+    /**
+     * Flatten form fields on a specific page.
+     *
+     * @param page_index - Zero-based page number
+     */
+    flattenFormsOnPage(page_index: number): void;
+    /**
+     * Flatten annotations on a page into the page content.
+     */
+    flattenPageAnnotations(page_index: number): void;
+    /**
+     * Get annotations from a page.
+     *
+     * @param page_index - Zero-based page number
+     * @returns Array of annotation objects with fields like subtype, rect, contents, etc.
+     */
+    getAnnotations(page_index: number): any;
+    /**
+     * Get the value of a specific form field by name.
+     *
+     * @param name - Full qualified field name (e.g., "name" or "topmostSubform[0].Page1[0].f1_01[0]")
+     * @returns The field value: string for text, boolean for checkbox, null if not found
+     */
+    getFormFieldValue(name: string): any;
+    /**
+     * Get all form fields from the document.
+     *
+     * Returns an array of form field objects, each with:
+     * - name: Full qualified field name
+     * - field_type: "text", "button", "choice", "signature", or "unknown"
+     * - value: string, boolean, array of strings, or null
+     * - tooltip: string or null
+     * - bounds: [x1, y1, x2, y2] or null
+     * - flags: number or null
+     * - max_length: number or null
+     * - is_readonly: boolean
+     * - is_required: boolean
+     */
+    getFormFields(): any;
+    /**
+     * Get the document outline (bookmarks / table of contents).
+     *
+     * @returns Array of outline items or null if no outline exists.
+     * Each item has: { title, page (number|null), dest_name (string, optional), children (array) }
+     */
+    getOutline(): any;
+    /**
+     * Check if the document has a structure tree (Tagged PDF).
+     */
+    hasStructureTree(): boolean;
+    /**
+     * Check if the document contains XFA form data.
+     *
+     * @returns true if the document has XFA form data
+     */
+    hasXfa(): boolean;
+    /**
+     * Merge another PDF (provided as bytes) into this document.
+     *
+     * @param data - The PDF file contents to merge as a Uint8Array
+     * @returns Number of pages merged
+     */
+    mergeFrom(data: Uint8Array): number;
+    /**
+     * Load a PDF document from raw bytes.
+     *
+     * @param data - The PDF file contents as a Uint8Array
+     * @throws Error if the PDF is invalid or cannot be parsed
+     */
+    constructor(data: Uint8Array);
+    /**
+     * Get the number of pages in the document.
+     */
+    pageCount(): number;
+    /**
+     * Get the CropBox of a page as [llx, lly, urx, ury], or null if not set.
+     *
+     * Declared as `any` (unlike `pageMediaBox`, which returns a `Float32Array`),
+     * so callers must check for null themselves.
+     */
+    pageCropBox(page_index: number): any;
+    /**
+     * Get information about images on a page.
+     *
+     * Returns an array of {name, bounds: [x, y, width, height], matrix: [a, b, c, d, e, f]}.
+     *
+     * NOTE(review): the `name` values here are presumably what `repositionImage`,
+     * `resizeImage`, and `setImageBounds` expect as their `name` argument — confirm.
+     */
+    pageImages(page_index: number): any;
+    /**
+     * Get page label ranges from the document.
+     *
+     * @returns Array of {start_page, style, prefix, start_value} objects, or empty array
+     */
+    pageLabels(): any;
+    /**
+     * Get the MediaBox of a page as [llx, lly, urx, ury].
+     */
+    pageMediaBox(page_index: number): Float32Array;
+    /**
+     * Get the rotation of a page in degrees (0, 90, 180, 270).
+     */
+    pageRotation(page_index: number): number;
+    /**
+     * Reposition an image on a page.
+     */
+    repositionImage(page_index: number, name: string, x: number, y: number): void;
+    /**
+     * Resize an image on a page.
+     */
+    resizeImage(page_index: number, name: string, width: number, height: number): void;
+    /**
+     * Rotate all pages by the given degrees.
+     */
+    rotateAllPages(degrees: number): void;
+    /**
+     * Rotate a page by the given degrees (adds to current rotation).
+     *
+     * To set an absolute rotation instead, see `setPageRotation`.
+     */
+    rotatePage(page_index: number, degrees: number): void;
+    /**
+     * Save with encryption and return the resulting PDF as bytes.
+     *
+     * @param user_password - Password required to open the document
+     * @param owner_password - Password for full access (defaults to user_password)
+     * @param allow_print - Allow printing (default: true)
+     * @param allow_copy - Allow copying text (default: true)
+     * @param allow_modify - Allow modifying (default: true)
+     * @param allow_annotate - Allow annotations (default: true)
+     * @returns Uint8Array containing the encrypted PDF
+     */
+    saveEncryptedToBytes(user_password: string, owner_password?: string | null, allow_print?: boolean | null, allow_copy?: boolean | null, allow_modify?: boolean | null, allow_annotate?: boolean | null): Uint8Array;
+    /**
+     * Save all edits and return the resulting PDF as bytes.
+     *
+     * @returns Uint8Array containing the modified PDF
+     */
+    saveToBytes(): Uint8Array;
+    /**
+     * Search for text across all pages.
+     *
+     * @param pattern - Regex pattern or literal text to search for
+     * @param case_insensitive - Case insensitive search (default: false)
+     * @param literal - Treat pattern as literal text, not regex (default: false)
+     * @param whole_word - Match whole words only (default: false)
+     * @param max_results - Maximum results to return, 0 = unlimited (default: 0)
+     *
+     * Returns an array of {page, text, bbox, start_index, end_index, span_boxes}.
+     */
+    search(pattern: string, case_insensitive?: boolean | null, literal?: boolean | null, whole_word?: boolean | null, max_results?: number | null): any;
+    /**
+     * Search for text on a specific page.
+     *
+     * Takes the same `pattern`, `case_insensitive`, `literal`, `whole_word`, and
+     * `max_results` parameters as `search`, preceded by `page_index`.
+     * NOTE(review): defaults and result shape are presumably the same as for
+     * `search` — confirm.
+     */
+    searchPage(page_index: number, pattern: string, case_insensitive?: boolean | null, literal?: boolean | null, whole_word?: boolean | null, max_results?: number | null): any;
+    /**
+     * Set the document author.
+     */
+    setAuthor(author: string): void;
+    /**
+     * Set the value of a form field.
+     *
+     * @param name - Full qualified field name
+     * @param value - New value: string for text fields, boolean for checkboxes
+     */
+    setFormFieldValue(name: string, value: any): void;
+    /**
+     * Set the complete bounds of an image on a page.
+     */
+    setImageBounds(page_index: number, name: string, x: number, y: number, width: number, height: number): void;
+    /**
+     * Set the document keywords.
+     */
+    setKeywords(keywords: string): void;
+    /**
+     * Set the CropBox of a page.
+     */
+    setPageCropBox(page_index: number, llx: number, lly: number, urx: number, ury: number): void;
+    /**
+     * Set the MediaBox of a page.
+     */
+    setPageMediaBox(page_index: number, llx: number, lly: number, urx: number, ury: number): void;
+    /**
+     * Set the rotation of a page (0, 90, 180, or 270 degrees).
+     *
+     * To add to the current rotation instead, see `rotatePage`.
+     */
+    setPageRotation(page_index: number, degrees: number): void;
+    /**
+     * Set the document subject.
+     */
+    setSubject(subject: string): void;
+    /**
+     * Set the document title.
+     */
+    setTitle(title: string): void;
+    /**
+     * Convert a single page to HTML.
+     *
+     * @param page_index - Zero-based page number
+     * @param preserve_layout - Use CSS positioning to preserve layout (default: false)
+     * @param detect_headings - Whether to detect headings (default: true)
+     * @param include_form_fields - NOTE(review): not described in the original comment;
+     *   presumably whether form field content is included in the output, default unknown — confirm
+     */
+    toHtml(page_index: number, preserve_layout?: boolean | null, detect_headings?: boolean | null, include_form_fields?: boolean | null): string;
+    /**
+     * Convert all pages to HTML.
+     *
+     * Takes the same option parameters as `toHtml`, without `page_index`.
+     * NOTE(review): defaults are presumably the same as for `toHtml` — confirm.
+     */
+    toHtmlAll(preserve_layout?: boolean | null, detect_headings?: boolean | null, include_form_fields?: boolean | null): string;
+    /**
+     * Convert a single page to Markdown.
+     *
+     * @param page_index - Zero-based page number
+     * @param detect_headings - Whether to detect headings (default: true)
+     * @param include_images - Whether to include images (default: true)
+     * @param include_form_fields - NOTE(review): not described in the original comment;
+     *   presumably whether form field content is included in the output, default unknown — confirm
+     */
+    toMarkdown(page_index: number, detect_headings?: boolean | null, include_images?: boolean | null, include_form_fields?: boolean | null): string;
+    /**
+     * Convert all pages to Markdown.
+     *
+     * Takes the same option parameters as `toMarkdown`, without `page_index`.
+     * NOTE(review): defaults are presumably the same as for `toMarkdown` — confirm.
+     */
+    toMarkdownAll(detect_headings?: boolean | null, include_images?: boolean | null, include_form_fields?: boolean | null): string;
+    /**
+     * Convert a single page to plain text (with layout preservation options).
+     *
+     * NOTE(review): despite the wording above, this signature exposes no option
+     * parameters — only `page_index`. How this differs from `extractText` is not
+     * stated here.
+     */
+    toPlainText(page_index: number): string;
+    /**
+     * Convert all pages to plain text.
+     */
+    toPlainTextAll(): string;
+    /**
+     * Get the PDF version as [major, minor].
+     *
+     * @returns The two version components as a Uint8Array
+     */
+    version(): Uint8Array;
+    /**
+     * Get XMP metadata from the document.
+     *
+     * @returns Object with XMP fields (dc_title, dc_creator, etc.) or null if no XMP
+     */
+    xmpMetadata(): any;
+}