npm - pdf-oxide-wasm - Versions diffs - 0.3.13 → 0.3.15 - Mend

pdf-oxide-wasm 0.3.13 → 0.3.15

This diff shows the changes between publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (5) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "pdf-oxide-wasm",
-  "version": "0.3.13",
+  "version": "0.3.15",
   "description": "Fast, zero-dependency PDF toolkit for Node.js, browsers, and edge runtimes — text extraction, markdown/HTML conversion, search, form filling, creation, and editing. Rust core compiled to WebAssembly.",
   "license": "MIT OR Apache-2.0",
   "repository": {

package/pdf_oxide.d.ts CHANGED Viewed

@@ -1,6 +1,86 @@
 /* tslint:disable */
 /* eslint-disable */
+/**
+ * Style configuration for header/footer text.
+ */
+export class ArtifactStyle {
+    free(): void;
+    [Symbol.dispose](): void;
+    bold(): ArtifactStyle;
+    color(r: number, g: number, b: number): ArtifactStyle;
+    font(name: string, size: number): ArtifactStyle;
+    constructor();
+}
+/**
+ * A header or footer artifact definition.
+ */
+export class WasmArtifact {
+    free(): void;
+    [Symbol.dispose](): void;
+    static center(text: string): WasmArtifact;
+    static left(text: string): WasmArtifact;
+    constructor();
+    static right(text: string): WasmArtifact;
+    withOffset(offset: number): WasmArtifact;
+    withStyle(style: ArtifactStyle): WasmArtifact;
+}
+/**
+ * A footer definition.
+ */
+export class WasmFooter {
+    free(): void;
+    [Symbol.dispose](): void;
+    static center(text: string): WasmFooter;
+    static left(text: string): WasmFooter;
+    constructor();
+    static right(text: string): WasmFooter;
+}
+/**
+ * A header definition.
+ */
+export class WasmHeader {
+    free(): void;
+    [Symbol.dispose](): void;
+    static center(text: string): WasmHeader;
+    static left(text: string): WasmHeader;
+    constructor();
+    static right(text: string): WasmHeader;
+}
+/**
+ * OCR configuration for WebAssembly.
+ */
+export class WasmOcrConfig {
+    free(): void;
+    [Symbol.dispose](): void;
+    constructor();
+}
+/**
+ * OCR engine for WebAssembly.
+ */
+export class WasmOcrEngine {
+    free(): void;
+    [Symbol.dispose](): void;
+    constructor(_det_model_path: string, _rec_model_path: string, _dict_path: string, _config?: WasmOcrConfig | null);
+}
+/**
+ * A complete page template with header and footer.
+ */
+export class WasmPageTemplate {
+    free(): void;
+    [Symbol.dispose](): void;
+    footer(footer: WasmArtifact): WasmPageTemplate;
+    header(header: WasmArtifact): WasmPageTemplate;
+    constructor();
+    skipFirstPage(): WasmPageTemplate;
+}
 /**
  * Create new PDF documents from Markdown, HTML, or plain text.
  *
@@ -101,6 +181,28 @@ export class WasmPdfDocument {
      * @param data - File contents as a Uint8Array
      */
     embedFile(name: string, data: Uint8Array): void;
+    /**
+     * Erase both header and footer content.
+     *
+     * @param page_index - Zero-based page number
+     */
+    eraseArtifacts(page_index: number): void;
+    /**
+     * Erase existing footer content.
+     *
+     * Identifies existing text in the footer area (bottom 15%) and marks it for erasure.
+     *
+     * @param page_index - Zero-based page number
+     */
+    eraseFooter(page_index: number): void;
+    /**
+     * Erase existing header content.
+     *
+     * Identifies existing text in the header area (top 15%) and marks it for erasure.
+     *
+     * @param page_index - Zero-based page number
+     */
+    eraseHeader(page_index: number): void;
     /**
      * Erase (whiteout) a rectangular region on a page.
      */
@@ -128,8 +230,11 @@ export class WasmPdfDocument {
      *
      * Returns an array of objects with: char, bbox {x, y, width, height},
      * font_name, font_size, font_weight, is_italic, color {r, g, b}, etc.
+     *
+     * @param page_index - Zero-based page number
+     * @param region - Optional [x, y, width, height] to filter by
      */
-    extractChars(page_index: number): any;
+    extractChars(page_index: number, region?: Float32Array | null): any;
     /**
      * Extract image bytes from a page as PNG data.
      *
@@ -141,28 +246,80 @@ export class WasmPdfDocument {
      *
      * Returns an array of objects with: width, height, color_space,
      * bits_per_component, bbox (if available). Does NOT return raw image bytes.
+     *
+     * @param page_index - Zero-based page number
+     * @param region - Optional [x, y, width, height] to filter by
+     */
+    extractImages(page_index: number, region?: Float32Array | null): any;
+    /**
+     * Extract only straight lines from a page (v0.3.14).
+     *
+     * Identifies paths that form a single straight line segment.
+     *
+     * @param page_index - Zero-based page number
+     * @param region - Optional [x, y, width, height] to filter by
+     * @returns Array of path objects
      */
-    extractImages(page_index: number): any;
+    extractLines(page_index: number, region?: Float32Array | null): any;
     /**
      * Extract vector paths (lines, curves, shapes) from a page.
      *
      * @param page_index - Zero-based page number
+     * @param region - Optional [x, y, width, height] to filter by
      * @returns Array of path objects with bbox, stroke_color, fill_color, etc.
      */
-    extractPaths(page_index: number): any;
+    extractPaths(page_index: number, region?: Float32Array | null): any;
+    /**
+     * Extract only rectangles from a page (v0.3.14).
+     *
+     * Identifies paths that form axis-aligned rectangles.
+     *
+     * @param page_index - Zero-based page number
+     * @param region - Optional [x, y, width, height] to filter by
+     * @returns Array of path objects
+     */
+    extractRects(page_index: number, region?: Float32Array | null): any;
     /**
      * Extract span-level data from a page.
      *
      * Returns an array of objects with: text, bbox, font_name, font_size,
      * font_weight, is_italic, color, etc.
      */
-    extractSpans(page_index: number): any;
+    extractSpans(page_index: number, region?: Float32Array | null): any;
+    /**
+     * Extract tables from a page (v0.3.14).
+     *
+     * @param page_index - Zero-based page number
+     * @param region - Optional [x, y, width, height] to filter by
+     */
+    extractTables(page_index: number, region?: Float32Array | null): any;
     /**
      * Extract plain text from a single page.
      *
      * @param page_index - Zero-based page number
+     * @param region - Optional [x, y, width, height] to filter by
+     */
+    extractText(page_index: number, region?: Float32Array | null): string;
+    /**
+     * Extract text lines from a page.
+     *
+     * Returns an array of objects with: text, bbox, words (array of Word objects).
+     */
+    extractTextLines(page_index: number, region?: Float32Array | null): any;
+    /**
+     * Extract text using OCR (optical character recognition).
+     *
+     * NOTE: OCR is not yet supported in the WebAssembly build due to missing
+     * ONNX Runtime support for the web backend in the current implementation.
      */
-    extractText(page_index: number): string;
+    extractTextOcr(_page_index: number, _engine?: WasmOcrEngine | null): string;
+    /**
+     * Extract word-level data from a page.
+     *
+     * Returns an array of objects with: text, bbox, font_name, font_size,
+     * font_weight, is_italic, is_bold.
+     */
+    extractWords(page_index: number, region?: Float32Array | null): any;
     /**
      * Flatten all annotations in the document into page content.
      */
@@ -271,6 +428,33 @@ export class WasmPdfDocument {
      * Get the rotation of a page in degrees (0, 90, 180, 270).
      */
     pageRotation(page_index: number): number;
+    /**
+     * Identify and remove both headers and footers.
+     *
+     * Prioritizes ISO 32000 spec-compliant /Artifact tags, with a heuristic
+     * fallback for untagged PDFs.
+     *
+     * @param threshold - Fraction of pages (0.0-1.0) where text must repeat (heuristic mode)
+     */
+    removeArtifacts(threshold: number): number;
+    /**
+     * Identify and remove footers.
+     *
+     * Uses spec-compliant /Artifact tags when available (100% accuracy), or
+     * falls back to heuristic analysis of the bottom 15% of pages.
+     *
+     * @param threshold - Fraction of pages (0.0-1.0) where text must repeat (heuristic mode)
+     */
+    removeFooters(threshold: number): number;
+    /**
+     * Identify and remove headers.
+     *
+     * Uses spec-compliant /Artifact tags when available (100% accuracy), or
+     * falls back to heuristic analysis of the top 15% of pages.
+     *
+     * @param threshold - Fraction of pages (0.0-1.0) where text must repeat (heuristic mode)
+     */
+    removeHeaders(threshold: number): number;
     /**
      * Reposition an image on a page.
      */
@@ -289,13 +473,6 @@ export class WasmPdfDocument {
     rotatePage(page_index: number, degrees: number): void;
     /**
      * Save with encryption and return the resulting PDF as bytes.
-     *
-     * @param user_password - Password required to open the document
-     * @param owner_password - Password for full access (defaults to user_password)
-     * @param allow_print - Allow printing (default: true)
-     * @param allow_copy - Allow copying text (default: true)
-     * @param allow_modify - Allow modifying (default: true)
-     * @param allow_annotate - Allow annotations (default: true)
      */
     saveEncryptedToBytes(user_password: string, owner_password?: string | null, allow_print?: boolean | null, allow_copy?: boolean | null, allow_modify?: boolean | null, allow_annotate?: boolean | null): Uint8Array;
     /**
@@ -395,6 +572,13 @@ export class WasmPdfDocument {
      * Get the PDF version as [major, minor].
      */
     version(): Uint8Array;
+    /**
+     * Focus extraction on a specific rectangular region of a page (v0.3.14).
+     *
+     * @param page_index - Zero-based page number
+     * @param region - [x, y, width, height] in points
+     */
+    within(page_index: number, region: Float32Array): WasmPdfPageRegion;
     /**
      * Get XMP metadata from the document.
      *
@@ -402,3 +586,52 @@ export class WasmPdfDocument {
      */
     xmpMetadata(): any;
 }
+/**
+ * A focused view of a PDF page region for scoped extraction (v0.3.14).
+ */
+export class WasmPdfPageRegion {
+    private constructor();
+    free(): void;
+    [Symbol.dispose](): void;
+    /**
+     * Extract character-level data from this region.
+     */
+    extractChars(): any;
+    /**
+     * Extract images from this region.
+     */
+    extractImages(): any;
+    /**
+     * Extract straight lines from this region.
+     */
+    extractLines(): any;
+    /**
+     * Extract vector paths from this region.
+     */
+    extractPaths(): any;
+    /**
+     * Extract rectangles from this region.
+     */
+    extractRects(): any;
+    /**
+     * Extract tables from this region.
+     */
+    extractTables(): any;
+    /**
+     * Extract text from this region.
+     */
+    extractText(): string;
+    /**
+     * Extract text lines from this region.
+     */
+    extractTextLines(): any;
+    /**
+     * Extract text using OCR from this region.
+     */
+    extractTextOcr(_engine?: WasmOcrEngine | null): string;
+    /**
+     * Extract words from this region.
+     */
+    extractWords(): any;
+}