npm - @bentopdf/pymupdf-wasm - Versions diffs - 0.1.10 → 0.11.12 - Mend

@bentopdf/pymupdf-wasm 0.1.10 → 0.11.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.js (changed)

@@ -1,7 +1,20 @@
 // src/page.ts
+function base64ToUint8Array(base64) {
+  const binaryStr = atob(base64);
+  const len = binaryStr.length;
+  const bytes = new Uint8Array(len);
+  const CHUNK_SIZE = 32768;
+  for (let i = 0; i < len; i += CHUNK_SIZE) {
+    const end = Math.min(i + CHUNK_SIZE, len);
+    for (let j = i; j < end; j++) {
+      bytes[j] = binaryStr.charCodeAt(j);
+    }
+  }
+  return bytes;
+}
 function uint8ArrayToBase64(bytes) {
-  let binary = "";
   const chunkSize = 32768;
+  let binary = "";
   for (let i = 0; i < bytes.length; i += chunkSize) {
     const chunk = bytes.subarray(i, Math.min(i + chunkSize, bytes.length));
     binary += String.fromCharCode.apply(null, Array.from(chunk));
@@ -114,11 +127,7 @@ _result
 `);
     if (result === "null") return null;
     const parsed = JSON.parse(result);
-    const binary = atob(parsed.data);
-    const bytes = new Uint8Array(binary.length);
-    for (let i = 0; i < binary.length; i++) {
-      bytes[i] = binary.charCodeAt(i);
-    }
+    const bytes = base64ToUint8Array(parsed.data);
     return { ...parsed, data: bytes };
   }
   insertImage(rect, imageData, options) {
@@ -235,11 +244,7 @@ mat = pymupdf.Matrix(${zoom}, ${zoom}).prerotate(${rotation})
 pix = page.get_pixmap(matrix=mat, alpha=${alpha ? "True" : "False"}, clip=${clipStr})
 base64.b64encode(pix.tobytes("png")).decode('ascii')
 `);
-    const binary = atob(result);
-    const bytes = new Uint8Array(binary.length);
-    for (let i = 0; i < binary.length; i++) {
-      bytes[i] = binary.charCodeAt(i);
-    }
+    const bytes = base64ToUint8Array(result);
     return bytes;
   }
   toSvg() {
@@ -1209,6 +1214,28 @@ async function convertPdfToRgb(pdfData) {
   console.log("[convertPdfToRgb] Conversion complete, output size:", copy.length);
   return copy;
 }
+function base64ToUint8Array2(base64) {
+  const binaryStr = atob(base64);
+  const len = binaryStr.length;
+  const bytes = new Uint8Array(len);
+  const CHUNK_SIZE = 32768;
+  for (let i = 0; i < len; i += CHUNK_SIZE) {
+    const end = Math.min(i + CHUNK_SIZE, len);
+    for (let j = i; j < end; j++) {
+      bytes[j] = binaryStr.charCodeAt(j);
+    }
+  }
+  return bytes;
+}
+function uint8ArrayToBase642(bytes) {
+  const CHUNK_SIZE = 32768;
+  const chunks = [];
+  for (let i = 0; i < bytes.length; i += CHUNK_SIZE) {
+    const chunk = bytes.subarray(i, Math.min(i + CHUNK_SIZE, bytes.length));
+    chunks.push(String.fromCharCode.apply(null, Array.from(chunk)));
+  }
+  return btoa(chunks.join(""));
+}
 var ASSETS = {
   pyodide: "pyodide.js",
   wheels: [
@@ -1265,6 +1292,8 @@ var PyMuPDF = class {
     );
     pyodide.runPython(`
 import pymupdf
+import cv2
+import numpy as np
 pymupdf.TOOLS.store_shrink(100)
 def repair_pdf(doc, save_path=None):
@@ -1280,6 +1309,91 @@ def repair_pdf(doc, save_path=None):
             f.write(repair_bytes)
         return None
     return pymupdf.open("pdf", repair_bytes)
+def detect_skew_hough(gray):
+    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
+    lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100, minLineLength=100, maxLineGap=10)
+    if lines is None or len(lines) < 5:
+        return None
+    angles = []
+    for line in lines:
+        x1, y1, x2, y2 = line[0]
+        if x2 - x1 == 0:
+            continue
+        angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
+        if abs(angle) < 45:
+            angles.append(angle)
+    if len(angles) < 3:
+        return None
+    return np.median(angles)
+def detect_skew_minarea(gray):
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+    coords = np.column_stack(np.where(binary > 0))
+    if len(coords) < 100:
+        return None, 0
+    rect = cv2.minAreaRect(coords)
+    angle = rect[-1]
+    if angle < -45:
+        angle = 90 + angle
+    elif angle > 45:
+        angle = angle - 90
+    return -angle, len(coords)
+def detect_skew_angle(img_array):
+    if len(img_array.shape) == 3:
+        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+    else:
+        gray = img_array
+    angle_minarea, content_count = detect_skew_minarea(gray)
+    if angle_minarea is not None and content_count > 1000 and abs(angle_minarea) > 0.1:
+        return angle_minarea
+    angle_hough = detect_skew_hough(gray)
+    if angle_hough is not None and abs(angle_hough) > 0.1:
+        return angle_hough
+    if angle_minarea is not None:
+        return angle_minarea
+    return 0.0
+def deskew_image(img_array, angle):
+    h, w = img_array.shape[:2]
+    center = (w // 2, h // 2)
+    M = cv2.getRotationMatrix2D(center, angle, 1.0)
+    cos_val = np.abs(M[0, 0])
+    sin_val = np.abs(M[0, 1])
+    new_w = int(h * sin_val + w * cos_val)
+    new_h = int(h * cos_val + w * sin_val)
+    M[0, 2] += (new_w - w) // 2
+    M[1, 2] += (new_h - h) // 2
+    if len(img_array.shape) == 3:
+        border_color = (255, 255, 255)
+    else:
+        border_color = 255
+    rotated = cv2.warpAffine(
+        img_array, M, (new_w, new_h),
+        borderMode=cv2.BORDER_CONSTANT,
+        borderValue=border_color
+    )
+    return rotated
 `);
     return pyodide;
   }
@@ -1473,12 +1587,8 @@ base64.b64encode(output).decode('ascii')
       pyodide.FS.unlink(inputPath);
     } catch {
     }
-    const binary = atob(result);
-    const bytes = new Uint8Array(binary.length);
-    for (let i = 0; i < binary.length; i++) {
-      bytes[i] = binary.charCodeAt(i);
-    }
-    return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
+    const bytes = base64ToUint8Array2(result);
+    return new Blob([bytes], { type: "application/pdf" });
   }
   /**
    * Repair a PDF by re-opening and re-saving with garbage collection and compression.
@@ -1510,12 +1620,8 @@ base64.b64encode(output).decode('ascii')
       pyodide.FS.unlink(inputPath);
     } catch {
     }
-    const binary = atob(result);
-    const bytes = new Uint8Array(binary.length);
-    for (let i = 0; i < binary.length; i++) {
-      bytes[i] = binary.charCodeAt(i);
-    }
-    return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
+    const bytes = base64ToUint8Array2(result);
+    return new Blob([bytes], { type: "application/pdf" });
   }
   async xpsToPdf(xps) {
     return this.convertToPdf(xps, { filetype: "xps" });
@@ -1559,12 +1665,8 @@ output = _multi_img_pdf.tobytes(garbage=3, deflate=True)
 _multi_img_pdf.close()
 base64.b64encode(output).decode('ascii')
 `);
-    const binary = atob(result);
-    const bytes = new Uint8Array(binary.length);
-    for (let i = 0; i < binary.length; i++) {
-      bytes[i] = binary.charCodeAt(i);
-    }
-    return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
+    const bytes = base64ToUint8Array2(result);
+    return new Blob([bytes], { type: "application/pdf" });
   }
   async pdfToImages(pdf, options) {
     const pyodide = await this.getPyodide();
@@ -1584,11 +1686,7 @@ mat = pymupdf.Matrix(${zoom}, ${zoom})
 pix = page.get_pixmap(matrix=mat)
 base64.b64encode(pix.tobytes("${format}")).decode('ascii')
 `);
-      const binary = atob(result);
-      const bytes = new Uint8Array(binary.length);
-      for (let i = 0; i < binary.length; i++) {
-        bytes[i] = binary.charCodeAt(i);
-      }
+      const bytes = base64ToUint8Array2(result);
       results.push(bytes);
     }
     doc.close();
@@ -1694,17 +1792,26 @@ doc.close()
 base64.b64encode(pdf_bytes).decode('ascii')
 `);
-    const binaryStr = atob(result);
-    const bytes = new Uint8Array(binaryStr.length);
-    for (let i = 0; i < binaryStr.length; i++) {
-      bytes[i] = binaryStr.charCodeAt(i);
-    }
+    const bytes = base64ToUint8Array2(result);
     return new Blob([bytes], { type: "application/pdf" });
   }
   async htmlToPdf(html, options) {
     const pyodide = await this.getPyodide();
-    const escapedHtml = html.replace(/\\/g, "\\\\").replace(/'/g, "\\'").replace(/\n/g, "\\n");
-    const escapedCss = options?.css?.replace(/\\/g, "\\\\").replace(/'/g, "\\'").replace(/\n/g, "\\n") ?? "";
+    const encoder = new TextEncoder();
+    const htmlBase64 = uint8ArrayToBase642(encoder.encode(html));
+    const cssBase64 = options?.css ? uint8ArrayToBase642(encoder.encode(options.css)) : "";
+    const attachmentsList = [];
+    if (options?.attachments) {
+      for (const att of options.attachments) {
+        if (att.content && att.content.length > 0) {
+          attachmentsList.push({
+            name: att.filename,
+            data: uint8ArrayToBase642(att.content)
+          });
+        }
+      }
+    }
+    pyodide.globals.set("attachments_json", JSON.stringify(attachmentsList));
     const pageSize = options?.pageSize ?? "a4";
     let margins = { top: 36, right: 36, bottom: 36, left: 36 };
     if (typeof options?.margins === "number") {
@@ -1715,73 +1822,91 @@ base64.b64encode(pdf_bytes).decode('ascii')
     const result = pyodide.runPython(`
 import base64
 import io
-import re
 import json
+import re
-html_content = '''${escapedHtml}'''
-css_content = '''${escapedCss}'''
-# Extract links from HTML before processing
-link_pattern = r'<a[^>]*href=["\\'](https?://[^"\\'>]+)["\\'"][^>]*>([^<]+)</a>'
-links = re.findall(link_pattern, html_content, re.IGNORECASE)
-# links is a list of (url, text) tuples
+html_content = base64.b64decode("${htmlBase64}").decode('utf-8')
+css_content = base64.b64decode("${cssBase64}").decode('utf-8') if "${cssBase64}" else ""
+# Clean up external resources that Story can't load
 html_content = re.sub(r'<link[^>]*stylesheet[^>]*>', '', html_content, flags=re.IGNORECASE)
 html_content = re.sub(r'<link[^>]*href=[^>]*>', '', html_content, flags=re.IGNORECASE)
-html_content = re.sub(r'<script[^>]*src=[^>]*>.*?<\\/script>', '', html_content, flags=re.IGNORECASE|re.DOTALL)
-html_content = re.sub(r'<script[^>]*src=[^>]*/>', '', html_content, flags=re.IGNORECASE)
+html_content = re.sub(r'<script[^>]*>.*?<\/script>', '', html_content, flags=re.IGNORECASE|re.DOTALL)
+if css_content:
+    if '<head>' in html_content:
+        html_content = html_content.replace('<head>', '<head><style>' + css_content + '</style>')
+    else:
+        html_content = '<style>' + css_content + '</style>' + html_content
 mediabox = pymupdf.paper_rect("${pageSize}")
 where = mediabox + (${margins.left}, ${margins.top}, -${margins.right}, -${margins.bottom})
-story = pymupdf.Story(html=html_content, user_css=css_content if css_content else None)
+doc = pymupdf.open()
+story = pymupdf.Story(html=html_content)
 buffer = io.BytesIO()
 writer = pymupdf.DocumentWriter(buffer)
-def rectfn(rect_num, filled):
-    if rect_num == 0 or filled == 0:
-        return mediabox, where, None
-    return mediabox, where, None
+more_pages = True
+page_num = 0
+while more_pages:
+    dev = writer.begin_page(mediabox)
+    more_content, filled = story.place(where)
+    story.draw(dev)
+    writer.end_page()
+    more_pages = more_content
+    page_num += 1
-story.write(writer, rectfn)
 writer.close()
-# Now open the PDF and add link annotations
 buffer.seek(0)
 doc = pymupdf.open("pdf", buffer.read())
-# For each link found in HTML, search for the text and add a link annotation
-for url, text in links:
-    text = text.strip()
-    if not text:
-        continue
-    # Search all pages for this text
-    for page_num in range(doc.page_count):
-        page = doc[page_num]
-        # Search for the link text
-        text_instances = page.search_for(text)
-        for rect in text_instances:
-            # Add a link annotation
-            link = page.insert_link({
-                "kind": pymupdf.LINK_URI,
-                "from": rect,
-                "uri": url
-            })
-# Save the modified PDF
-output_buffer = io.BytesIO()
-doc.save(output_buffer)
+link_pattern = re.compile(r'<a[^>]+href=["\\'](https?://[^"\\'>]+|mailto:[^"\\'>]+)["\\'][^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
+found_links = link_pattern.findall(html_content)
+for page in doc:
+    for link_uri, anchor_text in found_links:
+        clean_text = re.sub(r'<[^>]+>', '', anchor_text)
+        clean_text = ' '.join(clean_text.split())
+        if len(clean_text) > 3:
+            text_instances = page.search_for(clean_text)
+            for inst in text_instances:
+                try:
+                    link_dict = {
+                        'kind': pymupdf.LINK_URI,
+                        'from': inst,
+                        'uri': link_uri
+                    }
+                    page.insert_link(link_dict)
+                except Exception as e:
+                    pass
+att_json = attachments_json
+if att_json:
+    try:
+        atts = json.loads(att_json)
+        for att in atts:
+            name = att.get('name', 'unnamed')
+            data = base64.b64decode(att.get('data', ''))
+            if data:
+                doc.embfile_add(name, data)
+    except:
+        pass
+final_pdf = doc.tobytes(garbage=3, deflate=True)
 doc.close()
-pdf_bytes = output_buffer.getvalue()
-base64.b64encode(pdf_bytes).decode('ascii')
+base64.b64encode(final_pdf).decode('ascii')
 `);
-    const binaryStr = atob(result);
-    const bytes = new Uint8Array(binaryStr.length);
-    for (let i = 0; i < binaryStr.length; i++) {
-      bytes[i] = binaryStr.charCodeAt(i);
+    try {
+      pyodide.globals.delete("attachments_json");
+    } catch {
     }
+    const bytes = base64ToUint8Array2(result);
     return new Blob([bytes], { type: "application/pdf" });
   }
   async pdfToMarkdown(pdf, options) {
@@ -2034,12 +2159,100 @@ base64.b64encode(pdf_bytes).decode('ascii')
       pyodide.FS.unlink(inputPath);
     } catch {
     }
-    const binary = atob(result);
+    const bytes = base64ToUint8Array2(result);
+    return new Blob([bytes], { type: "application/pdf" });
+  }
+  async deskewPdf(pdf, options) {
+    const pyodide = await this.getPyodide();
+    const docId = ++this.docCounter;
+    const inputPath = `/deskew_input_${docId}`;
+    const threshold = options?.threshold ?? 0.5;
+    const dpi = options?.dpi ?? 150;
+    const maxAngle = options?.maxAngle ?? 45;
+    const pages = options?.pages;
+    const buf = await pdf.arrayBuffer();
+    pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
+    const pagesArg = pages ? `[${pages.join(", ")}]` : "None";
+    const result = pyodide.runPython(`
+import base64
+import json
+src_doc = pymupdf.open("${inputPath}")
+src_doc = repair_pdf(src_doc)
+out_doc = pymupdf.open()
+zoom = ${dpi} / 72.0
+mat = pymupdf.Matrix(zoom, zoom)
+page_indices = ${pagesArg} if ${pagesArg} is not None else range(src_doc.page_count)
+angles = []
+corrected = []
+for page_idx in page_indices:
+    if page_idx < 0 or page_idx >= src_doc.page_count:
+        continue
+    page = src_doc[page_idx]
+    orig_rect = page.rect
+    pix = page.get_pixmap(matrix=mat, alpha=False)
+    img_data = pix.samples
+    img_array = np.frombuffer(img_data, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
+    if pix.n == 4:
+        img_array = cv2.cvtColor(img_array, cv2.COLOR_RGBA2RGB)
+    angle = detect_skew_angle(img_array)
+    angles.append(float(angle))
+    should_correct = abs(angle) >= ${threshold} and abs(angle) <= ${maxAngle}
+    corrected.append(should_correct)
+    if should_correct:
+        corrected_img = deskew_image(img_array, angle)
+        success, img_bytes = cv2.imencode('.png', cv2.cvtColor(corrected_img, cv2.COLOR_RGB2BGR))
+        if not success:
+            raise ValueError(f"Failed to encode corrected image for page {page_idx}")
+        img_bytes = img_bytes.tobytes()
+    else:
+        img_bytes = pix.tobytes("png")
+    new_page = out_doc.new_page(width=orig_rect.width, height=orig_rect.height)
+    new_page.insert_image(new_page.rect, stream=img_bytes)
+src_doc.close()
+total_pages = len(angles)
+corrected_count = sum(1 for c in corrected if c)
+pdf_bytes = out_doc.tobytes(garbage=3, deflate=True)
+out_doc.close()
+result_json = json.dumps({
+    "totalPages": total_pages,
+    "correctedPages": corrected_count,
+    "angles": angles,
+    "corrected": corrected
+})
+(base64.b64encode(pdf_bytes).decode('ascii'), result_json)
+`);
+    try {
+      pyodide.FS.unlink(inputPath);
+    } catch {
+    }
+    const [pdfBase64, resultJson] = result;
+    const binary = atob(pdfBase64);
     const bytes = new Uint8Array(binary.length);
     for (let i = 0; i < binary.length; i++) {
       bytes[i] = binary.charCodeAt(i);
     }
-    return new Blob([bytes], { type: "application/pdf" });
+    const deskewResult = JSON.parse(resultJson);
+    return {
+      pdf: new Blob([bytes], { type: "application/pdf" }),
+      result: deskewResult
+    };
   }
   /**
    * Compress a PDF using multiple optimization techniques.

package/package.json (changed)

@@ -1,6 +1,6 @@
 {
     "name": "@bentopdf/pymupdf-wasm",
-    "version": "0.1.10",
+    "version": "0.11.12",
     "description": "PyMuPDF compiled to WebAssembly - Full PDF manipulation in the browser",
     "type": "module",
     "main": "dist/index.js",

package/types/index.d.ts (changed)

@@ -342,4 +342,23 @@ export declare class PyMuPDF {
         savingsPercent: number;
         pageCount: number;
     }>;
+    deskewPdf(pdf: Blob | File, options?: DeskewOptions): Promise<{
+        pdf: Blob;
+        result: DeskewResult;
+    }>;
+}
+export interface DeskewOptions {
+    threshold?: number;
+    dpi?: number;
+    maxAngle?: number;
+    pages?: number[];
+}
+export interface DeskewResult {
+    totalPages: number;
+    correctedPages: number;
+    angles: number[];
+    corrected: boolean[];
 }