@bentopdf/pymupdf-wasm 0.1.10 → 0.11.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +300 -87
- package/package.json +1 -1
- package/types/index.d.ts +19 -0
package/dist/index.js
CHANGED
|
@@ -1,7 +1,20 @@
|
|
|
1
1
|
// src/page.ts
|
|
2
|
+
function base64ToUint8Array(base64) {
|
|
3
|
+
const binaryStr = atob(base64);
|
|
4
|
+
const len = binaryStr.length;
|
|
5
|
+
const bytes = new Uint8Array(len);
|
|
6
|
+
const CHUNK_SIZE = 32768;
|
|
7
|
+
for (let i = 0; i < len; i += CHUNK_SIZE) {
|
|
8
|
+
const end = Math.min(i + CHUNK_SIZE, len);
|
|
9
|
+
for (let j = i; j < end; j++) {
|
|
10
|
+
bytes[j] = binaryStr.charCodeAt(j);
|
|
11
|
+
}
|
|
12
|
+
}
|
|
13
|
+
return bytes;
|
|
14
|
+
}
|
|
2
15
|
function uint8ArrayToBase64(bytes) {
|
|
3
|
-
let binary = "";
|
|
4
16
|
const chunkSize = 32768;
|
|
17
|
+
let binary = "";
|
|
5
18
|
for (let i = 0; i < bytes.length; i += chunkSize) {
|
|
6
19
|
const chunk = bytes.subarray(i, Math.min(i + chunkSize, bytes.length));
|
|
7
20
|
binary += String.fromCharCode.apply(null, Array.from(chunk));
|
|
@@ -114,11 +127,7 @@ _result
|
|
|
114
127
|
`);
|
|
115
128
|
if (result === "null") return null;
|
|
116
129
|
const parsed = JSON.parse(result);
|
|
117
|
-
const
|
|
118
|
-
const bytes = new Uint8Array(binary.length);
|
|
119
|
-
for (let i = 0; i < binary.length; i++) {
|
|
120
|
-
bytes[i] = binary.charCodeAt(i);
|
|
121
|
-
}
|
|
130
|
+
const bytes = base64ToUint8Array(parsed.data);
|
|
122
131
|
return { ...parsed, data: bytes };
|
|
123
132
|
}
|
|
124
133
|
insertImage(rect, imageData, options) {
|
|
@@ -235,11 +244,7 @@ mat = pymupdf.Matrix(${zoom}, ${zoom}).prerotate(${rotation})
|
|
|
235
244
|
pix = page.get_pixmap(matrix=mat, alpha=${alpha ? "True" : "False"}, clip=${clipStr})
|
|
236
245
|
base64.b64encode(pix.tobytes("png")).decode('ascii')
|
|
237
246
|
`);
|
|
238
|
-
const
|
|
239
|
-
const bytes = new Uint8Array(binary.length);
|
|
240
|
-
for (let i = 0; i < binary.length; i++) {
|
|
241
|
-
bytes[i] = binary.charCodeAt(i);
|
|
242
|
-
}
|
|
247
|
+
const bytes = base64ToUint8Array(result);
|
|
243
248
|
return bytes;
|
|
244
249
|
}
|
|
245
250
|
toSvg() {
|
|
@@ -1209,6 +1214,28 @@ async function convertPdfToRgb(pdfData) {
|
|
|
1209
1214
|
console.log("[convertPdfToRgb] Conversion complete, output size:", copy.length);
|
|
1210
1215
|
return copy;
|
|
1211
1216
|
}
|
|
1217
|
+
function base64ToUint8Array2(base64) {
|
|
1218
|
+
const binaryStr = atob(base64);
|
|
1219
|
+
const len = binaryStr.length;
|
|
1220
|
+
const bytes = new Uint8Array(len);
|
|
1221
|
+
const CHUNK_SIZE = 32768;
|
|
1222
|
+
for (let i = 0; i < len; i += CHUNK_SIZE) {
|
|
1223
|
+
const end = Math.min(i + CHUNK_SIZE, len);
|
|
1224
|
+
for (let j = i; j < end; j++) {
|
|
1225
|
+
bytes[j] = binaryStr.charCodeAt(j);
|
|
1226
|
+
}
|
|
1227
|
+
}
|
|
1228
|
+
return bytes;
|
|
1229
|
+
}
|
|
1230
|
+
function uint8ArrayToBase642(bytes) {
|
|
1231
|
+
const CHUNK_SIZE = 32768;
|
|
1232
|
+
const chunks = [];
|
|
1233
|
+
for (let i = 0; i < bytes.length; i += CHUNK_SIZE) {
|
|
1234
|
+
const chunk = bytes.subarray(i, Math.min(i + CHUNK_SIZE, bytes.length));
|
|
1235
|
+
chunks.push(String.fromCharCode.apply(null, Array.from(chunk)));
|
|
1236
|
+
}
|
|
1237
|
+
return btoa(chunks.join(""));
|
|
1238
|
+
}
|
|
1212
1239
|
var ASSETS = {
|
|
1213
1240
|
pyodide: "pyodide.js",
|
|
1214
1241
|
wheels: [
|
|
@@ -1265,6 +1292,8 @@ var PyMuPDF = class {
|
|
|
1265
1292
|
);
|
|
1266
1293
|
pyodide.runPython(`
|
|
1267
1294
|
import pymupdf
|
|
1295
|
+
import cv2
|
|
1296
|
+
import numpy as np
|
|
1268
1297
|
pymupdf.TOOLS.store_shrink(100)
|
|
1269
1298
|
|
|
1270
1299
|
def repair_pdf(doc, save_path=None):
|
|
@@ -1280,6 +1309,91 @@ def repair_pdf(doc, save_path=None):
|
|
|
1280
1309
|
f.write(repair_bytes)
|
|
1281
1310
|
return None
|
|
1282
1311
|
return pymupdf.open("pdf", repair_bytes)
|
|
1312
|
+
|
|
1313
|
+
def detect_skew_hough(gray):
|
|
1314
|
+
edges = cv2.Canny(gray, 50, 150, apertureSize=3)
|
|
1315
|
+
lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100, minLineLength=100, maxLineGap=10)
|
|
1316
|
+
|
|
1317
|
+
if lines is None or len(lines) < 5:
|
|
1318
|
+
return None
|
|
1319
|
+
|
|
1320
|
+
angles = []
|
|
1321
|
+
for line in lines:
|
|
1322
|
+
x1, y1, x2, y2 = line[0]
|
|
1323
|
+
if x2 - x1 == 0:
|
|
1324
|
+
continue
|
|
1325
|
+
angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
|
|
1326
|
+
if abs(angle) < 45:
|
|
1327
|
+
angles.append(angle)
|
|
1328
|
+
|
|
1329
|
+
if len(angles) < 3:
|
|
1330
|
+
return None
|
|
1331
|
+
|
|
1332
|
+
return np.median(angles)
|
|
1333
|
+
|
|
1334
|
+
def detect_skew_minarea(gray):
|
|
1335
|
+
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
|
|
1336
|
+
coords = np.column_stack(np.where(binary > 0))
|
|
1337
|
+
|
|
1338
|
+
if len(coords) < 100:
|
|
1339
|
+
return None, 0
|
|
1340
|
+
|
|
1341
|
+
rect = cv2.minAreaRect(coords)
|
|
1342
|
+
angle = rect[-1]
|
|
1343
|
+
|
|
1344
|
+
if angle < -45:
|
|
1345
|
+
angle = 90 + angle
|
|
1346
|
+
elif angle > 45:
|
|
1347
|
+
angle = angle - 90
|
|
1348
|
+
|
|
1349
|
+
return -angle, len(coords)
|
|
1350
|
+
|
|
1351
|
+
def detect_skew_angle(img_array):
|
|
1352
|
+
if len(img_array.shape) == 3:
|
|
1353
|
+
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
|
1354
|
+
else:
|
|
1355
|
+
gray = img_array
|
|
1356
|
+
|
|
1357
|
+
angle_minarea, content_count = detect_skew_minarea(gray)
|
|
1358
|
+
|
|
1359
|
+
if angle_minarea is not None and content_count > 1000 and abs(angle_minarea) > 0.1:
|
|
1360
|
+
return angle_minarea
|
|
1361
|
+
|
|
1362
|
+
angle_hough = detect_skew_hough(gray)
|
|
1363
|
+
|
|
1364
|
+
if angle_hough is not None and abs(angle_hough) > 0.1:
|
|
1365
|
+
return angle_hough
|
|
1366
|
+
|
|
1367
|
+
if angle_minarea is not None:
|
|
1368
|
+
return angle_minarea
|
|
1369
|
+
|
|
1370
|
+
return 0.0
|
|
1371
|
+
|
|
1372
|
+
def deskew_image(img_array, angle):
|
|
1373
|
+
h, w = img_array.shape[:2]
|
|
1374
|
+
center = (w // 2, h // 2)
|
|
1375
|
+
|
|
1376
|
+
M = cv2.getRotationMatrix2D(center, angle, 1.0)
|
|
1377
|
+
|
|
1378
|
+
cos_val = np.abs(M[0, 0])
|
|
1379
|
+
sin_val = np.abs(M[0, 1])
|
|
1380
|
+
new_w = int(h * sin_val + w * cos_val)
|
|
1381
|
+
new_h = int(h * cos_val + w * sin_val)
|
|
1382
|
+
|
|
1383
|
+
M[0, 2] += (new_w - w) // 2
|
|
1384
|
+
M[1, 2] += (new_h - h) // 2
|
|
1385
|
+
|
|
1386
|
+
if len(img_array.shape) == 3:
|
|
1387
|
+
border_color = (255, 255, 255)
|
|
1388
|
+
else:
|
|
1389
|
+
border_color = 255
|
|
1390
|
+
|
|
1391
|
+
rotated = cv2.warpAffine(
|
|
1392
|
+
img_array, M, (new_w, new_h),
|
|
1393
|
+
borderMode=cv2.BORDER_CONSTANT,
|
|
1394
|
+
borderValue=border_color
|
|
1395
|
+
)
|
|
1396
|
+
return rotated
|
|
1283
1397
|
`);
|
|
1284
1398
|
return pyodide;
|
|
1285
1399
|
}
|
|
@@ -1473,12 +1587,8 @@ base64.b64encode(output).decode('ascii')
|
|
|
1473
1587
|
pyodide.FS.unlink(inputPath);
|
|
1474
1588
|
} catch {
|
|
1475
1589
|
}
|
|
1476
|
-
const
|
|
1477
|
-
|
|
1478
|
-
for (let i = 0; i < binary.length; i++) {
|
|
1479
|
-
bytes[i] = binary.charCodeAt(i);
|
|
1480
|
-
}
|
|
1481
|
-
return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
|
|
1590
|
+
const bytes = base64ToUint8Array2(result);
|
|
1591
|
+
return new Blob([bytes], { type: "application/pdf" });
|
|
1482
1592
|
}
|
|
1483
1593
|
/**
|
|
1484
1594
|
* Repair a PDF by re-opening and re-saving with garbage collection and compression.
|
|
@@ -1510,12 +1620,8 @@ base64.b64encode(output).decode('ascii')
|
|
|
1510
1620
|
pyodide.FS.unlink(inputPath);
|
|
1511
1621
|
} catch {
|
|
1512
1622
|
}
|
|
1513
|
-
const
|
|
1514
|
-
|
|
1515
|
-
for (let i = 0; i < binary.length; i++) {
|
|
1516
|
-
bytes[i] = binary.charCodeAt(i);
|
|
1517
|
-
}
|
|
1518
|
-
return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
|
|
1623
|
+
const bytes = base64ToUint8Array2(result);
|
|
1624
|
+
return new Blob([bytes], { type: "application/pdf" });
|
|
1519
1625
|
}
|
|
1520
1626
|
async xpsToPdf(xps) {
|
|
1521
1627
|
return this.convertToPdf(xps, { filetype: "xps" });
|
|
@@ -1559,12 +1665,8 @@ output = _multi_img_pdf.tobytes(garbage=3, deflate=True)
|
|
|
1559
1665
|
_multi_img_pdf.close()
|
|
1560
1666
|
base64.b64encode(output).decode('ascii')
|
|
1561
1667
|
`);
|
|
1562
|
-
const
|
|
1563
|
-
|
|
1564
|
-
for (let i = 0; i < binary.length; i++) {
|
|
1565
|
-
bytes[i] = binary.charCodeAt(i);
|
|
1566
|
-
}
|
|
1567
|
-
return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
|
|
1668
|
+
const bytes = base64ToUint8Array2(result);
|
|
1669
|
+
return new Blob([bytes], { type: "application/pdf" });
|
|
1568
1670
|
}
|
|
1569
1671
|
async pdfToImages(pdf, options) {
|
|
1570
1672
|
const pyodide = await this.getPyodide();
|
|
@@ -1584,11 +1686,7 @@ mat = pymupdf.Matrix(${zoom}, ${zoom})
|
|
|
1584
1686
|
pix = page.get_pixmap(matrix=mat)
|
|
1585
1687
|
base64.b64encode(pix.tobytes("${format}")).decode('ascii')
|
|
1586
1688
|
`);
|
|
1587
|
-
const
|
|
1588
|
-
const bytes = new Uint8Array(binary.length);
|
|
1589
|
-
for (let i = 0; i < binary.length; i++) {
|
|
1590
|
-
bytes[i] = binary.charCodeAt(i);
|
|
1591
|
-
}
|
|
1689
|
+
const bytes = base64ToUint8Array2(result);
|
|
1592
1690
|
results.push(bytes);
|
|
1593
1691
|
}
|
|
1594
1692
|
doc.close();
|
|
@@ -1694,17 +1792,26 @@ doc.close()
|
|
|
1694
1792
|
|
|
1695
1793
|
base64.b64encode(pdf_bytes).decode('ascii')
|
|
1696
1794
|
`);
|
|
1697
|
-
const
|
|
1698
|
-
const bytes = new Uint8Array(binaryStr.length);
|
|
1699
|
-
for (let i = 0; i < binaryStr.length; i++) {
|
|
1700
|
-
bytes[i] = binaryStr.charCodeAt(i);
|
|
1701
|
-
}
|
|
1795
|
+
const bytes = base64ToUint8Array2(result);
|
|
1702
1796
|
return new Blob([bytes], { type: "application/pdf" });
|
|
1703
1797
|
}
|
|
1704
1798
|
async htmlToPdf(html, options) {
|
|
1705
1799
|
const pyodide = await this.getPyodide();
|
|
1706
|
-
const
|
|
1707
|
-
const
|
|
1800
|
+
const encoder = new TextEncoder();
|
|
1801
|
+
const htmlBase64 = uint8ArrayToBase642(encoder.encode(html));
|
|
1802
|
+
const cssBase64 = options?.css ? uint8ArrayToBase642(encoder.encode(options.css)) : "";
|
|
1803
|
+
const attachmentsList = [];
|
|
1804
|
+
if (options?.attachments) {
|
|
1805
|
+
for (const att of options.attachments) {
|
|
1806
|
+
if (att.content && att.content.length > 0) {
|
|
1807
|
+
attachmentsList.push({
|
|
1808
|
+
name: att.filename,
|
|
1809
|
+
data: uint8ArrayToBase642(att.content)
|
|
1810
|
+
});
|
|
1811
|
+
}
|
|
1812
|
+
}
|
|
1813
|
+
}
|
|
1814
|
+
pyodide.globals.set("attachments_json", JSON.stringify(attachmentsList));
|
|
1708
1815
|
const pageSize = options?.pageSize ?? "a4";
|
|
1709
1816
|
let margins = { top: 36, right: 36, bottom: 36, left: 36 };
|
|
1710
1817
|
if (typeof options?.margins === "number") {
|
|
@@ -1715,73 +1822,91 @@ base64.b64encode(pdf_bytes).decode('ascii')
|
|
|
1715
1822
|
const result = pyodide.runPython(`
|
|
1716
1823
|
import base64
|
|
1717
1824
|
import io
|
|
1718
|
-
import re
|
|
1719
1825
|
import json
|
|
1826
|
+
import re
|
|
1720
1827
|
|
|
1721
|
-
html_content =
|
|
1722
|
-
css_content = ''
|
|
1723
|
-
|
|
1724
|
-
# Extract links from HTML before processing
|
|
1725
|
-
link_pattern = r'<a[^>]*href=["\\'](https?://[^"\\'>]+)["\\'"][^>]*>([^<]+)</a>'
|
|
1726
|
-
links = re.findall(link_pattern, html_content, re.IGNORECASE)
|
|
1727
|
-
# links is a list of (url, text) tuples
|
|
1828
|
+
html_content = base64.b64decode("${htmlBase64}").decode('utf-8')
|
|
1829
|
+
css_content = base64.b64decode("${cssBase64}").decode('utf-8') if "${cssBase64}" else ""
|
|
1728
1830
|
|
|
1831
|
+
# Clean up external resources that Story can't load
|
|
1729
1832
|
html_content = re.sub(r'<link[^>]*stylesheet[^>]*>', '', html_content, flags=re.IGNORECASE)
|
|
1730
1833
|
html_content = re.sub(r'<link[^>]*href=[^>]*>', '', html_content, flags=re.IGNORECASE)
|
|
1731
|
-
html_content = re.sub(r'<script[^>]
|
|
1732
|
-
|
|
1834
|
+
html_content = re.sub(r'<script[^>]*>.*?<\/script>', '', html_content, flags=re.IGNORECASE|re.DOTALL)
|
|
1835
|
+
|
|
1836
|
+
if css_content:
|
|
1837
|
+
if '<head>' in html_content:
|
|
1838
|
+
html_content = html_content.replace('<head>', '<head><style>' + css_content + '</style>')
|
|
1839
|
+
else:
|
|
1840
|
+
html_content = '<style>' + css_content + '</style>' + html_content
|
|
1733
1841
|
|
|
1734
1842
|
mediabox = pymupdf.paper_rect("${pageSize}")
|
|
1735
1843
|
where = mediabox + (${margins.left}, ${margins.top}, -${margins.right}, -${margins.bottom})
|
|
1736
1844
|
|
|
1737
|
-
|
|
1845
|
+
doc = pymupdf.open()
|
|
1846
|
+
|
|
1847
|
+
story = pymupdf.Story(html=html_content)
|
|
1738
1848
|
|
|
1739
1849
|
buffer = io.BytesIO()
|
|
1740
1850
|
writer = pymupdf.DocumentWriter(buffer)
|
|
1741
1851
|
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1852
|
+
more_pages = True
|
|
1853
|
+
page_num = 0
|
|
1854
|
+
while more_pages:
|
|
1855
|
+
dev = writer.begin_page(mediabox)
|
|
1856
|
+
more_content, filled = story.place(where)
|
|
1857
|
+
story.draw(dev)
|
|
1858
|
+
writer.end_page()
|
|
1859
|
+
more_pages = more_content
|
|
1860
|
+
page_num += 1
|
|
1746
1861
|
|
|
1747
|
-
story.write(writer, rectfn)
|
|
1748
1862
|
writer.close()
|
|
1749
1863
|
|
|
1750
|
-
# Now open the PDF and add link annotations
|
|
1751
1864
|
buffer.seek(0)
|
|
1752
1865
|
doc = pymupdf.open("pdf", buffer.read())
|
|
1753
1866
|
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1867
|
+
link_pattern = re.compile(r'<a[^>]+href=["\\'](https?://[^"\\'>]+|mailto:[^"\\'>]+)["\\'][^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
|
|
1868
|
+
found_links = link_pattern.findall(html_content)
|
|
1869
|
+
|
|
1870
|
+
for page in doc:
|
|
1871
|
+
for link_uri, anchor_text in found_links:
|
|
1872
|
+
clean_text = re.sub(r'<[^>]+>', '', anchor_text)
|
|
1873
|
+
clean_text = ' '.join(clean_text.split())
|
|
1874
|
+
|
|
1875
|
+
if len(clean_text) > 3:
|
|
1876
|
+
text_instances = page.search_for(clean_text)
|
|
1877
|
+
for inst in text_instances:
|
|
1878
|
+
try:
|
|
1879
|
+
link_dict = {
|
|
1880
|
+
'kind': pymupdf.LINK_URI,
|
|
1881
|
+
'from': inst,
|
|
1882
|
+
'uri': link_uri
|
|
1883
|
+
}
|
|
1884
|
+
page.insert_link(link_dict)
|
|
1885
|
+
except Exception as e:
|
|
1886
|
+
pass
|
|
1887
|
+
|
|
1888
|
+
att_json = attachments_json
|
|
1889
|
+
if att_json:
|
|
1890
|
+
try:
|
|
1891
|
+
atts = json.loads(att_json)
|
|
1892
|
+
for att in atts:
|
|
1893
|
+
name = att.get('name', 'unnamed')
|
|
1894
|
+
data = base64.b64decode(att.get('data', ''))
|
|
1895
|
+
if data:
|
|
1896
|
+
doc.embfile_add(name, data)
|
|
1897
|
+
except:
|
|
1898
|
+
pass
|
|
1899
|
+
|
|
1900
|
+
final_pdf = doc.tobytes(garbage=3, deflate=True)
|
|
1775
1901
|
doc.close()
|
|
1776
1902
|
|
|
1777
|
-
|
|
1778
|
-
base64.b64encode(pdf_bytes).decode('ascii')
|
|
1903
|
+
base64.b64encode(final_pdf).decode('ascii')
|
|
1779
1904
|
`);
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
bytes[i] = binaryStr.charCodeAt(i);
|
|
1905
|
+
try {
|
|
1906
|
+
pyodide.globals.delete("attachments_json");
|
|
1907
|
+
} catch {
|
|
1784
1908
|
}
|
|
1909
|
+
const bytes = base64ToUint8Array2(result);
|
|
1785
1910
|
return new Blob([bytes], { type: "application/pdf" });
|
|
1786
1911
|
}
|
|
1787
1912
|
async pdfToMarkdown(pdf, options) {
|
|
@@ -2034,12 +2159,100 @@ base64.b64encode(pdf_bytes).decode('ascii')
|
|
|
2034
2159
|
pyodide.FS.unlink(inputPath);
|
|
2035
2160
|
} catch {
|
|
2036
2161
|
}
|
|
2037
|
-
const
|
|
2162
|
+
const bytes = base64ToUint8Array2(result);
|
|
2163
|
+
return new Blob([bytes], { type: "application/pdf" });
|
|
2164
|
+
}
|
|
2165
|
+
async deskewPdf(pdf, options) {
|
|
2166
|
+
const pyodide = await this.getPyodide();
|
|
2167
|
+
const docId = ++this.docCounter;
|
|
2168
|
+
const inputPath = `/deskew_input_${docId}`;
|
|
2169
|
+
const threshold = options?.threshold ?? 0.5;
|
|
2170
|
+
const dpi = options?.dpi ?? 150;
|
|
2171
|
+
const maxAngle = options?.maxAngle ?? 45;
|
|
2172
|
+
const pages = options?.pages;
|
|
2173
|
+
const buf = await pdf.arrayBuffer();
|
|
2174
|
+
pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
|
|
2175
|
+
const pagesArg = pages ? `[${pages.join(", ")}]` : "None";
|
|
2176
|
+
const result = pyodide.runPython(`
|
|
2177
|
+
import base64
|
|
2178
|
+
import json
|
|
2179
|
+
|
|
2180
|
+
src_doc = pymupdf.open("${inputPath}")
|
|
2181
|
+
src_doc = repair_pdf(src_doc)
|
|
2182
|
+
out_doc = pymupdf.open()
|
|
2183
|
+
|
|
2184
|
+
zoom = ${dpi} / 72.0
|
|
2185
|
+
mat = pymupdf.Matrix(zoom, zoom)
|
|
2186
|
+
|
|
2187
|
+
page_indices = ${pagesArg} if ${pagesArg} is not None else range(src_doc.page_count)
|
|
2188
|
+
angles = []
|
|
2189
|
+
corrected = []
|
|
2190
|
+
|
|
2191
|
+
for page_idx in page_indices:
|
|
2192
|
+
if page_idx < 0 or page_idx >= src_doc.page_count:
|
|
2193
|
+
continue
|
|
2194
|
+
|
|
2195
|
+
page = src_doc[page_idx]
|
|
2196
|
+
orig_rect = page.rect
|
|
2197
|
+
|
|
2198
|
+
pix = page.get_pixmap(matrix=mat, alpha=False)
|
|
2199
|
+
|
|
2200
|
+
img_data = pix.samples
|
|
2201
|
+
img_array = np.frombuffer(img_data, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
|
|
2202
|
+
|
|
2203
|
+
if pix.n == 4:
|
|
2204
|
+
img_array = cv2.cvtColor(img_array, cv2.COLOR_RGBA2RGB)
|
|
2205
|
+
|
|
2206
|
+
angle = detect_skew_angle(img_array)
|
|
2207
|
+
angles.append(float(angle))
|
|
2208
|
+
|
|
2209
|
+
should_correct = abs(angle) >= ${threshold} and abs(angle) <= ${maxAngle}
|
|
2210
|
+
corrected.append(should_correct)
|
|
2211
|
+
|
|
2212
|
+
if should_correct:
|
|
2213
|
+
corrected_img = deskew_image(img_array, angle)
|
|
2214
|
+
|
|
2215
|
+
success, img_bytes = cv2.imencode('.png', cv2.cvtColor(corrected_img, cv2.COLOR_RGB2BGR))
|
|
2216
|
+
if not success:
|
|
2217
|
+
raise ValueError(f"Failed to encode corrected image for page {page_idx}")
|
|
2218
|
+
img_bytes = img_bytes.tobytes()
|
|
2219
|
+
else:
|
|
2220
|
+
img_bytes = pix.tobytes("png")
|
|
2221
|
+
|
|
2222
|
+
new_page = out_doc.new_page(width=orig_rect.width, height=orig_rect.height)
|
|
2223
|
+
|
|
2224
|
+
new_page.insert_image(new_page.rect, stream=img_bytes)
|
|
2225
|
+
|
|
2226
|
+
src_doc.close()
|
|
2227
|
+
total_pages = len(angles)
|
|
2228
|
+
corrected_count = sum(1 for c in corrected if c)
|
|
2229
|
+
pdf_bytes = out_doc.tobytes(garbage=3, deflate=True)
|
|
2230
|
+
out_doc.close()
|
|
2231
|
+
|
|
2232
|
+
result_json = json.dumps({
|
|
2233
|
+
"totalPages": total_pages,
|
|
2234
|
+
"correctedPages": corrected_count,
|
|
2235
|
+
"angles": angles,
|
|
2236
|
+
"corrected": corrected
|
|
2237
|
+
})
|
|
2238
|
+
|
|
2239
|
+
(base64.b64encode(pdf_bytes).decode('ascii'), result_json)
|
|
2240
|
+
`);
|
|
2241
|
+
try {
|
|
2242
|
+
pyodide.FS.unlink(inputPath);
|
|
2243
|
+
} catch {
|
|
2244
|
+
}
|
|
2245
|
+
const [pdfBase64, resultJson] = result;
|
|
2246
|
+
const binary = atob(pdfBase64);
|
|
2038
2247
|
const bytes = new Uint8Array(binary.length);
|
|
2039
2248
|
for (let i = 0; i < binary.length; i++) {
|
|
2040
2249
|
bytes[i] = binary.charCodeAt(i);
|
|
2041
2250
|
}
|
|
2042
|
-
|
|
2251
|
+
const deskewResult = JSON.parse(resultJson);
|
|
2252
|
+
return {
|
|
2253
|
+
pdf: new Blob([bytes], { type: "application/pdf" }),
|
|
2254
|
+
result: deskewResult
|
|
2255
|
+
};
|
|
2043
2256
|
}
|
|
2044
2257
|
/**
|
|
2045
2258
|
* Compress a PDF using multiple optimization techniques.
|
package/package.json
CHANGED
package/types/index.d.ts
CHANGED
|
@@ -342,4 +342,23 @@ export declare class PyMuPDF {
|
|
|
342
342
|
savingsPercent: number;
|
|
343
343
|
pageCount: number;
|
|
344
344
|
}>;
|
|
345
|
+
|
|
346
|
+
deskewPdf(pdf: Blob | File, options?: DeskewOptions): Promise<{
|
|
347
|
+
pdf: Blob;
|
|
348
|
+
result: DeskewResult;
|
|
349
|
+
}>;
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
export interface DeskewOptions {
|
|
353
|
+
threshold?: number;
|
|
354
|
+
dpi?: number;
|
|
355
|
+
maxAngle?: number;
|
|
356
|
+
pages?: number[];
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
export interface DeskewResult {
|
|
360
|
+
totalPages: number;
|
|
361
|
+
correctedPages: number;
|
|
362
|
+
angles: number[];
|
|
363
|
+
corrected: boolean[];
|
|
345
364
|
}
|