@bentopdf/pymupdf-wasm 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +661 -0
- package/README.md +216 -0
- package/assets/fonttools-4.56.0-py3-none-any.whl +0 -0
- package/assets/lxml-5.4.0-cp313-cp313-pyodide_2025_0_wasm32.whl +0 -0
- package/assets/numpy-2.2.5-cp313-cp313-pyodide_2025_0_wasm32.whl +0 -0
- package/assets/opencv_python-4.11.0.86-cp313-cp313-pyodide_2025_0_wasm32.whl +0 -0
- package/assets/pdf2docx-0.5.8-py3-none-any.whl +0 -0
- package/assets/pymupdf-1.26.3-cp313-none-pyodide_2025_0_wasm32.whl +0 -0
- package/assets/pymupdf4llm-0.0.27-py3-none-any.whl +0 -0
- package/assets/pyodide-lock.json +1 -0
- package/assets/pyodide.asm.js +15 -0
- package/assets/pyodide.asm.wasm +0 -0
- package/assets/pyodide.js +4 -0
- package/assets/python_docx-1.2.0-py3-none-any.whl +0 -0
- package/assets/python_stdlib.zip +0 -0
- package/assets/typing_extensions-4.12.2-py3-none-any.whl +0 -0
- package/dist/index.js +2091 -0
- package/package.json +48 -0
- package/types/index.d.ts +269 -0
package/dist/index.js
ADDED
@@ -0,0 +1,2091 @@
// src/page.ts
function uint8ArrayToBase64(bytes) {
  let binary = "";
  const chunkSize = 32768;
  for (let i = 0; i < bytes.length; i += chunkSize) {
    const chunk = bytes.subarray(i, Math.min(i + chunkSize, bytes.length));
    binary += String.fromCharCode.apply(null, Array.from(chunk));
  }
  return btoa(binary);
}
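// Reviewer note (not part of the published bundle): the 32 KiB chunking above
// guards against the argument-count/stack-overflow failures that
// String.fromCharCode.apply(null, arr) can hit on large arrays, since each
// byte becomes a separate call argument.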
var PyMuPDFPage = class {
  constructor(runPython, docVar, pageNumber) {
    this.runPython = runPython;
    this.docVar = docVar;
    this.pageNumber = pageNumber;
  }
  get rect() {
    const result = this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
r = page.rect
[r.x0, r.y0, r.x1, r.y1]
`);
    return { x0: result[0], y0: result[1], x1: result[2], y1: result[3] };
  }
  get width() {
    return this.runPython(`${this.docVar}[${this.pageNumber}].rect.width`);
  }
  get height() {
    return this.runPython(`${this.docVar}[${this.pageNumber}].rect.height`);
  }
  get rotation() {
    return this.runPython(`${this.docVar}[${this.pageNumber}].rotation`);
  }
  setRotation(angle) {
    this.runPython(`${this.docVar}[${this.pageNumber}].set_rotation(${angle})`);
  }
  getText(format = "text") {
    if (format === "text") {
      return this.runPython(`${this.docVar}[${this.pageNumber}].get_text()`);
    }
    const result = this.runPython(`
import json
page = ${this.docVar}[${this.pageNumber}]
json.dumps(page.get_text("${format}"))
`);
    return JSON.parse(result);
  }
  searchFor(text, quads = false) {
    const result = this.runPython(`
import json
page = ${this.docVar}[${this.pageNumber}]
rects = page.search_for("${text.replace(/"/g, '\\"')}", quads=${quads ? "True" : "False"})
json.dumps([[r.x0, r.y0, r.x1, r.y1] for r in rects])
`);
    return JSON.parse(result).map((r) => ({
      x0: r[0],
      y0: r[1],
      x1: r[2],
      y1: r[3]
    }));
  }
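  // Reviewer note: with quads=true, PyMuPDF's search_for() returns Quad
  // objects rather than Rects; Quads expose corner points (ul/ur/ll/lr), so
  // the [r.x0, ...] serialization above appears to assume quads=false and
  // would likely need r.rect.x0 etc. for the quad case.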
  insertText(point, text, options) {
    const fontsize = options?.fontsize ?? 11;
    const fontname = options?.fontname ?? "helv";
    const color = options?.color ? `(${options.color.r}, ${options.color.g}, ${options.color.b})` : "(0, 0, 0)";
    const rotate = options?.rotate ?? 0;
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
page.insert_text(
    (${point.x}, ${point.y}),
    """${text.replace(/"""/g, '\\"\\"\\"')}""",
    fontsize=${fontsize},
    fontname="${fontname}",
    color=${color},
    rotate=${rotate}
)
`);
  }
  getImages() {
    const result = this.runPython(`
import json
page = ${this.docVar}[${this.pageNumber}]
images = page.get_images()
json.dumps([{
    'xref': img[0],
    'width': img[2],
    'height': img[3],
    'bpc': img[4],
    'colorspace': img[5],
    'size': img[6] if len(img) > 6 else 0,
    'name': img[7] if len(img) > 7 else ''
} for img in images])
`);
    return JSON.parse(result);
  }
  extractImage(xref) {
    const result = this.runPython(`
import json
import base64
img = ${this.docVar}.extract_image(${xref})
_result = 'null'
if img:
    _result = json.dumps({
        'xref': ${xref},
        'width': img['width'],
        'height': img['height'],
        'bpc': img.get('bpc', 8),
        'colorspace': img.get('colorspace', 'rgb'),
        'size': len(img['image']),
        'ext': img['ext'],
        'data': base64.b64encode(img['image']).decode('ascii')
    })
_result
`);
    if (result === "null") return null;
    const parsed = JSON.parse(result);
    const binary = atob(parsed.data);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return { ...parsed, data: bytes };
  }
  insertImage(rect, imageData, options) {
    const overlay = options?.overlay ?? true;
    const keepProportion = options?.keepProportion ?? true;
    const oc = options?.oc;
    const base64Image = uint8ArrayToBase64(imageData);
    const ocParam = oc !== void 0 ? `, oc=${oc}` : "";
    return this.runPython(`
import base64
img_data = base64.b64decode("${base64Image}")
with open("/tmp_insert_img", "wb") as f:
    f.write(img_data)
page = ${this.docVar}[${this.pageNumber}]
page.insert_image(
    pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}),
    filename="/tmp_insert_img",
    overlay=${overlay ? "True" : "False"},
    keep_proportion=${keepProportion ? "True" : "False"}${ocParam}
)
`);
  }
  getAnnotations() {
    const result = this.runPython(`
import json
page = ${this.docVar}[${this.pageNumber}]
annots = []
for annot in page.annots():
    r = annot.rect
    c = annot.colors.get('stroke', (0, 0, 0)) or (0, 0, 0)
    annots.append({
        'type': annot.type[1],
        'rect': {'x0': r.x0, 'y0': r.y0, 'x1': r.x1, 'y1': r.y1},
        'content': annot.info.get('content', ''),
        'author': annot.info.get('title', ''),
        'color': {'r': c[0], 'g': c[1], 'b': c[2]} if c else None
    })
json.dumps(annots)
`);
    return JSON.parse(result);
  }
  addHighlight(rect, color) {
    const colorStr = color ? `(${color.r}, ${color.g}, ${color.b})` : "(1, 1, 0)";
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
annot = page.add_highlight_annot(pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}))
annot.set_colors(stroke=${colorStr})
annot.update()
`);
  }
  addTextAnnotation(point, text, icon) {
    const iconStr = icon ?? "Note";
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
annot = page.add_text_annot((${point.x}, ${point.y}), """${text.replace(/"""/g, '\\"\\"\\"')}""", icon="${iconStr}")
annot.update()
`);
  }
  addRectAnnotation(rect, color, fill) {
    const strokeColor = color ? `(${color.r}, ${color.g}, ${color.b})` : "(1, 0, 0)";
    const fillColor = fill ? `(${fill.r}, ${fill.g}, ${fill.b})` : "None";
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
annot = page.add_rect_annot(pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}))
annot.set_colors(stroke=${strokeColor}, fill=${fillColor})
annot.update()
`);
  }
  deleteAnnotations() {
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
for annot in list(page.annots()):
    page.delete_annot(annot)
`);
  }
  getLinks() {
    const result = this.runPython(`
import json
page = ${this.docVar}[${this.pageNumber}]
links = page.get_links()
json.dumps([{
    'rect': {'x0': l['from'].x0, 'y0': l['from'].y0, 'x1': l['from'].x1, 'y1': l['from'].y1},
    'uri': l.get('uri'),
    'page': l.get('page'),
    'dest': {'x': l['to'].x, 'y': l['to'].y} if l.get('to') else None
} for l in links])
`);
    return JSON.parse(result);
  }
  insertLink(rect, uri) {
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
page.insert_link({
    'kind': pymupdf.LINK_URI,
    'from': pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}),
    'uri': "${uri}"
})
`);
  }
  async toImage(options) {
    const dpi = options?.dpi ?? 150;
    const zoom = dpi / 72;
    const alpha = options?.alpha ?? false;
    const rotation = options?.rotation ?? 0;
    let clipStr = "None";
    if (options?.clip) {
      const c = options.clip;
      clipStr = `pymupdf.Rect(${c.x0}, ${c.y0}, ${c.x1}, ${c.y1})`;
    }
    const result = this.runPython(`
import base64
page = ${this.docVar}[${this.pageNumber}]
mat = pymupdf.Matrix(${zoom}, ${zoom}).prerotate(${rotation})
pix = page.get_pixmap(matrix=mat, alpha=${alpha ? "True" : "False"}, clip=${clipStr})
base64.b64encode(pix.tobytes("png")).decode('ascii')
`);
    const binary = atob(result);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return bytes;
  }
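  // Reviewer note: zoom = dpi / 72 because PDF user space is defined at 72
  // units per inch, so Matrix(zoom, zoom) scales the page to the requested
  // dpi before rasterizing; e.g. dpi=150 renders at about 2.08x.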
  toSvg() {
    return this.runPython(`${this.docVar}[${this.pageNumber}].get_svg_image()`);
  }
  addRedaction(rect, text, fill) {
    const fillColor = fill ? `(${fill.r}, ${fill.g}, ${fill.b})` : "(0, 0, 0)";
    const replaceText = text ?? "";
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
page.add_redact_annot(
    pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}),
    text="${replaceText}",
    fill=${fillColor}
)
`);
  }
  applyRedactions() {
    this.runPython(`${this.docVar}[${this.pageNumber}].apply_redactions()`);
  }
  drawLine(from, to, color, width) {
    const colorStr = color ? `(${color.r}, ${color.g}, ${color.b})` : "(0, 0, 0)";
    const lineWidth = width ?? 1;
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
shape = page.new_shape()
shape.draw_line((${from.x}, ${from.y}), (${to.x}, ${to.y}))
shape.finish(color=${colorStr}, width=${lineWidth})
shape.commit()
`);
  }
  drawRect(rect, color, fill, width) {
    const strokeColor = color ? `(${color.r}, ${color.g}, ${color.b})` : "(0, 0, 0)";
    const fillColor = fill ? `(${fill.r}, ${fill.g}, ${fill.b})` : "None";
    const lineWidth = width ?? 1;
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
shape = page.new_shape()
shape.draw_rect(pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}))
shape.finish(color=${strokeColor}, fill=${fillColor}, width=${lineWidth})
shape.commit()
`);
  }
  drawCircle(center, radius, color, fill) {
    const strokeColor = color ? `(${color.r}, ${color.g}, ${color.b})` : "(0, 0, 0)";
    const fillColor = fill ? `(${fill.r}, ${fill.g}, ${fill.b})` : "None";
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
shape = page.new_shape()
shape.draw_circle((${center.x}, ${center.y}), ${radius})
shape.finish(color=${strokeColor}, fill=${fillColor})
shape.commit()
`);
  }
  findTables(options) {
    let optionsStr = "";
    if (options?.clip) {
      const c = options.clip;
      optionsStr += `clip=pymupdf.Rect(${c.x0}, ${c.y0}, ${c.x1}, ${c.y1}), `;
    }
    if (options?.strategy) {
      optionsStr += `strategy="${options.strategy}", `;
    }
    if (options?.verticalStrategy) {
      optionsStr += `vertical_strategy="${options.verticalStrategy}", `;
    }
    if (options?.horizontalStrategy) {
      optionsStr += `horizontal_strategy="${options.horizontalStrategy}", `;
    }
    if (options?.addLines && options.addLines.length > 0) {
      const linesStr = options.addLines.map((l) => `(${l.join(",")})`).join(",");
      optionsStr += `add_lines=[${linesStr}], `;
    }
    const result = this.runPython(`
import json

page = ${this.docVar}[${this.pageNumber}]
tables = page.find_tables(${optionsStr})

result = []
for table in tables.tables:
    bbox = table.bbox
    header = table.header
    header_data = None
    if header:
        header_bbox = header.bbox
        header_data = {
            'names': list(header.names),
            'cells': [
                {'x0': c[0], 'y0': c[1], 'x1': c[2], 'y1': c[3]} if c else None
                for c in header.cells
            ],
            'bbox': {'x0': header_bbox[0], 'y0': header_bbox[1], 'x1': header_bbox[2], 'y1': header_bbox[3]} if header_bbox else None,
            'external': header.external
        }

    rows = table.extract()
    markdown = table.to_markdown()

    result.append({
        'bbox': {'x0': bbox[0], 'y0': bbox[1], 'x1': bbox[2], 'y1': bbox[3]},
        'rowCount': table.row_count,
        'colCount': table.col_count,
        'header': header_data,
        'rows': rows,
        'markdown': markdown
    })

json.dumps(result)
`);
    return JSON.parse(result);
  }
  tablesToMarkdown(options) {
    const tables = this.findTables(options);
    return tables.map((t) => t.markdown);
  }
};
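// A minimal usage sketch (hypothetical caller, not in the bundle): every
// PyMuPDFPage method is a thin proxy that interpolates docVar/pageNumber into
// a Python snippet and evaluates it in Pyodide, e.g.:
//   const page = doc.getPage(0);
//   console.log(page.width, page.height);         // runs page.rect.* in Python
//   const hits = page.searchFor("invoice");       // rects in PDF points
//   const png = await page.toImage({ dpi: 300 }); // Uint8Array of PNG bytes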

// src/document.ts
var PyMuPDFDocument = class {
  constructor(pyodide, docVar, inputPath) {
    this.closed = false;
    this.pyodide = pyodide;
    this.docVar = docVar;
    this.inputPath = inputPath;
  }
  runPython(code) {
    return this.pyodide.runPython(code);
  }
  ensureOpen() {
    if (this.closed) {
      throw new Error("Document has been closed");
    }
  }
  get pageCount() {
    this.ensureOpen();
    return this.runPython(`${this.docVar}.page_count`);
  }
  get isPdf() {
    this.ensureOpen();
    return this.runPython(`${this.docVar}.is_pdf`);
  }
  get isEncrypted() {
    this.ensureOpen();
    return this.runPython(`${this.docVar}.is_encrypted`);
  }
  get needsPass() {
    this.ensureOpen();
    return this.runPython(`${this.docVar}.needs_pass`);
  }
  get metadata() {
    this.ensureOpen();
    const result = this.runPython(`
import json
m = ${this.docVar}.metadata
json.dumps(m if m else {})
`);
    return JSON.parse(result);
  }
  setMetadata(metadata) {
    this.ensureOpen();
    const metaJson = JSON.stringify(metadata);
    this.runPython(`${this.docVar}.set_metadata(${metaJson})`);
  }
  getPage(index) {
    this.ensureOpen();
    if (index < 0 || index >= this.pageCount) {
      throw new Error(`Page index ${index} out of range (0-${this.pageCount - 1})`);
    }
    return new PyMuPDFPage(
      (code) => this.runPython(code),
      this.docVar,
      index
    );
  }
  *pages() {
    this.ensureOpen();
    const count = this.pageCount;
    for (let i = 0; i < count; i++) {
      yield this.getPage(i);
    }
  }
  deletePage(index) {
    this.ensureOpen();
    this.runPython(`${this.docVar}.delete_page(${index})`);
  }
  deletePages(indices) {
    this.ensureOpen();
    const sorted = [...indices].sort((a, b) => b - a);
    for (const i of sorted) {
      this.runPython(`${this.docVar}.delete_page(${i})`);
    }
  }
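  // Reviewer note: deletePages sorts indices in descending order so each
  // deletion only shifts pages after the one removed; deleting in ascending
  // order would invalidate the remaining indices.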
  insertBlankPage(index, width, height) {
    this.ensureOpen();
    const w = width ?? 595;
    const h = height ?? 842;
    this.runPython(`${this.docVar}.insert_page(${index}, width=${w}, height=${h})`);
    return this.getPage(index);
  }
  movePage(from, to) {
    this.ensureOpen();
    this.runPython(`${this.docVar}.move_page(${from}, ${to})`);
  }
  copyPage(from, to) {
    this.ensureOpen();
    this.runPython(`${this.docVar}.copy_page(${from}, ${to})`);
  }
  selectPages(indices) {
    this.ensureOpen();
    this.runPython(`${this.docVar}.select([${indices.join(", ")}])`);
  }
  insertPdf(sourceDoc, options) {
    this.ensureOpen();
    const fromPage = options?.fromPage ?? 0;
    const toPage = options?.toPage ?? -1;
    const startAt = options?.startAt ?? -1;
    const rotate = options?.rotate ?? 0;
    this.runPython(`
${this.docVar}.insert_pdf(
    ${sourceDoc.docVar},
    from_page=${fromPage},
    to_page=${toPage},
    start_at=${startAt},
    rotate=${rotate}
)
`);
  }
  convertToPdf() {
    this.ensureOpen();
    const result = this.runPython(`
import base64
pdf_bytes = ${this.docVar}.convert_to_pdf()
base64.b64encode(pdf_bytes).decode('ascii')
`);
    const binary = atob(result);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return bytes;
  }
  searchText(query) {
    this.ensureOpen();
    const results = [];
    for (let i = 0; i < this.pageCount; i++) {
      const page = this.getPage(i);
      const rects = page.searchFor(query);
      for (const rect of rects) {
        results.push({ page: i, rect, text: query });
      }
    }
    return results;
  }
  getToc() {
    this.ensureOpen();
    const result = this.runPython(`
import json
toc = ${this.docVar}.get_toc()
json.dumps([{
    'level': entry[0],
    'title': entry[1],
    'page': entry[2],
    'dest': {'x': entry[3].x, 'y': entry[3].y} if len(entry) > 3 and entry[3] else None
} for entry in toc])
`);
    return JSON.parse(result);
  }
  setToc(toc) {
    this.ensureOpen();
    const tocData = toc.map((e) => [e.level, e.title, e.page]);
    this.runPython(`${this.docVar}.set_toc(${JSON.stringify(tocData)})`);
  }
  get isFormPdf() {
    this.ensureOpen();
    return this.runPython(`${this.docVar}.is_form_pdf`);
  }
  getFormFields() {
    this.ensureOpen();
    const result = this.runPython(`
import json
fields = []
for page in ${this.docVar}:
    for widget in page.widgets():
        r = widget.rect
        fields.append({
            'name': widget.field_name,
            'type': widget.field_type_string.lower(),
            'value': widget.field_value,
            'rect': {'x0': r.x0, 'y0': r.y0, 'x1': r.x1, 'y1': r.y1},
            'readonly': widget.field_flags & 1 != 0
        })
json.dumps(fields)
`);
    return JSON.parse(result);
  }
  setFormField(name, value) {
    this.ensureOpen();
    const valueStr = typeof value === "boolean" ? value ? "True" : "False" : `"${String(value).replace(/"/g, '\\"')}"`;
    this.runPython(`
for page in ${this.docVar}:
    for widget in page.widgets():
        if widget.field_name == "${name}":
            widget.field_value = ${valueStr}
            widget.update()
            break
`);
  }
  authenticate(password) {
    this.ensureOpen();
    return this.runPython(`${this.docVar}.authenticate("${password}")`);
  }
  save(options) {
    this.ensureOpen();
    let encryptParams = "";
    if (options?.encryption) {
      const enc = options.encryption;
      const perms = enc.permissions ?? {};
      const permValue = (perms.print !== false ? 4 : 0) | (perms.modify !== false ? 8 : 0) | (perms.copy !== false ? 16 : 0) | (perms.annotate !== false ? 32 : 0);
      encryptParams = `, encryption=pymupdf.PDF_ENCRYPT_AES_256, owner_pw="${enc.ownerPassword}", user_pw="${enc.userPassword ?? ""}", permissions=${permValue}`;
    }
    const garbage = options?.garbage ?? 1;
    const deflate = options?.deflate !== false;
    const clean = options?.clean !== false;
    const result = this.runPython(`
import base64
output = ${this.docVar}.tobytes(garbage=${garbage}, deflate=${deflate ? "True" : "False"}, clean=${clean ? "True" : "False"}${encryptParams})
base64.b64encode(output).decode('ascii')
`);
    const binary = atob(result);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return bytes;
  }
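  // Reviewer note: the 4/8/16/32 values match PyMuPDF's permission flags
  // (PDF_PERM_PRINT=4, PDF_PERM_MODIFY=8, PDF_PERM_COPY=16,
  // PDF_PERM_ANNOTATE=32), i.e. bits 3-6 of the PDF permissions field; each
  // permission is granted unless explicitly set to false.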
  saveAsBlob(options) {
    const bytes = this.save(options);
    return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
  }
  getLayerConfig() {
    this.ensureOpen();
    const result = this.runPython(`
import json
import re

# Get basic layer info from layer_ui_configs
layers = ${this.docVar}.layer_ui_configs()

# Build a map of layer number to layer info
layer_map = {}
xref_to_num = {}

for layer in layers:
    num = layer.get('number', 0)
    layer_map[num] = {
        'number': num,
        'text': layer.get('text', ''),
        'on': layer.get('on', False),
        'locked': layer.get('locked', False),
        'depth': 0,
        'xref': 0,
        'parentXref': 0,
        'displayOrder': 0
    }

# Try to parse the Order array to get hierarchy and xrefs
try:
    catalog_xref = ${this.docVar}.pdf_catalog()

    # Get OCProperties
    t, ocprop_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties")

    ocgs_str = None
    order_str = None

    if t == "dict":
        t_ocg, ocgs_str = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/OCGs")
        t2, order_str = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/D/Order")
    elif t != "null":
        ocprop_match = re.search(r'(\\d+)\\s+\\d+\\s+R', ocprop_val)
        if ocprop_match:
            ocprop_xref = int(ocprop_match.group(1))
            t_ocg, ocgs_str = ${this.docVar}.xref_get_key(ocprop_xref, "OCGs")
            t2, d_val = ${this.docVar}.xref_get_key(ocprop_xref, "D")
            if t2 == "dict":
                t2, order_str = ${this.docVar}.xref_get_key(ocprop_xref, "D/Order")
            elif t2 != "null":
                d_match = re.search(r'(\\d+)\\s+\\d+\\s+R', d_val)
                if d_match:
                    d_xref = int(d_match.group(1))
                    t2, order_str = ${this.docVar}.xref_get_key(d_xref, "Order")

    # Parse OCGs array and build xref -> number mapping by matching OCG names to layer text
    if ocgs_str:
        xref_matches = re.findall(r'(\\d+)\\s+0\\s+R', ocgs_str)
        ocg_xrefs = [int(x) for x in xref_matches]

        # Build a name-to-layer-number map from layer_ui_configs
        name_to_num = {}
        for num, info in layer_map.items():
            name_to_num[info['text']] = num

        # For each OCG xref, look up its Name and match to layer
        for xref in ocg_xrefs:
            # Get the OCG's Name from its dictionary
            t_name, name_val = ${this.docVar}.xref_get_key(xref, "Name")
            if t_name != "null" and name_val:
                # Remove parentheses from PDF string: "(Layer Name)" -> "Layer Name"
                ocg_name = name_val.strip()
                if ocg_name.startswith('(') and ocg_name.endswith(')'):
                    ocg_name = ocg_name[1:-1]

                # Find the layer with this name
                if ocg_name in name_to_num:
                    num = name_to_num[ocg_name]
                    layer_map[num]['xref'] = xref
                    xref_to_num[xref] = num

    # Parse Order array with state machine to get proper hierarchy
    # Format: ParentRef [Child1 Child2] or [OCG1 OCG2] or just OCG
    if order_str:
        display_order = [0]  # Use list for mutable counter

        # Strip outer brackets from Order array - it's always wrapped in []
        inner_order = order_str.strip()
        if inner_order.startswith('[') and inner_order.endswith(']'):
            inner_order = inner_order[1:-1]

        def parse_order_array(order_val, depth=0, parent_xref=0):
            i = 0
            last_xref = 0  # Track last OCG xref at current level

            while i < len(order_val):
                char = order_val[i]

                if char == '[':
                    # Start of nested array - children of last_xref
                    # Find matching closing bracket
                    bracket_depth = 1
                    start = i + 1
                    j = i + 1
                    while j < len(order_val) and bracket_depth > 0:
                        if order_val[j] == '[':
                            bracket_depth += 1
                        elif order_val[j] == ']':
                            bracket_depth -= 1
                        j += 1

                    nested_content = order_val[start:j-1]
                    # Recursively parse with last_xref as parent
                    parse_order_array(nested_content, depth + 1, last_xref)
                    i = j
                elif char == ']':
                    i += 1
                elif char.isdigit():
                    # Parse xref reference
                    ref_match = re.match(r'(\\d+)\\s+0\\s+R', order_val[i:])
                    if ref_match:
                        xref = int(ref_match.group(1))
                        if xref in xref_to_num:
                            num = xref_to_num[xref]
                            layer_map[num]['depth'] = depth
                            layer_map[num]['parentXref'] = parent_xref
                            layer_map[num]['displayOrder'] = display_order[0]
                            display_order[0] += 1
                        last_xref = xref
                        i += len(ref_match.group(0))
                    else:
                        i += 1
                else:
                    i += 1

        parse_order_array(inner_order)

except Exception as e:
    # If parsing fails, continue with basic layer info
    pass

# Convert to list and sort by displayOrder
result_list = sorted(layer_map.values(), key=lambda x: x.get('displayOrder', 0))
json.dumps(result_list)
`);
    return JSON.parse(result);
  }
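  // Reviewer note: PyMuPDF's layer_ui_configs() reports layer text/on/locked
  // but not the underlying OCG xrefs or parent/child nesting, which is why
  // this method re-parses the raw /OCProperties dictionaries via
  // xref_get_key() to recover 'xref', 'parentXref', 'depth' and display order.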
  addOCG(name, options) {
    this.ensureOpen();
    const config = options?.config ?? -1;
    const on = options?.on !== false;
    const intent = options?.intent ?? "View";
    const usage = options?.usage ?? "Artwork";
    return this.runPython(`
${this.docVar}.add_ocg("${name.replace(/"/g, '\\"')}", config=${config}, on=${on ? "True" : "False"}, intent="${intent}", usage="${usage}")
`);
  }
  addOCGWithParent(name, parentXref, options) {
    this.ensureOpen();
    const config = options?.config ?? -1;
    const on = options?.on !== false;
    const intent = options?.intent ?? "View";
    const usage = options?.usage ?? "Artwork";
    return this.runPython(`
import re

# 1. Create the new OCG (automatically added to root of Order array)
child_xref = ${this.docVar}.add_ocg("${name.replace(/"/g, '\\"')}", config=${config}, on=${on ? "True" : "False"}, intent="${intent}", usage="${usage}")

catalog_xref = ${this.docVar}.pdf_catalog()

# 2. Locate OCProperties and Order array
t, ocprop_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties")

order_key_path = None
order_xref = None
order_str = None

if t == "dict":
    # Inline OCProperties
    t2, order_str = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/D/Order")
    order_key_path = "OCProperties/D/Order"
    order_xref = catalog_xref
elif t != "null":
    # Reference to OCProperties
    ocprop_match = re.search(r'(\\d+)\\s+\\d+\\s+R', ocprop_val)
    if ocprop_match:
        ocprop_xref = int(ocprop_match.group(1))
        t2, d_val = ${this.docVar}.xref_get_key(ocprop_xref, "D")

        if t2 == "dict":
            # D is inline
            t2, order_str = ${this.docVar}.xref_get_key(ocprop_xref, "D/Order")
            order_key_path = "D/Order"
            order_xref = ocprop_xref
        elif t2 != "null":
            # D is reference
            d_match = re.search(r'(\\d+)\\s+\\d+\\s+R', d_val)
            if d_match:
                d_xref = int(d_match.group(1))
                t2, order_str = ${this.docVar}.xref_get_key(d_xref, "Order")
                order_key_path = "Order"
                order_xref = d_xref

parent_ref = f"{${parentXref}} 0 R"
child_ref = f"{child_xref} 0 R"

def modify_pdf_order(order_string, p_ref, c_ref):
    if not order_string:
        return order_string

    # --- STEP 1: Remove the Child from Root ---
    # add_ocg usually appends to the end of the root array.
    # We find the child ref that is strictly at depth 1 (root).

    cleaned_order = ""
    depth = 0
    i = 0
    removed = False

    while i < len(order_string):
        char = order_string[i]

        if char == '[':
            depth += 1
            cleaned_order += char
            i += 1
        elif char == ']':
            depth -= 1
            cleaned_order += char
            i += 1
        else:
            # Check if we are looking at the child ref
            # We match strictly "xref 0 R"
            match = None
            if not removed and depth == 1:  # Only remove from root
                chunk = order_string[i:]
                # Check if chunk starts with child_ref followed by non-digit
                if chunk.startswith(c_ref):
                    # verify boundary (next char is space, ], or end)
                    if len(chunk) == len(c_ref) or chunk[len(c_ref)] in ' ]':
                        match = True

            if match:
                # Skip this ref
                i += len(c_ref)
                removed = True
                # Skip following whitespace if any
                while i < len(order_string) and order_string[i].isspace():
                    i += 1
            else:
                cleaned_order += char
                i += 1

    # --- STEP 2: Insert Child Under Parent ---
    # Logic: Find Parent. Check next non-space char.
    # If '[': Parent already has children. Insert inside that array.
    # If not '[': Create new array [ Child ] after Parent.

    final_order = cleaned_order

    # Find parent index
    p_idx = final_order.find(p_ref)

    if p_idx != -1:
        # Look ahead
        scan_idx = p_idx + len(p_ref)
        insertion_point = -1
        is_existing_array = False

        # Scan forward for next significant char
        next_char_idx = -1
        for k in range(scan_idx, len(final_order)):
            if not final_order[k].isspace():
                next_char_idx = k
                break

        if next_char_idx != -1 and final_order[next_char_idx] == '[':
            # Parent has existing children array.
            # We must find the closing bracket for THIS array.
            is_existing_array = True
            arr_depth = 1
            for k in range(next_char_idx + 1, len(final_order)):
                if final_order[k] == '[': arr_depth += 1
                elif final_order[k] == ']': arr_depth -= 1

                if arr_depth == 0:
                    # Found the closing bracket
                    insertion_point = k
                    break
        else:
            # No existing array, insert after parent
            insertion_point = scan_idx

        if insertion_point != -1:
            if is_existing_array:
                # Insert inside existing array (before the closing bracket)
                prefix = final_order[:insertion_point]
                suffix = final_order[insertion_point:]
                final_order = prefix + " " + c_ref + suffix
            else:
                # Create new array after parent
                prefix = final_order[:insertion_point]
                suffix = final_order[insertion_point:]
                final_order = prefix + " [" + c_ref + "]" + suffix

    return final_order

if order_str and order_xref:
    new_order = modify_pdf_order(order_str, parent_ref, child_ref)
    ${this.docVar}.xref_set_key(order_xref, order_key_path, new_order)

child_xref
`);
  }
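  // Reviewer note: in /OCProperties/D/Order, hierarchy is encoded
  // positionally - an OCG reference followed by a nested array makes the
  // array's entries its children in the layer-panel tree, e.g.
  // [12 0 R [13 0 R 14 0 R]] shows 13 and 14 indented under 12. That is the
  // structure modify_pdf_order() edits textually above.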
  setLayerVisibility(ocgXref, on) {
    this.ensureOpen();
    this.runPython(`
import re

catalog_xref = ${this.docVar}.pdf_catalog()
t, ocprop_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties")

# Find the D (default config) and its xref/path
d_xref = None
d_path = None
is_inline_d = False

if t == "dict":
    # Inline OCProperties
    t2, d_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/D")
    if t2 == "dict":
        d_xref = catalog_xref
        d_path = "OCProperties/D"
        is_inline_d = True
    elif t2 != "null":
        m = re.search(r'(\\d+)\\s+\\d+\\s+R', d_val)
        if m:
            d_xref = int(m.group(1))
            d_path = ""
elif t != "null":
    m = re.search(r'(\\d+)\\s+\\d+\\s+R', ocprop_val)
    if m:
        ocprop_xref = int(m.group(1))
        t2, d_val = ${this.docVar}.xref_get_key(ocprop_xref, "D")
        if t2 == "dict":
            d_xref = ocprop_xref
            d_path = "D"
            is_inline_d = True
        elif t2 != "null":
            m2 = re.search(r'(\\d+)\\s+\\d+\\s+R', d_val)
            if m2:
                d_xref = int(m2.group(1))
                d_path = ""

if d_xref is None:
    raise ValueError("Could not find OCProperties/D config")

ocg_ref = f"${ocgXref} 0 R"

# Helper to add/remove xref from an array
def add_to_array(arr_str, xref_ref):
    if not arr_str or arr_str == "null":
        return "[" + xref_ref + "]"
    # Check if already in array
    if xref_ref in arr_str:
        return arr_str
    # Add before closing bracket
    return arr_str.rstrip(']') + " " + xref_ref + "]"

def remove_from_array(arr_str, xref_ref):
    if not arr_str or arr_str == "null":
        return arr_str
    # Remove the xref reference
    pattern = r'\\s*' + str(${ocgXref}) + r'\\s+0\\s+R'
    result = re.sub(pattern, '', arr_str)
    # Clean up any double spaces
    result = re.sub(r'\\s+', ' ', result)
    result = result.replace('[ ', '[').replace(' ]', ']')
    return result

# Get current ON and OFF arrays
on_key = d_path + "/ON" if d_path else "ON"
off_key = d_path + "/OFF" if d_path else "OFF"

t_on, on_arr = ${this.docVar}.xref_get_key(d_xref, on_key)
t_off, off_arr = ${this.docVar}.xref_get_key(d_xref, off_key)

if ${on ? "True" : "False"}:
    # Turn ON: add to ON array, remove from OFF array
    new_on = add_to_array(on_arr if t_on != "null" else "", ocg_ref)
    new_off = remove_from_array(off_arr if t_off != "null" else "", ocg_ref)
    ${this.docVar}.xref_set_key(d_xref, on_key, new_on)
    if new_off and new_off != "[]":
        ${this.docVar}.xref_set_key(d_xref, off_key, new_off)
else:
    # Turn OFF: add to OFF array, remove from ON array
    new_off = add_to_array(off_arr if t_off != "null" else "", ocg_ref)
    new_on = remove_from_array(on_arr if t_on != "null" else "", ocg_ref)
    ${this.docVar}.xref_set_key(d_xref, off_key, new_off)
    if new_on and new_on != "[]":
        ${this.docVar}.xref_set_key(d_xref, on_key, new_on)
`);
  }
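  // Reviewer note: the /ON and /OFF arrays in the default configuration
  // dictionary (/D) list the optional content groups whose initial visibility
  // deviates from /BaseState, so toggling a layer amounts to moving its
  // "N 0 R" reference between those two arrays - which is what the string
  // edits above do.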
  setOC(xref, ocgXref) {
    this.ensureOpen();
    this.runPython(`${this.docVar}.set_oc(${xref}, ${ocgXref})`);
  }
  getOC(xref) {
    this.ensureOpen();
    return this.runPython(`${this.docVar}.get_oc(${xref})`);
  }
  deleteOCG(layerNumber) {
    this.ensureOpen();
    this.runPython(`
import re

# First, get the actual OCG xref from the layer number
# layer_ui_configs returns items with "number" which is an index, not xref
# We need to find the actual OCG xref by looking at the OCProperties

catalog_xref = ${this.docVar}.pdf_catalog()

# Get OCProperties - it might be inline dict or a reference
t, ocprop_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties")

# Determine if OCProperties is inline (dict) or a reference
if t == "dict":
    # OCProperties is inline in catalog - we work directly with catalog_xref
    ocprop_xref = catalog_xref
    is_inline = True
else:
    # It's a reference like "X 0 R"
    ocprop_match = re.search(r'(\\d+)\\s+\\d+\\s+R', ocprop_val)
    if not ocprop_match:
        raise ValueError("Cannot find OCProperties")
    ocprop_xref = int(ocprop_match.group(1))
    is_inline = False

# Get the OCGs array to find the actual xref at this index
if is_inline:
    # For inline, we need to get it from the full catalog dict
    t, ocgs_str = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/OCGs")
else:
    t, ocgs_str = ${this.docVar}.xref_get_key(ocprop_xref, "OCGs")

if t == "null" or not ocgs_str:
    raise ValueError("No OCGs array found")

# Parse all xrefs from the array like "[5 0 R 6 0 R 7 0 R]"
xref_matches = re.findall(r'(\\d+)\\s+0\\s+R', ocgs_str)
ocg_xrefs = [int(x) for x in xref_matches]

# The layer number from layer_ui_configs corresponds to index in this array
if ${layerNumber} < 0 or ${layerNumber} >= len(ocg_xrefs):
    # layerNumber might actually BE the xref in some cases
    target_xref = ${layerNumber}
else:
    target_xref = ocg_xrefs[${layerNumber}]

# Helper to remove xref from array string
def remove_xref_from_array(arr_str, xref_to_remove):
    # Remove "X 0 R" pattern
    pattern = r'\\s*' + str(xref_to_remove) + r'\\s+0\\s+R'
    return re.sub(pattern, '', arr_str)

# Update the OCGs array
new_ocgs = remove_xref_from_array(ocgs_str, target_xref)
if is_inline:
    ${this.docVar}.xref_set_key(catalog_xref, "OCProperties/OCGs", new_ocgs)
else:
    ${this.docVar}.xref_set_key(ocprop_xref, "OCGs", new_ocgs)

# Get D (default config) and update its arrays
if is_inline:
    t, d_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/D")
else:
    t, d_val = ${this.docVar}.xref_get_key(ocprop_xref, "D")

if t == "dict":
    # D is inline
    d_xref = ocprop_xref if not is_inline else catalog_xref
    d_prefix = "OCProperties/D/" if is_inline else "D/"

    # Try to update ON, OFF, Order arrays
    for key in ["ON", "OFF", "Order"]:
        try:
            tk, val = ${this.docVar}.xref_get_key(d_xref, d_prefix.rstrip('/') + '/' + key if d_prefix else key)
            if tk != "null" and val:
                new_val = remove_xref_from_array(val, target_xref)
                ${this.docVar}.xref_set_key(d_xref, d_prefix.rstrip('/') + '/' + key if d_prefix else key, new_val)
        except:
            pass
elif t != "null":
    # D is a reference
    d_match = re.search(r'(\\d+)\\s+\\d+\\s+R', d_val)
    if d_match:
        d_xref = int(d_match.group(1))
        for key in ["ON", "OFF", "Order"]:
            try:
                tk, val = ${this.docVar}.xref_get_key(d_xref, key)
                if tk != "null" and val:
                    new_val = remove_xref_from_array(val, target_xref)
                    ${this.docVar}.xref_set_key(d_xref, key, new_val)
            except:
                pass
`);
  }
  close() {
    if (this.closed) return;
    try {
      this.runPython(`${this.docVar}.close()`);
      this.pyodide.FS.unlink(this.inputPath);
    } catch {
    }
    this.closed = true;
  }
};

// src/pymupdf.ts
import loadGhostscriptWASM from "@okathira/ghostpdl-wasm";
async function convertPdfToRgb(pdfData) {
  console.log("[convertPdfToRgb] Starting Ghostscript RGB conversion...");
  console.log("[convertPdfToRgb] Input size:", pdfData.length);
  const gs = await loadGhostscriptWASM({
    locateFile: (path) => {
      if (path.endsWith(".wasm")) {
        return "/ghostscript-wasm/gs.wasm";
      }
      return path;
    },
    print: (text) => console.log("[GS RGB]", text),
    printErr: (text) => console.error("[GS RGB Error]", text)
  });
  const inputPath = "/tmp/cmyk_input.pdf";
  const outputPath = "/tmp/rgb_output.pdf";
  gs.FS.writeFile(inputPath, pdfData);
  console.log("[convertPdfToRgb] Wrote input file");
  const args = [
    "-dBATCH",
    "-dNOPAUSE",
    "-dNOSAFER",
    "-dQUIET",
    "-sDEVICE=pdfwrite",
    "-sColorConversionStrategy=sRGB",
    "-sColorConversionStrategyForImages=sRGB",
    "-dConvertCMYKImagesToRGB=true",
    "-dProcessColorModel=/DeviceRGB",
    "-dAutoFilterColorImages=true",
    "-dAutoFilterGrayImages=true",
    "-dColorImageFilter=/DCTEncode",
    "-dGrayImageFilter=/DCTEncode",
    "-dCompatibilityLevel=1.4",
    `-sOutputFile=${outputPath}`,
    inputPath
  ];
  console.log("[convertPdfToRgb] Running Ghostscript with args:", args.join(" "));
  let exitCode;
  try {
    exitCode = gs.callMain(args);
  } catch (e) {
    console.error("[convertPdfToRgb] Ghostscript exception:", e);
    try {
      gs.FS.unlink(inputPath);
    } catch {
    }
    throw new Error(`Ghostscript threw exception: ${e}`);
  }
  console.log("[convertPdfToRgb] Ghostscript exit code:", exitCode);
  if (exitCode !== 0) {
    try {
      gs.FS.unlink(inputPath);
    } catch {
    }
    try {
      gs.FS.unlink(outputPath);
    } catch {
    }
    throw new Error(`Ghostscript RGB conversion failed with exit code ${exitCode}`);
  }
  let output;
  try {
    const stat = gs.FS.stat(outputPath);
    console.log("[convertPdfToRgb] Output file size:", stat.size);
    output = gs.FS.readFile(outputPath);
  } catch (e) {
    console.error("[convertPdfToRgb] Failed to read output:", e);
    try {
      gs.FS.unlink(inputPath);
    } catch {
    }
    throw new Error("Ghostscript did not produce output file");
  }
  try {
    gs.FS.unlink(inputPath);
  } catch {
  }
  try {
    gs.FS.unlink(outputPath);
  } catch {
  }
  const copy = new Uint8Array(output.length);
  copy.set(output);
  console.log("[convertPdfToRgb] Conversion complete, output size:", copy.length);
  return copy;
}
var ASSETS = {
  pyodide: "pyodide.js",
  wheels: [
    "pymupdf-1.26.3-cp313-none-pyodide_2025_0_wasm32.whl",
    "pymupdf4llm-0.0.27-py3-none-any.whl",
    "fonttools-4.56.0-py3-none-any.whl",
    "lxml-5.4.0-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "numpy-2.2.5-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "opencv_python-4.11.0.86-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "pdf2docx-0.5.8-py3-none-any.whl",
    "python_docx-1.2.0-py3-none-any.whl",
    "typing_extensions-4.12.2-py3-none-any.whl"
  ]
};
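// Reviewer note: the wheel filenames here mirror the package's assets/
// directory (see the file list at the top of this diff); consumers are
// expected to serve that directory so assetPath-relative loads resolve.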
var PyMuPDF = class {
  constructor(options) {
    this.pyodidePromise = null;
    this.pyodide = null;
    this.docCounter = 0;
    if (typeof options === "string") {
      this.assetPath = options;
    } else {
      this.assetPath = options?.assetPath ?? "./";
    }
    if (!this.assetPath.endsWith("/")) {
      this.assetPath += "/";
    }
  }
  getAssetPath(name) {
    return this.assetPath + name;
  }
  async load() {
    await this.getPyodide();
  }
  async getPyodide() {
    if (this.pyodide) return this.pyodide;
    if (this.pyodidePromise) return this.pyodidePromise;
    this.pyodidePromise = this.initPyodide();
    this.pyodide = await this.pyodidePromise;
    return this.pyodide;
  }
  async initPyodide() {
    const pyodideUrl = this.getAssetPath(ASSETS.pyodide);
    const pyodideModule = await import(
      /* @vite-ignore */
      pyodideUrl
    );
    const { loadPyodide } = pyodideModule;
    const pyodide = await loadPyodide({
      indexURL: this.assetPath
    });
    await Promise.all(
      ASSETS.wheels.map((wheel) => pyodide.loadPackage(this.getAssetPath(wheel)))
    );
    pyodide.runPython(`
import pymupdf
pymupdf.TOOLS.store_shrink(100)
`);
    return pyodide;
  }
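  // Reviewer note: caching pyodidePromise (not just the resolved pyodide)
  // makes concurrent first calls share one in-flight initialization instead
  // of booting the interpreter twice.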
  async open(input) {
    const pyodide = await this.getPyodide();
    const docId = ++this.docCounter;
    const docVar = `_doc${docId}`;
    const inputPath = `/input_${docId}`;
    const buf = await input.arrayBuffer();
    pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
    pyodide.runPython(`${docVar} = pymupdf.open("${inputPath}")`);
    return new PyMuPDFDocument(pyodide, docVar, inputPath);
  }
  async openUrl(url) {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to fetch ${url}: ${response.statusText}`);
    }
    const blob = await response.blob();
    return this.open(blob);
  }
  async create() {
    const pyodide = await this.getPyodide();
    const docId = ++this.docCounter;
    const docVar = `_doc${docId}`;
    const inputPath = `/input_${docId}`;
    pyodide.runPython(`${docVar} = pymupdf.open()`);
    return new PyMuPDFDocument(pyodide, docVar, inputPath);
  }
  async pdfToDocx(pdf, pages) {
    const pyodide = await this.getPyodide();
    const buf = await pdf.arrayBuffer();
    let pdfData = new Uint8Array(buf);
    console.log("[pdfToDocx] Converting PDF to RGB colorspace with Ghostscript...");
    try {
      const rgbData = await convertPdfToRgb(pdfData);
      pdfData = rgbData;
      console.log("[pdfToDocx] RGB conversion complete");
    } catch (e) {
      console.warn("[pdfToDocx] Ghostscript RGB conversion failed, trying original:", e);
    }
    pyodide.FS.writeFile("/input.pdf", pdfData);
    const pagesArg = pages ? `[${pages.join(", ")}]` : "None";
    pyodide.runPython(`
import pymupdf
from pdf2docx import Converter
from pdf2docx.image.ImagesExtractor import ImagesExtractor

# Store original _to_raw_dict static method
_orig_to_raw_dict = ImagesExtractor._to_raw_dict

def _patched_to_raw_dict(image, bbox):
    """Convert non-RGB pixmaps to RGB before processing.

    This is a staticmethod that takes (image, bbox).
    PNG format only supports grayscale and RGB, so we need to convert
    CMYK and other colorspaces to RGB.
    """
    pix = image

    # Check if pixmap needs conversion to RGB
    # PNG only supports: Grayscale (n=1), Grayscale+Alpha (n=2), RGB (n=3), RGBA (n=4)
    needs_conversion = False

    if hasattr(pix, 'colorspace') and pix.colorspace:
        cs_name = pix.colorspace.name.upper() if pix.colorspace.name else ''
        # Convert if not grayscale or RGB
        if 'CMYK' in cs_name or 'DEVICECMYK' in cs_name:
            needs_conversion = True
        elif cs_name not in ('DEVICEGRAY', 'GRAY', 'DEVICERGB', 'RGB', 'SRGB', ''):
            # Unknown colorspace - try to convert to RGB
            needs_conversion = True

    # Also check by component count: CMYK has n=4 without alpha
    if not needs_conversion and hasattr(pix, 'n') and hasattr(pix, 'alpha'):
        if pix.n == 4 and not pix.alpha:
            # Likely CMYK (4 components, no alpha)
            needs_conversion = True
        elif pix.n > 4:
            # More than 4 components - definitely needs conversion
            needs_conversion = True

    if needs_conversion:
        try:
            # Convert to RGB
            pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
        except Exception as e:
            # If direct conversion fails, try via samples
            try:
                # Create a new RGB pixmap with same dimensions
                new_pix = pymupdf.Pixmap(pymupdf.csRGB, pix.irect)
                new_pix.set_rect(pix.irect, (255, 255, 255))  # White background
                # Insert the original (this handles conversion)
                new_pix.copy(pix, pix.irect)
                pix = new_pix
            except:
                # Last resort: just pass through and hope for the best
                pass

    # Call original static method with converted pixmap and bbox
    return _orig_to_raw_dict(pix, bbox)

# Apply patch as staticmethod
ImagesExtractor._to_raw_dict = staticmethod(_patched_to_raw_dict)

cv = Converter("/input.pdf")
cv.convert("/output.docx", pages=${pagesArg})
cv.close()

# Restore original
ImagesExtractor._to_raw_dict = _orig_to_raw_dict
`);
    const outputBuf = pyodide.FS.readFile("/output.docx");
    try {
      pyodide.FS.unlink("/input.pdf");
      pyodide.FS.unlink("/output.docx");
    } catch {
    }
    return new Blob([new Uint8Array(outputBuf)], {
      type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    });
  }
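  // Reviewer note: pdfToDocx applies a temporary monkeypatch to pdf2docx's
  // ImagesExtractor._to_raw_dict so CMYK/exotic pixmaps are converted to RGB
  // before PNG encoding, then restores the original; the Ghostscript pass is
  // best-effort (failures fall back to the unconverted input).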
1366
|
+
async merge(pdfs) {
|
|
1367
|
+
if (pdfs.length === 0) {
|
|
1368
|
+
throw new Error("No PDFs provided for merging");
|
|
1369
|
+
}
|
|
1370
|
+
const result = await this.open(pdfs[0]);
|
|
1371
|
+
for (let i = 1; i < pdfs.length; i++) {
|
|
1372
|
+
const doc = await this.open(pdfs[i]);
|
|
1373
|
+
result.insertPdf(doc);
|
|
1374
|
+
doc.close();
|
|
1375
|
+
}
|
|
1376
|
+
const blob = result.saveAsBlob();
|
|
1377
|
+
result.close();
|
|
1378
|
+
return blob;
|
|
1379
|
+
}
  async split(pdf, ranges) {
    const results = [];
    const source = await this.open(pdf);
    const pageCount = source.pageCount;
    for (const range of ranges) {
      const start = Math.max(0, range.start);
      const end = Math.min(pageCount - 1, range.end);
      if (start > end) continue;
      const newDoc = await this.create();
      newDoc.insertPdf(source, { fromPage: start, toPage: end });
      results.push(newDoc.saveAsBlob());
      newDoc.close();
    }
    source.close();
    return results;
  }
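  // Usage sketch for split(): ranges are zero-based and inclusive, and the
  // loop above clamps out-of-range values. Assumes `mupdf` is an initialized
  // PyMuPDF instance:
  //
  //   const parts = await mupdf.split(pdf, [
  //     { start: 0, end: 0 },  // first page only
  //     { start: 1, end: 99 }, // clamped to the last page
  //   ]);
  //   // parts: one Blob per surviving range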
  async extractText(pdf) {
    const doc = await this.open(pdf);
    let text = "";
    for (const page of doc.pages()) {
      text += page.getText() + "\n";
    }
    doc.close();
    return text.trim();
  }
  async renderPage(pdf, pageIndex, dpi = 150) {
    const doc = await this.open(pdf);
    const page = doc.getPage(pageIndex);
    const image = await page.toImage({ dpi });
    doc.close();
    return image;
  }
  async convertToPdf(file, options) {
    const pyodide = await this.getPyodide();
    const docId = ++this.docCounter;
    const inputPath = `/convert_input_${docId}`;
    const filename = file instanceof File ? file.name : "document";
    const ext = options?.filetype ?? filename.split(".").pop()?.toLowerCase() ?? "";
    const buf = await file.arrayBuffer();
    pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
    const result = pyodide.runPython(`
      import base64

      src = pymupdf.open("${inputPath}"${ext ? `, filetype="${ext}"` : ""})
      pdf_bytes = src.convert_to_pdf()
      src.close()

      pdf = pymupdf.open("pdf", pdf_bytes)
      output = pdf.tobytes(garbage=3, deflate=True)
      pdf.close()

      base64.b64encode(output).decode('ascii')
    `);
    try {
      pyodide.FS.unlink(inputPath);
    } catch {
    }
    const binary = atob(result);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
  }
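  // Usage sketch for convertToPdf(): the filetype comes from options.filetype
  // when given, else it is inferred from the file extension. Assumes `mupdf`
  // is an initialized PyMuPDF instance:
  //
  //   const pdfBlob = await mupdf.convertToPdf(file, { filetype: "xps" });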
  /**
   * Repair a PDF by re-opening and re-saving with garbage collection and compression.
   * This fixes stream length issues that can occur from Ghostscript WASM output.
   * @param pdf The PDF to repair
   * @returns Repaired PDF blob
   */
  async repairPdf(pdf) {
    const pyodide = await this.getPyodide();
    const docId = ++this.docCounter;
    const inputPath = `/repair_input_${docId}`;
    const buf = await pdf.arrayBuffer();
    pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
    const result = pyodide.runPython(`
      import base64

      # Open the PDF (this re-parses and fixes internal structure)
      doc = pymupdf.open("${inputPath}")

      # Re-save with garbage collection and deflate compression
      # garbage=4 is the most aggressive cleanup (includes unused objects and duplicate streams)
      # deflate=True compresses streams
      output = doc.tobytes(garbage=4, deflate=True, clean=True)
      doc.close()

      base64.b64encode(output).decode('ascii')
    `);
    try {
      pyodide.FS.unlink(inputPath);
    } catch {
    }
    const binary = atob(result);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return new Blob([bytes], { type: "application/pdf" });
  }
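  // Usage sketch for repairPdf(): a plain round-trip through PyMuPDF's parser
  // with the aggressive save options above. Assumes `mupdf` is an initialized
  // PyMuPDF instance and `brokenPdf` is a PDF Blob:
  //
  //   const repaired = await mupdf.repairPdf(brokenPdf);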
  async xpsToPdf(xps) {
    return this.convertToPdf(xps, { filetype: "xps" });
  }
  async epubToPdf(epub) {
    return this.convertToPdf(epub, { filetype: "epub" });
  }
  async imageToPdf(image, options) {
    return this.convertToPdf(image, { filetype: options?.imageType });
  }
  async svgToPdf(svg) {
    return this.convertToPdf(svg, { filetype: "svg" });
  }
  async imagesToPdf(images) {
    if (images.length === 0) {
      throw new Error("No images provided");
    }
    const pyodide = await this.getPyodide();
    pyodide.runPython(`_multi_img_pdf = pymupdf.open()`);
    for (let i = 0; i < images.length; i++) {
      const image = images[i];
      const inputPath = `/multi_img_${i}`;
      const buf = await image.arrayBuffer();
      pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
      pyodide.runPython(`
        img_doc = pymupdf.open("${inputPath}")
        pdf_bytes = img_doc.convert_to_pdf()
        img_pdf = pymupdf.open("pdf", pdf_bytes)
        _multi_img_pdf.insert_pdf(img_pdf)
        img_pdf.close()
        img_doc.close()
      `);
      try {
        pyodide.FS.unlink(inputPath);
      } catch {
      }
    }
    const result = pyodide.runPython(`
      import base64
      output = _multi_img_pdf.tobytes(garbage=3, deflate=True)
      _multi_img_pdf.close()
      base64.b64encode(output).decode('ascii')
    `);
    const binary = atob(result);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
  }
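  // Usage sketch for imagesToPdf(): each image becomes one page, in array
  // order. Assumes `mupdf` is an initialized PyMuPDF instance and `scan1`,
  // `scan2` are image Blobs:
  //
  //   const album = await mupdf.imagesToPdf([scan1, scan2]);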
  async pdfToImages(pdf, options) {
    const pyodide = await this.getPyodide();
    const doc = await this.open(pdf);
    const format = options?.format ?? "png";
    const dpi = options?.dpi ?? 150;
    const zoom = dpi / 72;
    const pageCount = doc.pageCount;
    const pagesToExport = options?.pages ?? Array.from({ length: pageCount }, (_, i) => i);
    const results = [];
    for (const pageIdx of pagesToExport) {
      if (pageIdx < 0 || pageIdx >= pageCount) continue;
      const result = pyodide.runPython(`
        import base64
        page = ${doc.docVar}[${pageIdx}]
        mat = pymupdf.Matrix(${zoom}, ${zoom})
        pix = page.get_pixmap(matrix=mat)
        base64.b64encode(pix.tobytes("${format}")).decode('ascii')
      `);
      const binary = atob(result);
      const bytes = new Uint8Array(binary.length);
      for (let i = 0; i < binary.length; i++) {
        bytes[i] = binary.charCodeAt(i);
      }
      results.push(bytes);
    }
    doc.close();
    return results;
  }
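  // Usage sketch for pdfToImages(): returns one Uint8Array of encoded image
  // data per exported page (PNG by default). Assumes `mupdf` is an
  // initialized PyMuPDF instance:
  //
  //   const [firstPage] = await mupdf.pdfToImages(pdf, { dpi: 300, pages: [0] });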
  async pdfToSvg(pdf, pages) {
    const doc = await this.open(pdf);
    const pageCount = doc.pageCount;
    const pagesToExport = pages ?? Array.from({ length: pageCount }, (_, i) => i);
    const results = [];
    for (const pageIdx of pagesToExport) {
      if (pageIdx < 0 || pageIdx >= pageCount) continue;
      const page = doc.getPage(pageIdx);
      results.push(page.toSvg());
    }
    doc.close();
    return results;
  }
  async pdfToText(pdf) {
    return this.extractText(pdf);
  }
  async pdfToHtml(pdf) {
    const doc = await this.open(pdf);
    let html = "";
    for (const page of doc.pages()) {
      html += page.getText("html") + "\n";
    }
    doc.close();
    return html;
  }
  async pdfToJson(pdf) {
    const doc = await this.open(pdf);
    const results = [];
    for (const page of doc.pages()) {
      const text = page.getText("dict");
      results.push(text);
    }
    doc.close();
    return results;
  }
  async pdfToXml(pdf) {
    const doc = await this.open(pdf);
    let xml = '<?xml version="1.0" encoding="UTF-8"?>\n<document>\n';
    for (const page of doc.pages()) {
      xml += page.getText("xml") + "\n";
    }
    xml += "</document>";
    doc.close();
    return xml;
  }
  hasRtlCharacters(text) {
    const rtlPattern = /[\u0590-\u05FF\u0600-\u06FF\u0700-\u074F\u0750-\u077F\u0780-\u07BF\u07C0-\u07FF\u08A0-\u08FF\uFB1D-\uFB4F\uFB50-\uFDFF\uFE70-\uFEFF]/;
    return rtlPattern.test(text);
  }
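  // Illustrative checks for hasRtlCharacters(): the character class covers
  // the Hebrew, Arabic, Syriac, Thaana and NKo blocks plus their Unicode
  // presentation forms.
  //
  //   mupdf.hasRtlCharacters("hello");  // false
  //   mupdf.hasRtlCharacters("سلام");    // true (Arabic)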
  async textToPdf(text, options) {
    const pyodide = await this.getPyodide();
    const isRtl = this.hasRtlCharacters(text);
    const directionStyle = isRtl ? "direction: rtl; text-align: right;" : "";
    const escapedText = text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#39;").replace(/\\/g, "\\\\").replace(/\n/g, "<br>");
    const fontSize = options?.fontSize ?? 11;
    const pageSize = options?.pageSize ?? "a4";
    const margins = options?.margins ?? 72;
    const fontMap = {
      "helv": "sans-serif",
      "tiro": "serif",
      "cour": "monospace",
      "times": "serif"
    };
    const fontName = options?.fontName ?? "helv";
    const fontFamily = fontMap[fontName] || "sans-serif";
    const result = pyodide.runPython(`
      import base64

      html_content = '''
      <p style="font-family: ${fontFamily}; font-size: ${fontSize}pt; margin: 0; padding: 0; ${directionStyle}">
      ${escapedText}
      </p>
      '''

      doc = pymupdf.open()
      mediabox = pymupdf.paper_rect("${pageSize}")
      margin = ${margins}
      where = mediabox + (margin, margin, -margin, -margin)

      more = True
      page_count = 0
      max_pages = 100

      while more and page_count < max_pages:
          page = doc.new_page(width=mediabox.width, height=mediabox.height)
          more, _ = page.insert_htmlbox(where, html_content, css="* { font-family: ${fontFamily}; font-size: ${fontSize}pt; }")
          page_count += 1

      # Subset and embed fonts for PDF/A compatibility
      doc.subset_fonts()

      pdf_bytes = doc.tobytes(garbage=3, deflate=True)
      doc.close()

      base64.b64encode(pdf_bytes).decode('ascii')
    `);
    const binaryStr = atob(result);
    const bytes = new Uint8Array(binaryStr.length);
    for (let i = 0; i < binaryStr.length; i++) {
      bytes[i] = binaryStr.charCodeAt(i);
    }
    return new Blob([bytes], { type: "application/pdf" });
  }
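  // Usage sketch for textToPdf(): the text is HTML-escaped, laid out with
  // insert_htmlbox, and RTL input flips the paragraph direction. Assumes
  // `mupdf` is an initialized PyMuPDF instance:
  //
  //   const pdfBlob = await mupdf.textToPdf("Hello\nWorld", {
  //     fontName: "cour",   // mapped to the monospace family above
  //     fontSize: 12,
  //     pageSize: "letter",
  //   });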
  async htmlToPdf(html, options) {
    const pyodide = await this.getPyodide();
    const escapedHtml = html.replace(/\\/g, "\\\\").replace(/'/g, "\\'").replace(/\n/g, "\\n");
    const escapedCss = options?.css?.replace(/\\/g, "\\\\").replace(/'/g, "\\'").replace(/\n/g, "\\n") ?? "";
    const pageSize = options?.pageSize ?? "a4";
    let margins = { top: 36, right: 36, bottom: 36, left: 36 };
    if (typeof options?.margins === "number") {
      margins = { top: options.margins, right: options.margins, bottom: options.margins, left: options.margins };
    } else if (options?.margins) {
      margins = options.margins;
    }
    const result = pyodide.runPython(`
      import base64
      import io
      import re
      import json

      html_content = '''${escapedHtml}'''
      css_content = '''${escapedCss}'''

      # Extract links from HTML before processing
      link_pattern = r'<a[^>]*href=["\\'](https?://[^"\\'>]+)["\\'"][^>]*>([^<]+)</a>'
      links = re.findall(link_pattern, html_content, re.IGNORECASE)
      # links is a list of (url, text) tuples

      html_content = re.sub(r'<link[^>]*stylesheet[^>]*>', '', html_content, flags=re.IGNORECASE)
      html_content = re.sub(r'<link[^>]*href=[^>]*>', '', html_content, flags=re.IGNORECASE)
      html_content = re.sub(r'<script[^>]*src=[^>]*>.*?<\\/script>', '', html_content, flags=re.IGNORECASE|re.DOTALL)
      html_content = re.sub(r'<script[^>]*src=[^>]*/>', '', html_content, flags=re.IGNORECASE)

      mediabox = pymupdf.paper_rect("${pageSize}")
      where = mediabox + (${margins.left}, ${margins.top}, -${margins.right}, -${margins.bottom})

      story = pymupdf.Story(html=html_content, user_css=css_content if css_content else None)

      buffer = io.BytesIO()
      writer = pymupdf.DocumentWriter(buffer)

      def rectfn(rect_num, filled):
          if rect_num == 0 or filled == 0:
              return mediabox, where, None
          return mediabox, where, None

      story.write(writer, rectfn)
      writer.close()

      # Now open the PDF and add link annotations
      buffer.seek(0)
      doc = pymupdf.open("pdf", buffer.read())

      # For each link found in HTML, search for the text and add a link annotation
      for url, text in links:
          text = text.strip()
          if not text:
              continue
          # Search all pages for this text
          for page_num in range(doc.page_count):
              page = doc[page_num]
              # Search for the link text
              text_instances = page.search_for(text)
              for rect in text_instances:
                  # Add a link annotation
                  link = page.insert_link({
                      "kind": pymupdf.LINK_URI,
                      "from": rect,
                      "uri": url
                  })

      # Save the modified PDF
      output_buffer = io.BytesIO()
      doc.save(output_buffer)
      doc.close()

      pdf_bytes = output_buffer.getvalue()
      base64.b64encode(pdf_bytes).decode('ascii')
    `);
    const binaryStr = atob(result);
    const bytes = new Uint8Array(binaryStr.length);
    for (let i = 0; i < binaryStr.length; i++) {
      bytes[i] = binaryStr.charCodeAt(i);
    }
    return new Blob([bytes], { type: "application/pdf" });
  }
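  // Usage sketch for htmlToPdf(): external stylesheets and scripts are
  // stripped before layout, and <a href="http(s)://..."> anchors are
  // re-added as link annotations via text search. Assumes `mupdf` is an
  // initialized PyMuPDF instance:
  //
  //   const pdfBlob = await mupdf.htmlToPdf(
  //     '<h1>Report</h1><a href="https://example.com">example.com</a>',
  //     { pageSize: "a4", margins: 48 }
  //   );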
  async pdfToMarkdown(pdf, options) {
    const pyodide = await this.getPyodide();
    const docId = ++this.docCounter;
    const inputPath = `/md_input_${docId}`;
    const buf = await pdf.arrayBuffer();
    pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
    const embedImages = options?.includeImages ? "True" : "False";
    const pageBreaks = options?.pageBreaks !== false ? "True" : "False";
    const pagesArg = options?.pages ? `pages=[${options.pages.join(", ")}]` : "";
    const result = pyodide.runPython(`
      import pymupdf4llm

      md_text = pymupdf4llm.to_markdown(
          "${inputPath}",
          embed_images=${embedImages},
          page_chunks=${pageBreaks}${pagesArg ? ", " + pagesArg : ""}
      )

      if isinstance(md_text, list):
          result = "\\n\\n---\\n\\n".join([chunk.get('text', '') if isinstance(chunk, dict) else str(chunk) for chunk in md_text])
      else:
          result = md_text if md_text else ""

      result
    `);
    try {
      pyodide.FS.unlink(inputPath);
    } catch {
    }
    return result;
  }
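  // Usage sketch for pdfToMarkdown(): with pageBreaks left on, the per-page
  // chunks are joined with "---" separators as shown above. Assumes `mupdf`
  // is an initialized PyMuPDF instance:
  //
  //   const md = await mupdf.pdfToMarkdown(pdf, { includeImages: false });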
  async pdfToLlmChunks(pdf) {
    const pyodide = await this.getPyodide();
    const docId = ++this.docCounter;
    const inputPath = `/llm_input_${docId}`;
    const buf = await pdf.arrayBuffer();
    pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
    const result = pyodide.runPython(`
      import pymupdf4llm
      import json

      chunks = pymupdf4llm.to_markdown(
          "${inputPath}",
          page_chunks=True
      )

      result = []
      for chunk in chunks:
          if isinstance(chunk, dict):
              result.append({
                  "text": chunk.get("text", ""),
                  "metadata": {
                      "page": chunk.get("metadata", {}).get("page", None)
                  }
              })
          else:
              result.append({"text": str(chunk), "metadata": {}})

      json.dumps(result)
    `);
    try {
      pyodide.FS.unlink(inputPath);
    } catch {
    }
    return JSON.parse(result);
  }
  /**
   * Extract PDF as LlamaIndex-compatible documents using PyMuPDF4LLM.
   * Uses to_markdown with page_chunks=True to produce LlamaIndex Document format.
   * @param pdf The PDF file to extract
   * @returns Array of LlamaIndex-compatible documents
   */
  async pdfToLlamaIndex(pdf) {
    const pyodide = await this.getPyodide();
    const docId = ++this.docCounter;
    const inputPath = `/llama_input_${docId}`;
    const filename = pdf instanceof File ? pdf.name : "document.pdf";
    const buf = await pdf.arrayBuffer();
    pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
    const result = pyodide.runPython(`
      import pymupdf4llm
      import pymupdf
      import json

      # Use to_markdown with page_chunks=True - same output as LlamaMarkdownReader
      chunks = pymupdf4llm.to_markdown("${inputPath}", page_chunks=True)

      # Get document metadata
      doc = pymupdf.open("${inputPath}")
      doc_meta = doc.metadata
      page_count = doc.page_count
      doc.close()

      # Convert to LlamaIndex Document format
      result = []
      for chunk in chunks:
          if isinstance(chunk, dict):
              doc_dict = {
                  "text": chunk.get("text", ""),
                  "metadata": {
                      "file_name": "${filename.replace(/"/g, '\\"')}",
                      "total_pages": page_count
                  }
              }

              # Copy chunk metadata
              chunk_meta = chunk.get("metadata", {})
              if chunk_meta:
                  if "page" in chunk_meta:
                      doc_dict["metadata"]["page"] = chunk_meta["page"]
                  if "page_count" in chunk_meta:
                      doc_dict["metadata"]["page_count"] = chunk_meta["page_count"]
                  if "file_path" in chunk_meta:
                      doc_dict["metadata"]["file_path"] = chunk_meta["file_path"]

              # Add document-level metadata
              if doc_meta:
                  for key in ["author", "title", "subject", "keywords", "creator", "producer", "creationDate", "modDate"]:
                      if doc_meta.get(key):
                          doc_dict["metadata"][key] = doc_meta[key]

              # Include tables info if available (convert Rect to list)
              if "tables" in chunk and chunk["tables"]:
                  tables_serializable = []
                  for t in chunk["tables"]:
                      if isinstance(t, dict):
                          t_copy = dict(t)
                          if "bbox" in t_copy and hasattr(t_copy["bbox"], "__iter__"):
                              t_copy["bbox"] = list(t_copy["bbox"])
                          tables_serializable.append(t_copy)
                  doc_dict["metadata"]["tables"] = tables_serializable

              # Include images info if available (convert Rect to list)
              if "images" in chunk and chunk["images"]:
                  images_serializable = []
                  for img in chunk["images"]:
                      if isinstance(img, dict):
                          img_copy = dict(img)
                          if "bbox" in img_copy and hasattr(img_copy["bbox"], "__iter__"):
                              img_copy["bbox"] = list(img_copy["bbox"])
                          images_serializable.append(img_copy)
                  doc_dict["metadata"]["images"] = images_serializable

              if "toc_items" in chunk:
                  doc_dict["metadata"]["toc_items"] = chunk["toc_items"]

              result.append(doc_dict)
          else:
              result.append({"text": str(chunk), "metadata": {"file_name": "${filename.replace(/"/g, '\\"')}"}})

      json.dumps(result)
    `);
    try {
      pyodide.FS.unlink(inputPath);
    } catch {
    }
    return JSON.parse(result);
  }
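  // Illustrative shape of one pdfToLlamaIndex() result entry; the hypothetical
  // field values depend entirely on the input document, and optional metadata
  // (tables, images, toc_items) appears only when PyMuPDF4LLM reports it:
  //
  //   {
  //     text: "# Heading\n...",
  //     metadata: { file_name: "report.pdf", total_pages: 12, page: 1 }
  //   }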
  /**
   * Rasterize a PDF - convert all pages to images and create a new PDF from those images.
   * This flattens all vector graphics, text, and layers into raster images.
   * Useful for: printing, reducing file complexity, removing selectable text, or creating image-based PDFs.
   */
  async rasterizePdf(pdf, options) {
    const pyodide = await this.getPyodide();
    const docId = ++this.docCounter;
    const inputPath = `/rasterize_input_${docId}`;
    const dpi = options?.dpi ?? 150;
    const format = options?.format ?? "png";
    const quality = options?.quality ?? 95;
    const alpha = options?.alpha ?? false;
    const pages = options?.pages;
    const grayscale = options?.grayscale ?? false;
    const buf = await pdf.arrayBuffer();
    pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
    const pagesArg = pages ? `[${pages.join(", ")}]` : "None";
    const result = pyodide.runPython(`
      import base64

      src_doc = pymupdf.open("${inputPath}")
      out_doc = pymupdf.open()

      zoom = ${dpi} / 72.0
      mat = pymupdf.Matrix(zoom, zoom)

      page_indices = ${pagesArg} if ${pagesArg} is not None else range(src_doc.page_count)

      for page_idx in page_indices:
          if page_idx < 0 or page_idx >= src_doc.page_count:
              continue

          page = src_doc[page_idx]

          # Render page to pixmap
          pix = page.get_pixmap(matrix=mat, alpha=${alpha ? "True" : "False"})

          # Convert to grayscale if requested
          if ${grayscale ? "True" : "False"}:
              pix = pymupdf.Pixmap(pymupdf.csGRAY, pix)

          # Get image bytes
          img_bytes = pix.tobytes("${format}"${format === "jpeg" ? `, jpg_quality=${quality}` : ""})

          # Create new page with same dimensions as rendered image
          # Scale back to original page size for the PDF
          orig_rect = page.rect
          new_page = out_doc.new_page(width=orig_rect.width, height=orig_rect.height)

          # Insert the rasterized image
          new_page.insert_image(new_page.rect, stream=img_bytes)

      src_doc.close()

      # Save output PDF
      pdf_bytes = out_doc.tobytes(garbage=3, deflate=True)
      out_doc.close()

      base64.b64encode(pdf_bytes).decode('ascii')
    `);
    try {
      pyodide.FS.unlink(inputPath);
    } catch {
    }
    const binary = atob(result);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return new Blob([bytes], { type: "application/pdf" });
  }
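  // Usage sketch for rasterizePdf(): the output keeps the original page sizes
  // but replaces every page's content with a single bitmap. Assumes `mupdf`
  // is an initialized PyMuPDF instance:
  //
  //   const flattened = await mupdf.rasterizePdf(pdf, {
  //     dpi: 200,
  //     format: "jpeg",
  //     quality: 80,
  //   });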
  /**
   * Compress a PDF using multiple optimization techniques.
   * Combines dead-weight removal, image compression, font subsetting, and advanced save options.
   * Based on PyMuPDF's optimization capabilities.
   */
  async compressPdf(pdf, options) {
    const pyodide = await this.getPyodide();
    const docId = ++this.docCounter;
    const inputPath = `/compress_input_${docId}`;
    const buf = await pdf.arrayBuffer();
    const originalSize = buf.byteLength;
    pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
    const scrubOpts = options?.scrub ?? {};
    const scrubMetadata = scrubOpts.metadata !== false;
    const scrubXmlMetadata = scrubOpts.xmlMetadata !== false;
    const scrubAttachedFiles = scrubOpts.attachedFiles ?? false;
    const scrubEmbeddedFiles = scrubOpts.embeddedFiles ?? false;
    const scrubThumbnails = scrubOpts.thumbnails !== false;
    const scrubResetFields = scrubOpts.resetFields ?? false;
    const scrubResetResponses = scrubOpts.resetResponses ?? false;
    const imageOpts = options?.images ?? {};
    const compressImages = imageOpts.enabled !== false;
    const dpiThreshold = imageOpts.dpiThreshold ?? 150;
    const dpiTarget = imageOpts.dpiTarget ?? 96;
    const imageQuality = imageOpts.quality ?? 75;
    const processLossy = imageOpts.lossy !== false;
    const processLossless = imageOpts.lossless !== false;
    const processBitonal = imageOpts.bitonal ?? false;
    const processColor = imageOpts.color !== false;
    const processGray = imageOpts.gray !== false;
    const convertToGray = imageOpts.convertToGray ?? false;
    const subsetFonts = options?.subsetFonts !== false;
    const saveOpts = options?.save ?? {};
    const garbage = saveOpts.garbage ?? 4;
    const deflate = saveOpts.deflate !== false;
    const clean = saveOpts.clean !== false;
    const useObjstms = saveOpts.useObjstms !== false;
    const result = pyodide.runPython(`
      import base64
      import json

      doc = pymupdf.open("${inputPath}")
      original_page_count = doc.page_count

      # 1. Dead-weight removal (scrub)
      doc.scrub(
          metadata=${scrubMetadata ? "True" : "False"},
          xml_metadata=${scrubXmlMetadata ? "True" : "False"},
          attached_files=${scrubAttachedFiles ? "True" : "False"},
          embedded_files=${scrubEmbeddedFiles ? "True" : "False"},
          thumbnails=${scrubThumbnails ? "True" : "False"},
          reset_fields=${scrubResetFields ? "True" : "False"},
          reset_responses=${scrubResetResponses ? "True" : "False"},
      )

      # 2. Image compression
      if ${compressImages ? "True" : "False"}:
          doc.rewrite_images(
              dpi_threshold=${dpiThreshold},
              dpi_target=${dpiTarget},
              quality=${imageQuality},
              lossy=${processLossy ? "True" : "False"},
              lossless=${processLossless ? "True" : "False"},
              bitonal=${processBitonal ? "True" : "False"},
              color=${processColor ? "True" : "False"},
              gray=${processGray ? "True" : "False"},
              set_to_gray=${convertToGray ? "True" : "False"},
          )

      # 3. Font subsetting
      if ${subsetFonts ? "True" : "False"}:
          doc.subset_fonts()

      # 4. Save with optimization options
      pdf_bytes = doc.tobytes(
          garbage=${garbage},
          deflate=${deflate ? "True" : "False"},
          use_objstms=${useObjstms ? "True" : "False"},
          clean=${clean ? "True" : "False"}
      )

      compressed_size = len(pdf_bytes)
      doc.close()

      json.dumps({
          'data': base64.b64encode(pdf_bytes).decode('ascii'),
          'compressedSize': compressed_size,
          'pageCount': original_page_count
      })
    `);
    try {
      pyodide.FS.unlink(inputPath);
    } catch {
    }
    const parsed = JSON.parse(result);
    const binary = atob(parsed.data);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    const compressedSize = parsed.compressedSize;
    const savings = originalSize - compressedSize;
    const savingsPercent = originalSize > 0 ? savings / originalSize * 100 : 0;
    return {
      blob: new Blob([bytes], { type: "application/pdf" }),
      originalSize,
      compressedSize,
      savings,
      savingsPercent: Math.round(savingsPercent * 10) / 10,
      pageCount: parsed.pageCount
    };
  }
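  // Usage sketch for compressPdf(): every option group falls back to the
  // defaults shown above. Assumes `mupdf` is an initialized PyMuPDF instance:
  //
  //   const { blob, savingsPercent } = await mupdf.compressPdf(pdf, {
  //     images: { dpiTarget: 96, quality: 70 },
  //     save: { garbage: 4 },
  //   });
  //   console.log(`saved ${savingsPercent}%`);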
};
export {
  PyMuPDF,
  PyMuPDFDocument,
  PyMuPDFPage
};