@bentopdf/pymupdf-wasm 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,2091 @@
1
+ // src/page.ts
2
/**
 * Encode a byte array as a base64 string.
 *
 * The bytes are converted to a binary string in 32 KiB chunks so that
 * `String.fromCharCode` is never called with an argument list large enough
 * to overflow the engine's call-stack limit, then encoded with `btoa`.
 *
 * @param {Uint8Array} bytes - raw bytes to encode
 * @returns {string} base64 representation of `bytes`
 */
function uint8ArrayToBase64(bytes) {
  const CHUNK = 32768;
  const pieces = [];
  for (let offset = 0; offset < bytes.length; offset += CHUNK) {
    const slice = bytes.subarray(offset, Math.min(offset + CHUNK, bytes.length));
    pieces.push(String.fromCharCode(...slice));
  }
  return btoa(pieces.join(""));
}
11
/**
 * Wrapper exposing a single PyMuPDF page to JavaScript.
 *
 * Every operation works by generating a Python snippet and handing it to the
 * injected `runPython` callback (a Pyodide `runPython`-style function whose
 * return value is the value of the snippet's last expression). The page is
 * always re-fetched as `docVar[pageNumber]`, so this wrapper keeps no
 * Python-side state of its own.
 *
 * NOTE(review): snippets ending in a bare Python list/number are assumed to
 * come back as plain JS values through Pyodide's implicit conversion — confirm
 * against the runtime, since bare lists may surface as PyProxy objects.
 *
 * Fix over the previous revision: all user-supplied strings interpolated into
 * generated Python (search text, inserted text, link URIs, redaction text,
 * text-extraction format) are now escaped via `escapePy`; previously only
 * double quotes (or only `"""`) were escaped, so backslashes or newlines
 * could break the generated code or inject arbitrary Python.
 */
var PyMuPDFPage = class {
  /**
   * @param {(code: string) => any} runPython - executes Python and returns the last expression's value
   * @param {string} docVar - name of the Python variable holding the open document
   * @param {number} pageNumber - zero-based page index
   */
  constructor(runPython, docVar, pageNumber) {
    this.runPython = runPython;
    this.docVar = docVar;
    this.pageNumber = pageNumber;
  }
  /**
   * Escape arbitrary text for safe embedding inside a double-quoted Python
   * string literal (backslashes first, then quotes, then CR/LF).
   * @param {string} text
   * @returns {string}
   */
  escapePy(text) {
    return String(text)
      .replace(/\\/g, "\\\\")
      .replace(/"/g, '\\"')
      .replace(/\r/g, "\\r")
      .replace(/\n/g, "\\n");
  }
  /**
   * Page rectangle in PDF points.
   * @returns {{x0:number, y0:number, x1:number, y1:number}}
   */
  get rect() {
    const result = this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
r = page.rect
[r.x0, r.y0, r.x1, r.y1]
`);
    return { x0: result[0], y0: result[1], x1: result[2], y1: result[3] };
  }
  /** Page width in points. @returns {number} */
  get width() {
    return this.runPython(`${this.docVar}[${this.pageNumber}].rect.width`);
  }
  /** Page height in points. @returns {number} */
  get height() {
    return this.runPython(`${this.docVar}[${this.pageNumber}].rect.height`);
  }
  /** Page rotation in degrees. @returns {number} */
  get rotation() {
    return this.runPython(`${this.docVar}[${this.pageNumber}].rotation`);
  }
  /**
   * Set the page rotation.
   * @param {number} angle - rotation in degrees (PyMuPDF expects multiples of 90)
   */
  setRotation(angle) {
    this.runPython(`${this.docVar}[${this.pageNumber}].set_rotation(${angle})`);
  }
  /**
   * Extract page text.
   * @param {string} [format="text"] - PyMuPDF extraction format ("text", "dict", "json", ...)
   * @returns {any} a plain string for "text"; otherwise the JSON-parsed structured result
   */
  getText(format = "text") {
    if (format === "text") {
      return this.runPython(`${this.docVar}[${this.pageNumber}].get_text()`);
    }
    const result = this.runPython(`
import json
page = ${this.docVar}[${this.pageNumber}]
json.dumps(page.get_text("${this.escapePy(format)}"))
`);
    return JSON.parse(result);
  }
  /**
   * Search the page for a text needle.
   * @param {string} text - needle (escaped before embedding in Python)
   * @param {boolean} [quads=false] - forwarded to PyMuPDF's `quads` flag
   * @returns {{x0:number, y0:number, x1:number, y1:number}[]} hit rectangles
   */
  searchFor(text, quads = false) {
    const result = this.runPython(`
import json
page = ${this.docVar}[${this.pageNumber}]
rects = page.search_for("${this.escapePy(text)}", quads=${quads ? "True" : "False"})
json.dumps([[r.x0, r.y0, r.x1, r.y1] for r in rects])
`);
    return JSON.parse(result).map((r) => ({
      x0: r[0],
      y0: r[1],
      x1: r[2],
      y1: r[3]
    }));
  }
  /**
   * Insert text at a point.
   * @param {{x:number, y:number}} point - insertion point (text baseline start)
   * @param {string} text - text to insert (escaped before embedding)
   * @param {{fontsize?:number, fontname?:string, color?:{r:number,g:number,b:number}, rotate?:number}} [options]
   */
  insertText(point, text, options) {
    const fontsize = options?.fontsize ?? 11;
    const fontname = options?.fontname ?? "helv";
    const color = options?.color ? `(${options.color.r}, ${options.color.g}, ${options.color.b})` : "(0, 0, 0)";
    const rotate = options?.rotate ?? 0;
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
page.insert_text(
    (${point.x}, ${point.y}),
    "${this.escapePy(text)}",
    fontsize=${fontsize},
    fontname="${fontname}",
    color=${color},
    rotate=${rotate}
)
`);
  }
  /**
   * List the images referenced by this page.
   * @returns {{xref:number, width:number, height:number, bpc:number, colorspace:string, size:number, name:string}[]}
   */
  getImages() {
    const result = this.runPython(`
import json
page = ${this.docVar}[${this.pageNumber}]
images = page.get_images()
json.dumps([{
    'xref': img[0],
    'width': img[2],
    'height': img[3],
    'bpc': img[4],
    'colorspace': img[5],
    'size': img[6] if len(img) > 6 else 0,
    'name': img[7] if len(img) > 7 else ''
} for img in images])
`);
    return JSON.parse(result);
  }
  /**
   * Extract a single embedded image by xref.
   * @param {number} xref - PDF cross-reference number of the image object
   * @returns {null | {xref:number, width:number, height:number, bpc:number, colorspace:string, size:number, ext:string, data:Uint8Array}}
   */
  extractImage(xref) {
    const result = this.runPython(`
import json
import base64
img = ${this.docVar}.extract_image(${xref})
_result = 'null'
if img:
    _result = json.dumps({
        'xref': ${xref},
        'width': img['width'],
        'height': img['height'],
        'bpc': img.get('bpc', 8),
        'colorspace': img.get('colorspace', 'rgb'),
        'size': len(img['image']),
        'ext': img['ext'],
        'data': base64.b64encode(img['image']).decode('ascii')
    })
_result
`);
    if (result === "null") return null;
    const parsed = JSON.parse(result);
    // Decode the base64 payload back into raw bytes for the caller.
    const binary = atob(parsed.data);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return { ...parsed, data: bytes };
  }
  /**
   * Insert an image into a rectangle on the page. The bytes are shuttled to
   * Python via base64 and written to a scratch file in the (virtual) FS
   * because `insert_image` takes a filename.
   * @param {{x0:number, y0:number, x1:number, y1:number}} rect - target rectangle
   * @param {Uint8Array} imageData - encoded image bytes (PNG/JPEG/...)
   * @param {{overlay?:boolean, keepProportion?:boolean, oc?:number}} [options] - `oc` is an OCG xref
   * @returns {any} whatever PyMuPDF's `insert_image` returns
   */
  insertImage(rect, imageData, options) {
    const overlay = options?.overlay ?? true;
    const keepProportion = options?.keepProportion ?? true;
    const oc = options?.oc;
    const base64Image = uint8ArrayToBase64(imageData);
    const ocParam = oc !== void 0 ? `, oc=${oc}` : "";
    return this.runPython(`
import base64
img_data = base64.b64decode("${base64Image}")
with open("/tmp_insert_img", "wb") as f:
    f.write(img_data)
page = ${this.docVar}[${this.pageNumber}]
page.insert_image(
    pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}),
    filename="/tmp_insert_img",
    overlay=${overlay ? "True" : "False"},
    keep_proportion=${keepProportion ? "True" : "False"}${ocParam}
)
`);
  }
  /**
   * List the page's annotations.
   * @returns {{type:string, rect:object, content:string, author:string, color:object|null}[]}
   */
  getAnnotations() {
    const result = this.runPython(`
import json
page = ${this.docVar}[${this.pageNumber}]
annots = []
for annot in page.annots():
    r = annot.rect
    c = annot.colors.get('stroke', (0, 0, 0)) or (0, 0, 0)
    annots.append({
        'type': annot.type[1],
        'rect': {'x0': r.x0, 'y0': r.y0, 'x1': r.x1, 'y1': r.y1},
        'content': annot.info.get('content', ''),
        'author': annot.info.get('title', ''),
        'color': {'r': c[0], 'g': c[1], 'b': c[2]} if c else None
    })
json.dumps(annots)
`);
    return JSON.parse(result);
  }
  /**
   * Add a highlight annotation over a rectangle.
   * @param {{x0:number, y0:number, x1:number, y1:number}} rect
   * @param {{r:number, g:number, b:number}} [color] - stroke color; defaults to yellow (1, 1, 0)
   */
  addHighlight(rect, color) {
    const colorStr = color ? `(${color.r}, ${color.g}, ${color.b})` : "(1, 1, 0)";
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
annot = page.add_highlight_annot(pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}))
annot.set_colors(stroke=${colorStr})
annot.update()
`);
  }
  /**
   * Add a sticky-note text annotation.
   * @param {{x:number, y:number}} point - anchor point
   * @param {string} text - note content (escaped before embedding)
   * @param {string} [icon="Note"] - PyMuPDF icon name
   */
  addTextAnnotation(point, text, icon) {
    const iconStr = icon ?? "Note";
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
annot = page.add_text_annot((${point.x}, ${point.y}), "${this.escapePy(text)}", icon="${iconStr}")
annot.update()
`);
  }
  /**
   * Add a rectangle annotation.
   * @param {{x0:number, y0:number, x1:number, y1:number}} rect
   * @param {{r:number, g:number, b:number}} [color] - stroke; defaults to red (1, 0, 0)
   * @param {{r:number, g:number, b:number}} [fill] - fill; defaults to none
   */
  addRectAnnotation(rect, color, fill) {
    const strokeColor = color ? `(${color.r}, ${color.g}, ${color.b})` : "(1, 0, 0)";
    const fillColor = fill ? `(${fill.r}, ${fill.g}, ${fill.b})` : "None";
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
annot = page.add_rect_annot(pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}))
annot.set_colors(stroke=${strokeColor}, fill=${fillColor})
annot.update()
`);
  }
  /** Delete every annotation on the page. */
  deleteAnnotations() {
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
for annot in list(page.annots()):
    page.delete_annot(annot)
`);
  }
  /**
   * List the page's links.
   * @returns {{rect:object, uri:string|undefined, page:number|undefined, dest:object|null}[]}
   */
  getLinks() {
    const result = this.runPython(`
import json
page = ${this.docVar}[${this.pageNumber}]
links = page.get_links()
json.dumps([{
    'rect': {'x0': l['from'].x0, 'y0': l['from'].y0, 'x1': l['from'].x1, 'y1': l['from'].y1},
    'uri': l.get('uri'),
    'page': l.get('page'),
    'dest': {'x': l['to'].x, 'y': l['to'].y} if l.get('to') else None
} for l in links])
`);
    return JSON.parse(result);
  }
  /**
   * Insert a URI link covering a rectangle.
   * @param {{x0:number, y0:number, x1:number, y1:number}} rect - clickable area
   * @param {string} uri - target URI (escaped before embedding)
   */
  insertLink(rect, uri) {
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
page.insert_link({
    'kind': pymupdf.LINK_URI,
    'from': pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}),
    'uri': "${this.escapePy(uri)}"
})
`);
  }
  /**
   * Render the page to a PNG. Declared async for API stability, although the
   * underlying work is synchronous (no awaits occur).
   * @param {{dpi?:number, alpha?:boolean, rotation?:number, clip?:{x0:number,y0:number,x1:number,y1:number}}} [options]
   * @returns {Promise<Uint8Array>} PNG bytes
   */
  async toImage(options) {
    const dpi = options?.dpi ?? 150;
    const zoom = dpi / 72; // PDF user space is 72 dpi, so zoom is the scale factor
    const alpha = options?.alpha ?? false;
    const rotation = options?.rotation ?? 0;
    let clipStr = "None";
    if (options?.clip) {
      const c = options.clip;
      clipStr = `pymupdf.Rect(${c.x0}, ${c.y0}, ${c.x1}, ${c.y1})`;
    }
    const result = this.runPython(`
import base64
page = ${this.docVar}[${this.pageNumber}]
mat = pymupdf.Matrix(${zoom}, ${zoom}).prerotate(${rotation})
pix = page.get_pixmap(matrix=mat, alpha=${alpha ? "True" : "False"}, clip=${clipStr})
base64.b64encode(pix.tobytes("png")).decode('ascii')
`);
    const binary = atob(result);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return bytes;
  }
  /** Render the page as an SVG document string. @returns {string} */
  toSvg() {
    return this.runPython(`${this.docVar}[${this.pageNumber}].get_svg_image()`);
  }
  /**
   * Add a redaction annotation (takes effect after `applyRedactions`).
   * @param {{x0:number, y0:number, x1:number, y1:number}} rect - area to redact
   * @param {string} [text=""] - replacement text drawn over the redacted area
   * @param {{r:number, g:number, b:number}} [fill] - fill color; defaults to black
   */
  addRedaction(rect, text, fill) {
    const fillColor = fill ? `(${fill.r}, ${fill.g}, ${fill.b})` : "(0, 0, 0)";
    const replaceText = text ?? "";
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
page.add_redact_annot(
    pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}),
    text="${this.escapePy(replaceText)}",
    fill=${fillColor}
)
`);
  }
  /** Apply all pending redaction annotations on this page. */
  applyRedactions() {
    this.runPython(`${this.docVar}[${this.pageNumber}].apply_redactions()`);
  }
  /**
   * Draw a straight line.
   * @param {{x:number, y:number}} from - start point
   * @param {{x:number, y:number}} to - end point
   * @param {{r:number, g:number, b:number}} [color] - defaults to black
   * @param {number} [width=1] - stroke width
   */
  drawLine(from, to, color, width) {
    const colorStr = color ? `(${color.r}, ${color.g}, ${color.b})` : "(0, 0, 0)";
    const lineWidth = width ?? 1;
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
shape = page.new_shape()
shape.draw_line((${from.x}, ${from.y}), (${to.x}, ${to.y}))
shape.finish(color=${colorStr}, width=${lineWidth})
shape.commit()
`);
  }
  /**
   * Draw a rectangle.
   * @param {{x0:number, y0:number, x1:number, y1:number}} rect
   * @param {{r:number, g:number, b:number}} [color] - stroke; defaults to black
   * @param {{r:number, g:number, b:number}} [fill] - fill; defaults to none
   * @param {number} [width=1] - stroke width
   */
  drawRect(rect, color, fill, width) {
    const strokeColor = color ? `(${color.r}, ${color.g}, ${color.b})` : "(0, 0, 0)";
    const fillColor = fill ? `(${fill.r}, ${fill.g}, ${fill.b})` : "None";
    const lineWidth = width ?? 1;
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
shape = page.new_shape()
shape.draw_rect(pymupdf.Rect(${rect.x0}, ${rect.y0}, ${rect.x1}, ${rect.y1}))
shape.finish(color=${strokeColor}, fill=${fillColor}, width=${lineWidth})
shape.commit()
`);
  }
  /**
   * Draw a circle.
   * @param {{x:number, y:number}} center
   * @param {number} radius
   * @param {{r:number, g:number, b:number}} [color] - stroke; defaults to black
   * @param {{r:number, g:number, b:number}} [fill] - fill; defaults to none
   */
  drawCircle(center, radius, color, fill) {
    const strokeColor = color ? `(${color.r}, ${color.g}, ${color.b})` : "(0, 0, 0)";
    const fillColor = fill ? `(${fill.r}, ${fill.g}, ${fill.b})` : "None";
    this.runPython(`
page = ${this.docVar}[${this.pageNumber}]
shape = page.new_shape()
shape.draw_circle((${center.x}, ${center.y}), ${radius})
shape.finish(color=${strokeColor}, fill=${fillColor})
shape.commit()
`);
  }
  /**
   * Detect tables on the page via PyMuPDF's `find_tables`.
   * @param {{clip?:object, strategy?:string, verticalStrategy?:string, horizontalStrategy?:string, addLines?:number[][]}} [options]
   * @returns {{bbox:object, rowCount:number, colCount:number, header:object|null, rows:any[], markdown:string}[]}
   */
  findTables(options) {
    // Assemble keyword arguments; each fragment ends in ", " so they concatenate safely.
    let optionsStr = "";
    if (options?.clip) {
      const c = options.clip;
      optionsStr += `clip=pymupdf.Rect(${c.x0}, ${c.y0}, ${c.x1}, ${c.y1}), `;
    }
    if (options?.strategy) {
      optionsStr += `strategy="${options.strategy}", `;
    }
    if (options?.verticalStrategy) {
      optionsStr += `vertical_strategy="${options.verticalStrategy}", `;
    }
    if (options?.horizontalStrategy) {
      optionsStr += `horizontal_strategy="${options.horizontalStrategy}", `;
    }
    if (options?.addLines && options.addLines.length > 0) {
      const linesStr = options.addLines.map((l) => `(${l.join(",")})`).join(",");
      optionsStr += `add_lines=[${linesStr}], `;
    }
    const result = this.runPython(`
import json

page = ${this.docVar}[${this.pageNumber}]
tables = page.find_tables(${optionsStr})

result = []
for table in tables.tables:
    bbox = table.bbox
    header = table.header
    header_data = None
    if header:
        header_bbox = header.bbox
        header_data = {
            'names': list(header.names),
            'cells': [
                {'x0': c[0], 'y0': c[1], 'x1': c[2], 'y1': c[3]} if c else None
                for c in header.cells
            ],
            'bbox': {'x0': header_bbox[0], 'y0': header_bbox[1], 'x1': header_bbox[2], 'y1': header_bbox[3]} if header_bbox else None,
            'external': header.external
        }

    rows = table.extract()
    markdown = table.to_markdown()

    result.append({
        'bbox': {'x0': bbox[0], 'y0': bbox[1], 'x1': bbox[2], 'y1': bbox[3]},
        'rowCount': table.row_count,
        'colCount': table.col_count,
        'header': header_data,
        'rows': rows,
        'markdown': markdown
    })

json.dumps(result)
`);
    return JSON.parse(result);
  }
  /**
   * Convenience: detect tables and return only their markdown renderings.
   * @param {object} [options] - forwarded to {@link findTables}
   * @returns {string[]}
   */
  tablesToMarkdown(options) {
    const tables = this.findTables(options);
    return tables.map((t) => t.markdown);
  }
};
360
+
361
+ // src/document.ts
362
+ var PyMuPDFDocument = class {
363
+ constructor(pyodide, docVar, inputPath) {
364
+ this.closed = false;
365
+ this.pyodide = pyodide;
366
+ this.docVar = docVar;
367
+ this.inputPath = inputPath;
368
+ }
369
+ runPython(code) {
370
+ return this.pyodide.runPython(code);
371
+ }
372
+ ensureOpen() {
373
+ if (this.closed) {
374
+ throw new Error("Document has been closed");
375
+ }
376
+ }
377
+ get pageCount() {
378
+ this.ensureOpen();
379
+ return this.runPython(`${this.docVar}.page_count`);
380
+ }
381
+ get isPdf() {
382
+ this.ensureOpen();
383
+ return this.runPython(`${this.docVar}.is_pdf`);
384
+ }
385
+ get isEncrypted() {
386
+ this.ensureOpen();
387
+ return this.runPython(`${this.docVar}.is_encrypted`);
388
+ }
389
+ get needsPass() {
390
+ this.ensureOpen();
391
+ return this.runPython(`${this.docVar}.needs_pass`);
392
+ }
393
+ get metadata() {
394
+ this.ensureOpen();
395
+ const result = this.runPython(`
396
+ import json
397
+ m = ${this.docVar}.metadata
398
+ json.dumps(m if m else {})
399
+ `);
400
+ return JSON.parse(result);
401
+ }
402
+ setMetadata(metadata) {
403
+ this.ensureOpen();
404
+ const metaJson = JSON.stringify(metadata);
405
+ this.runPython(`${this.docVar}.set_metadata(${metaJson})`);
406
+ }
407
+ getPage(index) {
408
+ this.ensureOpen();
409
+ if (index < 0 || index >= this.pageCount) {
410
+ throw new Error(`Page index ${index} out of range (0-${this.pageCount - 1})`);
411
+ }
412
+ return new PyMuPDFPage(
413
+ (code) => this.runPython(code),
414
+ this.docVar,
415
+ index
416
+ );
417
+ }
418
+ *pages() {
419
+ this.ensureOpen();
420
+ const count = this.pageCount;
421
+ for (let i = 0; i < count; i++) {
422
+ yield this.getPage(i);
423
+ }
424
+ }
425
+ deletePage(index) {
426
+ this.ensureOpen();
427
+ this.runPython(`${this.docVar}.delete_page(${index})`);
428
+ }
429
+ deletePages(indices) {
430
+ this.ensureOpen();
431
+ const sorted = [...indices].sort((a, b) => b - a);
432
+ for (const i of sorted) {
433
+ this.runPython(`${this.docVar}.delete_page(${i})`);
434
+ }
435
+ }
436
+ insertBlankPage(index, width, height) {
437
+ this.ensureOpen();
438
+ const w = width ?? 595;
439
+ const h = height ?? 842;
440
+ this.runPython(`${this.docVar}.insert_page(${index}, width=${w}, height=${h})`);
441
+ return this.getPage(index);
442
+ }
443
+ movePage(from, to) {
444
+ this.ensureOpen();
445
+ this.runPython(`${this.docVar}.move_page(${from}, ${to})`);
446
+ }
447
+ copyPage(from, to) {
448
+ this.ensureOpen();
449
+ this.runPython(`${this.docVar}.copy_page(${from}, ${to})`);
450
+ }
451
+ selectPages(indices) {
452
+ this.ensureOpen();
453
+ this.runPython(`${this.docVar}.select([${indices.join(", ")}])`);
454
+ }
455
+ insertPdf(sourceDoc, options) {
456
+ this.ensureOpen();
457
+ const fromPage = options?.fromPage ?? 0;
458
+ const toPage = options?.toPage ?? -1;
459
+ const startAt = options?.startAt ?? -1;
460
+ const rotate = options?.rotate ?? 0;
461
+ this.runPython(`
462
+ ${this.docVar}.insert_pdf(
463
+ ${sourceDoc.docVar},
464
+ from_page=${fromPage},
465
+ to_page=${toPage},
466
+ start_at=${startAt},
467
+ rotate=${rotate}
468
+ )
469
+ `);
470
+ }
471
+ convertToPdf() {
472
+ this.ensureOpen();
473
+ const result = this.runPython(`
474
+ import base64
475
+ pdf_bytes = ${this.docVar}.convert_to_pdf()
476
+ base64.b64encode(pdf_bytes).decode('ascii')
477
+ `);
478
+ const binary = atob(result);
479
+ const bytes = new Uint8Array(binary.length);
480
+ for (let i = 0; i < binary.length; i++) {
481
+ bytes[i] = binary.charCodeAt(i);
482
+ }
483
+ return bytes;
484
+ }
485
+ searchText(query) {
486
+ this.ensureOpen();
487
+ const results = [];
488
+ for (let i = 0; i < this.pageCount; i++) {
489
+ const page = this.getPage(i);
490
+ const rects = page.searchFor(query);
491
+ for (const rect of rects) {
492
+ results.push({ page: i, rect, text: query });
493
+ }
494
+ }
495
+ return results;
496
+ }
497
+ getToc() {
498
+ this.ensureOpen();
499
+ const result = this.runPython(`
500
+ import json
501
+ toc = ${this.docVar}.get_toc()
502
+ json.dumps([{
503
+ 'level': entry[0],
504
+ 'title': entry[1],
505
+ 'page': entry[2],
506
+ 'dest': {'x': entry[3].x, 'y': entry[3].y} if len(entry) > 3 and entry[3] else None
507
+ } for entry in toc])
508
+ `);
509
+ return JSON.parse(result);
510
+ }
511
+ setToc(toc) {
512
+ this.ensureOpen();
513
+ const tocData = toc.map((e) => [e.level, e.title, e.page]);
514
+ this.runPython(`${this.docVar}.set_toc(${JSON.stringify(tocData)})`);
515
+ }
516
+ get isFormPdf() {
517
+ this.ensureOpen();
518
+ return this.runPython(`${this.docVar}.is_form_pdf`);
519
+ }
520
+ getFormFields() {
521
+ this.ensureOpen();
522
+ const result = this.runPython(`
523
+ import json
524
+ fields = []
525
+ for page in ${this.docVar}:
526
+ for widget in page.widgets():
527
+ r = widget.rect
528
+ fields.append({
529
+ 'name': widget.field_name,
530
+ 'type': widget.field_type_string.lower(),
531
+ 'value': widget.field_value,
532
+ 'rect': {'x0': r.x0, 'y0': r.y0, 'x1': r.x1, 'y1': r.y1},
533
+ 'readonly': widget.field_flags & 1 != 0
534
+ })
535
+ json.dumps(fields)
536
+ `);
537
+ return JSON.parse(result);
538
+ }
539
+ setFormField(name, value) {
540
+ this.ensureOpen();
541
+ const valueStr = typeof value === "boolean" ? value ? "True" : "False" : `"${String(value).replace(/"/g, '\\"')}"`;
542
+ this.runPython(`
543
+ for page in ${this.docVar}:
544
+ for widget in page.widgets():
545
+ if widget.field_name == "${name}":
546
+ widget.field_value = ${valueStr}
547
+ widget.update()
548
+ break
549
+ `);
550
+ }
551
+ authenticate(password) {
552
+ this.ensureOpen();
553
+ return this.runPython(`${this.docVar}.authenticate("${password}")`);
554
+ }
555
+ save(options) {
556
+ this.ensureOpen();
557
+ let encryptParams = "";
558
+ if (options?.encryption) {
559
+ const enc = options.encryption;
560
+ const perms = enc.permissions ?? {};
561
+ const permValue = (perms.print !== false ? 4 : 0) | (perms.modify !== false ? 8 : 0) | (perms.copy !== false ? 16 : 0) | (perms.annotate !== false ? 32 : 0);
562
+ encryptParams = `, encryption=pymupdf.PDF_ENCRYPT_AES_256, owner_pw="${enc.ownerPassword}", user_pw="${enc.userPassword ?? ""}", permissions=${permValue}`;
563
+ }
564
+ const garbage = options?.garbage ?? 1;
565
+ const deflate = options?.deflate !== false;
566
+ const clean = options?.clean !== false;
567
+ const result = this.runPython(`
568
+ import base64
569
+ output = ${this.docVar}.tobytes(garbage=${garbage}, deflate=${deflate ? "True" : "False"}, clean=${clean ? "True" : "False"}${encryptParams})
570
+ base64.b64encode(output).decode('ascii')
571
+ `);
572
+ const binary = atob(result);
573
+ const bytes = new Uint8Array(binary.length);
574
+ for (let i = 0; i < binary.length; i++) {
575
+ bytes[i] = binary.charCodeAt(i);
576
+ }
577
+ return bytes;
578
+ }
579
+ saveAsBlob(options) {
580
+ const bytes = this.save(options);
581
+ return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
582
+ }
583
+ getLayerConfig() {
584
+ this.ensureOpen();
585
+ const result = this.runPython(`
586
+ import json
587
+ import re
588
+
589
+ # Get basic layer info from layer_ui_configs
590
+ layers = ${this.docVar}.layer_ui_configs()
591
+
592
+ # Build a map of layer number to layer info
593
+ layer_map = {}
594
+ xref_to_num = {}
595
+
596
+ for layer in layers:
597
+ num = layer.get('number', 0)
598
+ layer_map[num] = {
599
+ 'number': num,
600
+ 'text': layer.get('text', ''),
601
+ 'on': layer.get('on', False),
602
+ 'locked': layer.get('locked', False),
603
+ 'depth': 0,
604
+ 'xref': 0,
605
+ 'parentXref': 0,
606
+ 'displayOrder': 0
607
+ }
608
+
609
+ # Try to parse the Order array to get hierarchy and xrefs
610
+ try:
611
+ catalog_xref = ${this.docVar}.pdf_catalog()
612
+
613
+ # Get OCProperties
614
+ t, ocprop_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties")
615
+
616
+ ocgs_str = None
617
+ order_str = None
618
+
619
+ if t == "dict":
620
+ t_ocg, ocgs_str = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/OCGs")
621
+ t2, order_str = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/D/Order")
622
+ elif t != "null":
623
+ ocprop_match = re.search(r'(\\d+)\\s+\\d+\\s+R', ocprop_val)
624
+ if ocprop_match:
625
+ ocprop_xref = int(ocprop_match.group(1))
626
+ t_ocg, ocgs_str = ${this.docVar}.xref_get_key(ocprop_xref, "OCGs")
627
+ t2, d_val = ${this.docVar}.xref_get_key(ocprop_xref, "D")
628
+ if t2 == "dict":
629
+ t2, order_str = ${this.docVar}.xref_get_key(ocprop_xref, "D/Order")
630
+ elif t2 != "null":
631
+ d_match = re.search(r'(\\d+)\\s+\\d+\\s+R', d_val)
632
+ if d_match:
633
+ d_xref = int(d_match.group(1))
634
+ t2, order_str = ${this.docVar}.xref_get_key(d_xref, "Order")
635
+
636
+ # Parse OCGs array and build xref -> number mapping by matching OCG names to layer text
637
+ if ocgs_str:
638
+ xref_matches = re.findall(r'(\\d+)\\s+0\\s+R', ocgs_str)
639
+ ocg_xrefs = [int(x) for x in xref_matches]
640
+
641
+ # Build a name-to-layer-number map from layer_ui_configs
642
+ name_to_num = {}
643
+ for num, info in layer_map.items():
644
+ name_to_num[info['text']] = num
645
+
646
+ # For each OCG xref, look up its Name and match to layer
647
+ for xref in ocg_xrefs:
648
+ # Get the OCG's Name from its dictionary
649
+ t_name, name_val = ${this.docVar}.xref_get_key(xref, "Name")
650
+ if t_name != "null" and name_val:
651
+ # Remove parentheses from PDF string: "(Layer Name)" -> "Layer Name"
652
+ ocg_name = name_val.strip()
653
+ if ocg_name.startswith('(') and ocg_name.endswith(')'):
654
+ ocg_name = ocg_name[1:-1]
655
+
656
+ # Find the layer with this name
657
+ if ocg_name in name_to_num:
658
+ num = name_to_num[ocg_name]
659
+ layer_map[num]['xref'] = xref
660
+ xref_to_num[xref] = num
661
+
662
+ # Parse Order array with state machine to get proper hierarchy
663
+ # Format: ParentRef [Child1 Child2] or [OCG1 OCG2] or just OCG
664
+ if order_str:
665
+ display_order = [0] # Use list for mutable counter
666
+
667
+ # Strip outer brackets from Order array - it's always wrapped in []
668
+ inner_order = order_str.strip()
669
+ if inner_order.startswith('[') and inner_order.endswith(']'):
670
+ inner_order = inner_order[1:-1]
671
+
672
+ def parse_order_array(order_val, depth=0, parent_xref=0):
673
+ i = 0
674
+ last_xref = 0 # Track last OCG xref at current level
675
+
676
+ while i < len(order_val):
677
+ char = order_val[i]
678
+
679
+ if char == '[':
680
+ # Start of nested array - children of last_xref
681
+ # Find matching closing bracket
682
+ bracket_depth = 1
683
+ start = i + 1
684
+ j = i + 1
685
+ while j < len(order_val) and bracket_depth > 0:
686
+ if order_val[j] == '[':
687
+ bracket_depth += 1
688
+ elif order_val[j] == ']':
689
+ bracket_depth -= 1
690
+ j += 1
691
+
692
+ nested_content = order_val[start:j-1]
693
+ # Recursively parse with last_xref as parent
694
+ parse_order_array(nested_content, depth + 1, last_xref)
695
+ i = j
696
+ elif char == ']':
697
+ i += 1
698
+ elif char.isdigit():
699
+ # Parse xref reference
700
+ ref_match = re.match(r'(\\d+)\\s+0\\s+R', order_val[i:])
701
+ if ref_match:
702
+ xref = int(ref_match.group(1))
703
+ if xref in xref_to_num:
704
+ num = xref_to_num[xref]
705
+ layer_map[num]['depth'] = depth
706
+ layer_map[num]['parentXref'] = parent_xref
707
+ layer_map[num]['displayOrder'] = display_order[0]
708
+ display_order[0] += 1
709
+ last_xref = xref
710
+ i += len(ref_match.group(0))
711
+ else:
712
+ i += 1
713
+ else:
714
+ i += 1
715
+
716
+ parse_order_array(inner_order)
717
+
718
+ except Exception as e:
719
+ # If parsing fails, continue with basic layer info
720
+ pass
721
+
722
+ # Convert to list and sort by displayOrder
723
+ result_list = sorted(layer_map.values(), key=lambda x: x.get('displayOrder', 0))
724
+ json.dumps(result_list)
725
+ `);
726
+ return JSON.parse(result);
727
+ }
728
+ addOCG(name, options) {
729
+ this.ensureOpen();
730
+ const config = options?.config ?? -1;
731
+ const on = options?.on !== false;
732
+ const intent = options?.intent ?? "View";
733
+ const usage = options?.usage ?? "Artwork";
734
+ return this.runPython(`
735
+ ${this.docVar}.add_ocg("${name.replace(/"/g, '\\"')}", config=${config}, on=${on ? "True" : "False"}, intent="${intent}", usage="${usage}")
736
+ `);
737
+ }
738
+ addOCGWithParent(name, parentXref, options) {
739
+ this.ensureOpen();
740
+ const config = options?.config ?? -1;
741
+ const on = options?.on !== false;
742
+ const intent = options?.intent ?? "View";
743
+ const usage = options?.usage ?? "Artwork";
744
+ return this.runPython(`
745
+ import re
746
+
747
+ # 1. Create the new OCG (automatically added to root of Order array)
748
+ child_xref = ${this.docVar}.add_ocg("${name.replace(/"/g, '\\"')}", config=${config}, on=${on ? "True" : "False"}, intent="${intent}", usage="${usage}")
749
+
750
+ catalog_xref = ${this.docVar}.pdf_catalog()
751
+
752
+ # 2. Locate OCProperties and Order array
753
+ t, ocprop_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties")
754
+
755
+ order_key_path = None
756
+ order_xref = None
757
+ order_str = None
758
+
759
+ if t == "dict":
760
+ # Inline OCProperties
761
+ t2, order_str = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/D/Order")
762
+ order_key_path = "OCProperties/D/Order"
763
+ order_xref = catalog_xref
764
+ elif t != "null":
765
+ # Reference to OCProperties
766
+ ocprop_match = re.search(r'(\\d+)\\s+\\d+\\s+R', ocprop_val)
767
+ if ocprop_match:
768
+ ocprop_xref = int(ocprop_match.group(1))
769
+ t2, d_val = ${this.docVar}.xref_get_key(ocprop_xref, "D")
770
+
771
+ if t2 == "dict":
772
+ # D is inline
773
+ t2, order_str = ${this.docVar}.xref_get_key(ocprop_xref, "D/Order")
774
+ order_key_path = "D/Order"
775
+ order_xref = ocprop_xref
776
+ elif t2 != "null":
777
+ # D is reference
778
+ d_match = re.search(r'(\\d+)\\s+\\d+\\s+R', d_val)
779
+ if d_match:
780
+ d_xref = int(d_match.group(1))
781
+ t2, order_str = ${this.docVar}.xref_get_key(d_xref, "Order")
782
+ order_key_path = "Order"
783
+ order_xref = d_xref
784
+
785
+ parent_ref = f"{${parentXref}} 0 R"
786
+ child_ref = f"{child_xref} 0 R"
787
+
788
+ def modify_pdf_order(order_string, p_ref, c_ref):
789
+ if not order_string:
790
+ return order_string
791
+
792
+ # --- STEP 1: Remove the Child from Root ---
793
+ # add_ocg usually appends to the end of the root array.
794
+ # We find the child ref that is strictly at depth 1 (root).
795
+
796
+ cleaned_order = ""
797
+ depth = 0
798
+ i = 0
799
+ removed = False
800
+
801
+ while i < len(order_string):
802
+ char = order_string[i]
803
+
804
+ if char == '[':
805
+ depth += 1
806
+ cleaned_order += char
807
+ i += 1
808
+ elif char == ']':
809
+ depth -= 1
810
+ cleaned_order += char
811
+ i += 1
812
+ else:
813
+ # Check if we are looking at the child ref
814
+ # We match strictly "xref 0 R"
815
+ match = None
816
+ if not removed and depth == 1: # Only remove from root
817
+ chunk = order_string[i:]
818
+ # Check if chunk starts with child_ref followed by non-digit
819
+ if chunk.startswith(c_ref):
820
+ # verify boundary (next char is space, ], or end)
821
+ if len(chunk) == len(c_ref) or chunk[len(c_ref)] in ' ]':
822
+ match = True
823
+
824
+ if match:
825
+ # Skip this ref
826
+ i += len(c_ref)
827
+ removed = True
828
+ # Skip following whitespace if any
829
+ while i < len(order_string) and order_string[i].isspace():
830
+ i += 1
831
+ else:
832
+ cleaned_order += char
833
+ i += 1
834
+
835
+ # --- STEP 2: Insert Child Under Parent ---
836
+ # Logic: Find Parent. Check next non-space char.
837
+ # If '[': Parent already has children. Insert inside that array.
838
+ # If not '[': Create new array [ Child ] after Parent.
839
+
840
+ final_order = cleaned_order
841
+
842
+ # Find parent index
843
+ p_idx = final_order.find(p_ref)
844
+
845
+ if p_idx != -1:
846
+ # Look ahead
847
+ scan_idx = p_idx + len(p_ref)
848
+ insertion_point = -1
849
+ is_existing_array = False
850
+
851
+ # Scan forward for next significant char
852
+ next_char_idx = -1
853
+ for k in range(scan_idx, len(final_order)):
854
+ if not final_order[k].isspace():
855
+ next_char_idx = k
856
+ break
857
+
858
+ if next_char_idx != -1 and final_order[next_char_idx] == '[':
859
+ # Parent has existing children array.
860
+ # We must find the closing bracket for THIS array.
861
+ is_existing_array = True
862
+ arr_depth = 1
863
+ for k in range(next_char_idx + 1, len(final_order)):
864
+ if final_order[k] == '[': arr_depth += 1
865
+ elif final_order[k] == ']': arr_depth -= 1
866
+
867
+ if arr_depth == 0:
868
+ # Found the closing bracket
869
+ insertion_point = k
870
+ break
871
+ else:
872
+ # No existing array, insert after parent
873
+ insertion_point = scan_idx
874
+
875
+ if insertion_point != -1:
876
+ if is_existing_array:
877
+ # Insert inside existing array (before the closing bracket)
878
+ prefix = final_order[:insertion_point]
879
+ suffix = final_order[insertion_point:]
880
+ final_order = prefix + " " + c_ref + suffix
881
+ else:
882
+ # Create new array after parent
883
+ prefix = final_order[:insertion_point]
884
+ suffix = final_order[insertion_point:]
885
+ final_order = prefix + " [" + c_ref + "]" + suffix
886
+
887
+ return final_order
888
+
889
+ if order_str and order_xref:
890
+ new_order = modify_pdf_order(order_str, parent_ref, child_ref)
891
+ ${this.docVar}.xref_set_key(order_xref, order_key_path, new_order)
892
+
893
+ child_xref
894
+ `);
895
+ }
896
  /**
   * Toggle the default visibility of an optional-content group (layer).
   *
   * Edits the /OCProperties/D configuration directly via xref_get_key /
   * xref_set_key, handling both inline dictionaries and indirect references
   * for OCProperties and D. The OCG is added to the ON (or OFF) array and
   * removed from the opposite one by raw string manipulation of the PDF
   * array syntax.
   *
   * NOTE(review): when the opposite array becomes "[]" it is deliberately
   * not written back — confirm readers treat a stale non-empty array the
   * same as an absent one.
   *
   * @param {number} ocgXref - xref of the OCG object.
   * @param {boolean} on - true to make the layer visible by default.
   */
  setLayerVisibility(ocgXref, on) {
    this.ensureOpen();
    this.runPython(`
import re

catalog_xref = ${this.docVar}.pdf_catalog()
t, ocprop_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties")

# Find the D (default config) and its xref/path
d_xref = None
d_path = None
is_inline_d = False

if t == "dict":
    # Inline OCProperties
    t2, d_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/D")
    if t2 == "dict":
        d_xref = catalog_xref
        d_path = "OCProperties/D"
        is_inline_d = True
    elif t2 != "null":
        m = re.search(r'(\\d+)\\s+\\d+\\s+R', d_val)
        if m:
            d_xref = int(m.group(1))
            d_path = ""
elif t != "null":
    m = re.search(r'(\\d+)\\s+\\d+\\s+R', ocprop_val)
    if m:
        ocprop_xref = int(m.group(1))
        t2, d_val = ${this.docVar}.xref_get_key(ocprop_xref, "D")
        if t2 == "dict":
            d_xref = ocprop_xref
            d_path = "D"
            is_inline_d = True
        elif t2 != "null":
            m2 = re.search(r'(\\d+)\\s+\\d+\\s+R', d_val)
            if m2:
                d_xref = int(m2.group(1))
                d_path = ""

if d_xref is None:
    raise ValueError("Could not find OCProperties/D config")

ocg_ref = f"${ocgXref} 0 R"

# Helper to add/remove xref from an array
def add_to_array(arr_str, xref_ref):
    if not arr_str or arr_str == "null":
        return "[" + xref_ref + "]"
    # Check if already in array
    if xref_ref in arr_str:
        return arr_str
    # Add before closing bracket
    return arr_str.rstrip(']') + " " + xref_ref + "]"

def remove_from_array(arr_str, xref_ref):
    if not arr_str or arr_str == "null":
        return arr_str
    # Remove the xref reference
    pattern = r'\\s*' + str(${ocgXref}) + r'\\s+0\\s+R'
    result = re.sub(pattern, '', arr_str)
    # Clean up any double spaces
    result = re.sub(r'\\s+', ' ', result)
    result = result.replace('[ ', '[').replace(' ]', ']')
    return result

# Get current ON and OFF arrays
on_key = d_path + "/ON" if d_path else "ON"
off_key = d_path + "/OFF" if d_path else "OFF"

t_on, on_arr = ${this.docVar}.xref_get_key(d_xref, on_key)
t_off, off_arr = ${this.docVar}.xref_get_key(d_xref, off_key)

if ${on ? "True" : "False"}:
    # Turn ON: add to ON array, remove from OFF array
    new_on = add_to_array(on_arr if t_on != "null" else "", ocg_ref)
    new_off = remove_from_array(off_arr if t_off != "null" else "", ocg_ref)
    ${this.docVar}.xref_set_key(d_xref, on_key, new_on)
    if new_off and new_off != "[]":
        ${this.docVar}.xref_set_key(d_xref, off_key, new_off)
else:
    # Turn OFF: add to OFF array, remove from ON array
    new_off = add_to_array(off_arr if t_off != "null" else "", ocg_ref)
    new_on = remove_from_array(on_arr if t_on != "null" else "", ocg_ref)
    ${this.docVar}.xref_set_key(d_xref, off_key, new_off)
    if new_on and new_on != "[]":
        ${this.docVar}.xref_set_key(d_xref, on_key, new_on)
`);
  }
985
+ setOC(xref, ocgXref) {
986
+ this.ensureOpen();
987
+ this.runPython(`${this.docVar}.set_oc(${xref}, ${ocgXref})`);
988
+ }
989
+ getOC(xref) {
990
+ this.ensureOpen();
991
+ return this.runPython(`${this.docVar}.get_oc(${xref})`);
992
+ }
993
  /**
   * Remove an optional-content group (layer) from the document's OCProperties.
   *
   * `layerNumber` is treated as an index into the /OCGs array (matching the
   * "number" field of layer_ui_configs); if it is out of range it is assumed
   * to already be a raw xref. The OCG reference is stripped from /OCGs and
   * from the default config's ON/OFF/Order arrays by string surgery.
   *
   * NOTE(review): the OCG object itself and any /OC references from page
   * content are left in place — only the catalog bookkeeping is updated.
   *
   * @param {number} layerNumber - Layer index (or raw xref, see above).
   */
  deleteOCG(layerNumber) {
    this.ensureOpen();
    this.runPython(`
import re

# First, get the actual OCG xref from the layer number
# layer_ui_configs returns items with "number" which is an index, not xref
# We need to find the actual OCG xref by looking at the OCProperties

catalog_xref = ${this.docVar}.pdf_catalog()

# Get OCProperties - it might be inline dict or a reference
t, ocprop_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties")

# Determine if OCProperties is inline (dict) or a reference
if t == "dict":
    # OCProperties is inline in catalog - we work directly with catalog_xref
    ocprop_xref = catalog_xref
    is_inline = True
else:
    # It's a reference like "X 0 R"
    ocprop_match = re.search(r'(\\d+)\\s+\\d+\\s+R', ocprop_val)
    if not ocprop_match:
        raise ValueError("Cannot find OCProperties")
    ocprop_xref = int(ocprop_match.group(1))
    is_inline = False

# Get the OCGs array to find the actual xref at this index
if is_inline:
    # For inline, we need to get it from the full catalog dict
    t, ocgs_str = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/OCGs")
else:
    t, ocgs_str = ${this.docVar}.xref_get_key(ocprop_xref, "OCGs")

if t == "null" or not ocgs_str:
    raise ValueError("No OCGs array found")

# Parse all xrefs from the array like "[5 0 R 6 0 R 7 0 R]"
xref_matches = re.findall(r'(\\d+)\\s+0\\s+R', ocgs_str)
ocg_xrefs = [int(x) for x in xref_matches]

# The layer number from layer_ui_configs corresponds to index in this array
if ${layerNumber} < 0 or ${layerNumber} >= len(ocg_xrefs):
    # layerNumber might actually BE the xref in some cases
    target_xref = ${layerNumber}
else:
    target_xref = ocg_xrefs[${layerNumber}]

# Helper to remove xref from array string
def remove_xref_from_array(arr_str, xref_to_remove):
    # Remove "X 0 R" pattern
    pattern = r'\\s*' + str(xref_to_remove) + r'\\s+0\\s+R'
    return re.sub(pattern, '', arr_str)

# Update the OCGs array
new_ocgs = remove_xref_from_array(ocgs_str, target_xref)
if is_inline:
    ${this.docVar}.xref_set_key(catalog_xref, "OCProperties/OCGs", new_ocgs)
else:
    ${this.docVar}.xref_set_key(ocprop_xref, "OCGs", new_ocgs)

# Get D (default config) and update its arrays
if is_inline:
    t, d_val = ${this.docVar}.xref_get_key(catalog_xref, "OCProperties/D")
else:
    t, d_val = ${this.docVar}.xref_get_key(ocprop_xref, "D")

if t == "dict":
    # D is inline
    d_xref = ocprop_xref if not is_inline else catalog_xref
    d_prefix = "OCProperties/D/" if is_inline else "D/"

    # Try to update ON, OFF, Order arrays
    for key in ["ON", "OFF", "Order"]:
        try:
            tk, val = ${this.docVar}.xref_get_key(d_xref, d_prefix.rstrip('/') + '/' + key if d_prefix else key)
            if tk != "null" and val:
                new_val = remove_xref_from_array(val, target_xref)
                ${this.docVar}.xref_set_key(d_xref, d_prefix.rstrip('/') + '/' + key if d_prefix else key, new_val)
        except:
            pass
elif t != "null":
    # D is a reference
    d_match = re.search(r'(\\d+)\\s+\\d+\\s+R', d_val)
    if d_match:
        d_xref = int(d_match.group(1))
        for key in ["ON", "OFF", "Order"]:
            try:
                tk, val = ${this.docVar}.xref_get_key(d_xref, key)
                if tk != "null" and val:
                    new_val = remove_xref_from_array(val, target_xref)
                    ${this.docVar}.xref_set_key(d_xref, key, new_val)
            except:
                pass
`);
  }
1089
+ close() {
1090
+ if (this.closed) return;
1091
+ try {
1092
+ this.runPython(`${this.docVar}.close()`);
1093
+ this.pyodide.FS.unlink(this.inputPath);
1094
+ } catch {
1095
+ }
1096
+ this.closed = true;
1097
+ }
1098
+ };
1099
+
1100
+ // src/pymupdf.ts
1101
+ import loadGhostscriptWASM from "@okathira/ghostpdl-wasm";
1102
// Convert every colorspace in a PDF to sRGB by round-tripping it through the
// Ghostscript-WASM pdfwrite device. Used as a pre-pass before pdf2docx, whose
// PNG-based image extraction only handles grayscale/RGB pixmaps.
async function convertPdfToRgb(pdfData) {
  console.log("[convertPdfToRgb] Starting Ghostscript RGB conversion...");
  console.log("[convertPdfToRgb] Input size:", pdfData.length);
  const gs = await loadGhostscriptWASM({
    locateFile: (path) => (path.endsWith(".wasm") ? "/ghostscript-wasm/gs.wasm" : path),
    print: (text) => console.log("[GS RGB]", text),
    printErr: (text) => console.error("[GS RGB Error]", text)
  });
  const inputPath = "/tmp/cmyk_input.pdf";
  const outputPath = "/tmp/rgb_output.pdf";
  // Best-effort removal of a file from the Ghostscript virtual FS.
  const discard = (path) => {
    try {
      gs.FS.unlink(path);
    } catch {
    }
  };
  gs.FS.writeFile(inputPath, pdfData);
  console.log("[convertPdfToRgb] Wrote input file");
  const args = [
    "-dBATCH",
    "-dNOPAUSE",
    "-dNOSAFER",
    "-dQUIET",
    "-sDEVICE=pdfwrite",
    "-sColorConversionStrategy=sRGB",
    "-sColorConversionStrategyForImages=sRGB",
    "-dConvertCMYKImagesToRGB=true",
    "-dProcessColorModel=/DeviceRGB",
    "-dAutoFilterColorImages=true",
    "-dAutoFilterGrayImages=true",
    "-dColorImageFilter=/DCTEncode",
    "-dGrayImageFilter=/DCTEncode",
    "-dCompatibilityLevel=1.4",
    `-sOutputFile=${outputPath}`,
    inputPath
  ];
  console.log("[convertPdfToRgb] Running Ghostscript with args:", args.join(" "));
  let exitCode;
  try {
    exitCode = gs.callMain(args);
  } catch (e) {
    console.error("[convertPdfToRgb] Ghostscript exception:", e);
    discard(inputPath);
    throw new Error(`Ghostscript threw exception: ${e}`);
  }
  console.log("[convertPdfToRgb] Ghostscript exit code:", exitCode);
  if (exitCode !== 0) {
    discard(inputPath);
    discard(outputPath);
    throw new Error(`Ghostscript RGB conversion failed with exit code ${exitCode}`);
  }
  let output;
  try {
    const stat = gs.FS.stat(outputPath);
    console.log("[convertPdfToRgb] Output file size:", stat.size);
    output = gs.FS.readFile(outputPath);
  } catch (e) {
    console.error("[convertPdfToRgb] Failed to read output:", e);
    discard(inputPath);
    throw new Error("Ghostscript did not produce output file");
  }
  discard(inputPath);
  discard(outputPath);
  // Copy out of the Emscripten heap so the bytes outlive the module instance.
  const copy = new Uint8Array(output.length);
  copy.set(output);
  console.log("[convertPdfToRgb] Conversion complete, output size:", copy.length);
  return copy;
}
1187
// Filenames of the runtime assets, resolved relative to the configured
// assetPath. `pyodide` is the interpreter loader script; `wheels` are the
// Python packages installed at startup (PyMuPDF itself plus the
// pdf2docx / pymupdf4llm dependency chain).
var ASSETS = {
  pyodide: "pyodide.js",
  wheels: [
    "pymupdf-1.26.3-cp313-none-pyodide_2025_0_wasm32.whl",
    "pymupdf4llm-0.0.27-py3-none-any.whl",
    "fonttools-4.56.0-py3-none-any.whl",
    "lxml-5.4.0-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "numpy-2.2.5-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "opencv_python-4.11.0.86-cp313-cp313-pyodide_2025_0_wasm32.whl",
    "pdf2docx-0.5.8-py3-none-any.whl",
    "python_docx-1.2.0-py3-none-any.whl",
    "typing_extensions-4.12.2-py3-none-any.whl"
  ]
};
1201
+ var PyMuPDF = class {
1202
+ constructor(options) {
1203
+ this.pyodidePromise = null;
1204
+ this.pyodide = null;
1205
+ this.docCounter = 0;
1206
+ if (typeof options === "string") {
1207
+ this.assetPath = options;
1208
+ } else {
1209
+ this.assetPath = options?.assetPath ?? "./";
1210
+ }
1211
+ if (!this.assetPath.endsWith("/")) {
1212
+ this.assetPath += "/";
1213
+ }
1214
+ }
1215
+ getAssetPath(name) {
1216
+ return this.assetPath + name;
1217
+ }
1218
+ async load() {
1219
+ await this.getPyodide();
1220
+ }
1221
+ async getPyodide() {
1222
+ if (this.pyodide) return this.pyodide;
1223
+ if (this.pyodidePromise) return this.pyodidePromise;
1224
+ this.pyodidePromise = this.initPyodide();
1225
+ this.pyodide = await this.pyodidePromise;
1226
+ return this.pyodide;
1227
+ }
1228
  /**
   * Create the Pyodide runtime: dynamically import pyodide.js from the asset
   * path, boot the interpreter, install all bundled wheels in parallel, and
   * import pymupdf once so later runPython calls can use it directly.
   *
   * @returns {Promise<object>} The initialized Pyodide instance.
   */
  async initPyodide() {
    const pyodideUrl = this.getAssetPath(ASSETS.pyodide);
    // Dynamic import of a runtime-computed URL; @vite-ignore stops bundlers
    // from trying to resolve it at build time.
    const pyodideModule = await import(
      /* @vite-ignore */
      pyodideUrl
    );
    const { loadPyodide } = pyodideModule;
    const pyodide = await loadPyodide({
      indexURL: this.assetPath
    });
    // Wheels are independent; install them concurrently.
    await Promise.all(
      ASSETS.wheels.map((wheel) => pyodide.loadPackage(this.getAssetPath(wheel)))
    );
    // Import once and shrink PyMuPDF's internal cache to keep WASM memory low.
    pyodide.runPython(`
import pymupdf
pymupdf.TOOLS.store_shrink(100)
`);
    return pyodide;
  }
1247
+ async open(input) {
1248
+ const pyodide = await this.getPyodide();
1249
+ const docId = ++this.docCounter;
1250
+ const docVar = `_doc${docId}`;
1251
+ const inputPath = `/input_${docId}`;
1252
+ const buf = await input.arrayBuffer();
1253
+ pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
1254
+ pyodide.runPython(`${docVar} = pymupdf.open("${inputPath}")`);
1255
+ return new PyMuPDFDocument(pyodide, docVar, inputPath);
1256
+ }
1257
+ async openUrl(url) {
1258
+ const response = await fetch(url);
1259
+ if (!response.ok) {
1260
+ throw new Error(`Failed to fetch ${url}: ${response.statusText}`);
1261
+ }
1262
+ const blob = await response.blob();
1263
+ return this.open(blob);
1264
+ }
1265
+ async create() {
1266
+ const pyodide = await this.getPyodide();
1267
+ const docId = ++this.docCounter;
1268
+ const docVar = `_doc${docId}`;
1269
+ const inputPath = `/input_${docId}`;
1270
+ pyodide.runPython(`${docVar} = pymupdf.open()`);
1271
+ return new PyMuPDFDocument(pyodide, docVar, inputPath);
1272
+ }
1273
  /**
   * Convert a PDF to a .docx Blob with pdf2docx.
   *
   * The input is first pushed through Ghostscript to normalize colorspaces to
   * sRGB (pdf2docx's PNG extraction cannot handle CMYK); if that fails, the
   * original bytes are used. pdf2docx's ImagesExtractor._to_raw_dict is
   * monkey-patched for the duration of the conversion to convert any
   * remaining non-RGB pixmaps, then restored.
   *
   * NOTE(review): uses fixed paths /input.pdf and /output.docx rather than
   * the per-document counter used elsewhere — concurrent conversions on one
   * instance would collide; confirm callers serialize this.
   *
   * @param {Blob|File} pdf - Source PDF.
   * @param {number[]} [pages] - Optional 0-based page list; all pages if omitted.
   * @returns {Promise<Blob>} The generated Word document.
   */
  async pdfToDocx(pdf, pages) {
    const pyodide = await this.getPyodide();
    const buf = await pdf.arrayBuffer();
    let pdfData = new Uint8Array(buf);
    console.log("[pdfToDocx] Converting PDF to RGB colorspace with Ghostscript...");
    try {
      const rgbData = await convertPdfToRgb(pdfData);
      pdfData = rgbData;
      console.log("[pdfToDocx] RGB conversion complete");
    } catch (e) {
      // Best-effort: fall back to the original bytes rather than failing.
      console.warn("[pdfToDocx] Ghostscript RGB conversion failed, trying original:", e);
    }
    pyodide.FS.writeFile("/input.pdf", pdfData);
    const pagesArg = pages ? `[${pages.join(", ")}]` : "None";
    pyodide.runPython(`
import pymupdf
from pdf2docx import Converter
from pdf2docx.image.ImagesExtractor import ImagesExtractor

# Store original _to_raw_dict static method
_orig_to_raw_dict = ImagesExtractor._to_raw_dict

def _patched_to_raw_dict(image, bbox):
    """Convert non-RGB pixmaps to RGB before processing.

    This is a staticmethod that takes (image, bbox).
    PNG format only supports grayscale and RGB, so we need to convert
    CMYK and other colorspaces to RGB.
    """
    pix = image

    # Check if pixmap needs conversion to RGB
    # PNG only supports: Grayscale (n=1), Grayscale+Alpha (n=2), RGB (n=3), RGBA (n=4)
    needs_conversion = False

    if hasattr(pix, 'colorspace') and pix.colorspace:
        cs_name = pix.colorspace.name.upper() if pix.colorspace.name else ''
        # Convert if not grayscale or RGB
        if 'CMYK' in cs_name or 'DEVICECMYK' in cs_name:
            needs_conversion = True
        elif cs_name not in ('DEVICEGRAY', 'GRAY', 'DEVICERGB', 'RGB', 'SRGB', ''):
            # Unknown colorspace - try to convert to RGB
            needs_conversion = True

    # Also check by component count: CMYK has n=4 without alpha
    if not needs_conversion and hasattr(pix, 'n') and hasattr(pix, 'alpha'):
        if pix.n == 4 and not pix.alpha:
            # Likely CMYK (4 components, no alpha)
            needs_conversion = True
        elif pix.n > 4:
            # More than 4 components - definitely needs conversion
            needs_conversion = True

    if needs_conversion:
        try:
            # Convert to RGB
            pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
        except Exception as e:
            # If direct conversion fails, try via samples
            try:
                # Create a new RGB pixmap with same dimensions
                new_pix = pymupdf.Pixmap(pymupdf.csRGB, pix.irect)
                new_pix.set_rect(pix.irect, (255, 255, 255)) # White background
                # Insert the original (this handles conversion)
                new_pix.copy(pix, pix.irect)
                pix = new_pix
            except:
                # Last resort: just pass through and hope for the best
                pass

    # Call original static method with converted pixmap and bbox
    return _orig_to_raw_dict(pix, bbox)

# Apply patch as staticmethod
ImagesExtractor._to_raw_dict = staticmethod(_patched_to_raw_dict)

cv = Converter("/input.pdf")
cv.convert("/output.docx", pages=${pagesArg})
cv.close()

# Restore original
ImagesExtractor._to_raw_dict = _orig_to_raw_dict
`);
    const outputBuf = pyodide.FS.readFile("/output.docx");
    try {
      pyodide.FS.unlink("/input.pdf");
      pyodide.FS.unlink("/output.docx");
    } catch {
    }
    return new Blob([new Uint8Array(outputBuf)], {
      type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    });
  }
1366
+ async merge(pdfs) {
1367
+ if (pdfs.length === 0) {
1368
+ throw new Error("No PDFs provided for merging");
1369
+ }
1370
+ const result = await this.open(pdfs[0]);
1371
+ for (let i = 1; i < pdfs.length; i++) {
1372
+ const doc = await this.open(pdfs[i]);
1373
+ result.insertPdf(doc);
1374
+ doc.close();
1375
+ }
1376
+ const blob = result.saveAsBlob();
1377
+ result.close();
1378
+ return blob;
1379
+ }
1380
+ async split(pdf, ranges) {
1381
+ const results = [];
1382
+ const source = await this.open(pdf);
1383
+ const pageCount = source.pageCount;
1384
+ for (const range of ranges) {
1385
+ const start = Math.max(0, range.start);
1386
+ const end = Math.min(pageCount - 1, range.end);
1387
+ if (start > end) continue;
1388
+ const newDoc = await this.create();
1389
+ newDoc.insertPdf(source, { fromPage: start, toPage: end });
1390
+ results.push(newDoc.saveAsBlob());
1391
+ newDoc.close();
1392
+ }
1393
+ source.close();
1394
+ return results;
1395
+ }
1396
+ async extractText(pdf) {
1397
+ const doc = await this.open(pdf);
1398
+ let text = "";
1399
+ for (const page of doc.pages()) {
1400
+ text += page.getText() + "\n";
1401
+ }
1402
+ doc.close();
1403
+ return text.trim();
1404
+ }
1405
+ async renderPage(pdf, pageIndex, dpi = 150) {
1406
+ const doc = await this.open(pdf);
1407
+ const page = doc.getPage(pageIndex);
1408
+ const image = await page.toImage({ dpi });
1409
+ doc.close();
1410
+ return image;
1411
+ }
1412
+ async convertToPdf(file, options) {
1413
+ const pyodide = await this.getPyodide();
1414
+ const docId = ++this.docCounter;
1415
+ const inputPath = `/convert_input_${docId}`;
1416
+ const filename = file instanceof File ? file.name : "document";
1417
+ const ext = options?.filetype ?? filename.split(".").pop()?.toLowerCase() ?? "";
1418
+ const buf = await file.arrayBuffer();
1419
+ pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
1420
+ const result = pyodide.runPython(`
1421
+ import base64
1422
+
1423
+ src = pymupdf.open("${inputPath}"${ext ? `, filetype="${ext}"` : ""})
1424
+ pdf_bytes = src.convert_to_pdf()
1425
+ src.close()
1426
+
1427
+ pdf = pymupdf.open("pdf", pdf_bytes)
1428
+ output = pdf.tobytes(garbage=3, deflate=True)
1429
+ pdf.close()
1430
+
1431
+ base64.b64encode(output).decode('ascii')
1432
+ `);
1433
+ try {
1434
+ pyodide.FS.unlink(inputPath);
1435
+ } catch {
1436
+ }
1437
+ const binary = atob(result);
1438
+ const bytes = new Uint8Array(binary.length);
1439
+ for (let i = 0; i < binary.length; i++) {
1440
+ bytes[i] = binary.charCodeAt(i);
1441
+ }
1442
+ return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
1443
+ }
1444
+ /**
1445
+ * Repair a PDF by re-opening and re-saving with garbage collection and compression.
1446
+ * This fixes stream length issues that can occur from Ghostscript WASM output.
1447
+ * @param pdf The PDF to repair
1448
+ * @returns Repaired PDF blob
1449
+ */
1450
+ async repairPdf(pdf) {
1451
+ const pyodide = await this.getPyodide();
1452
+ const docId = ++this.docCounter;
1453
+ const inputPath = `/repair_input_${docId}`;
1454
+ const buf = await pdf.arrayBuffer();
1455
+ pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
1456
+ const result = pyodide.runPython(`
1457
+ import base64
1458
+
1459
+ # Open the PDF (this re-parses and fixes internal structure)
1460
+ doc = pymupdf.open("${inputPath}")
1461
+
1462
+ # Re-save with garbage collection and deflate compression
1463
+ # garbage=4 is the most aggressive cleanup (includes unused objects and duplicate streams)
1464
+ # deflate=True compresses streams
1465
+ output = doc.tobytes(garbage=4, deflate=True, clean=True)
1466
+ doc.close()
1467
+
1468
+ base64.b64encode(output).decode('ascii')
1469
+ `);
1470
+ try {
1471
+ pyodide.FS.unlink(inputPath);
1472
+ } catch {
1473
+ }
1474
+ const binary = atob(result);
1475
+ const bytes = new Uint8Array(binary.length);
1476
+ for (let i = 0; i < binary.length; i++) {
1477
+ bytes[i] = binary.charCodeAt(i);
1478
+ }
1479
+ return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
1480
+ }
1481
+ async xpsToPdf(xps) {
1482
+ return this.convertToPdf(xps, { filetype: "xps" });
1483
+ }
1484
+ async epubToPdf(epub) {
1485
+ return this.convertToPdf(epub, { filetype: "epub" });
1486
+ }
1487
+ async imageToPdf(image, options) {
1488
+ return this.convertToPdf(image, { filetype: options?.imageType });
1489
+ }
1490
+ async svgToPdf(svg) {
1491
+ return this.convertToPdf(svg, { filetype: "svg" });
1492
+ }
1493
+ async imagesToPdf(images) {
1494
+ if (images.length === 0) {
1495
+ throw new Error("No images provided");
1496
+ }
1497
+ const pyodide = await this.getPyodide();
1498
+ pyodide.runPython(`_multi_img_pdf = pymupdf.open()`);
1499
+ for (let i = 0; i < images.length; i++) {
1500
+ const image = images[i];
1501
+ const inputPath = `/multi_img_${i}`;
1502
+ const buf = await image.arrayBuffer();
1503
+ pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
1504
+ pyodide.runPython(`
1505
+ img_doc = pymupdf.open("${inputPath}")
1506
+ pdf_bytes = img_doc.convert_to_pdf()
1507
+ img_pdf = pymupdf.open("pdf", pdf_bytes)
1508
+ _multi_img_pdf.insert_pdf(img_pdf)
1509
+ img_pdf.close()
1510
+ img_doc.close()
1511
+ `);
1512
+ try {
1513
+ pyodide.FS.unlink(inputPath);
1514
+ } catch {
1515
+ }
1516
+ }
1517
+ const result = pyodide.runPython(`
1518
+ import base64
1519
+ output = _multi_img_pdf.tobytes(garbage=3, deflate=True)
1520
+ _multi_img_pdf.close()
1521
+ base64.b64encode(output).decode('ascii')
1522
+ `);
1523
+ const binary = atob(result);
1524
+ const bytes = new Uint8Array(binary.length);
1525
+ for (let i = 0; i < binary.length; i++) {
1526
+ bytes[i] = binary.charCodeAt(i);
1527
+ }
1528
+ return new Blob([new Uint8Array(bytes)], { type: "application/pdf" });
1529
+ }
1530
+ async pdfToImages(pdf, options) {
1531
+ const pyodide = await this.getPyodide();
1532
+ const doc = await this.open(pdf);
1533
+ const format = options?.format ?? "png";
1534
+ const dpi = options?.dpi ?? 150;
1535
+ const zoom = dpi / 72;
1536
+ const pageCount = doc.pageCount;
1537
+ const pagesToExport = options?.pages ?? Array.from({ length: pageCount }, (_, i) => i);
1538
+ const results = [];
1539
+ for (const pageIdx of pagesToExport) {
1540
+ if (pageIdx < 0 || pageIdx >= pageCount) continue;
1541
+ const result = pyodide.runPython(`
1542
+ import base64
1543
+ page = ${doc.docVar}[${pageIdx}]
1544
+ mat = pymupdf.Matrix(${zoom}, ${zoom})
1545
+ pix = page.get_pixmap(matrix=mat)
1546
+ base64.b64encode(pix.tobytes("${format}")).decode('ascii')
1547
+ `);
1548
+ const binary = atob(result);
1549
+ const bytes = new Uint8Array(binary.length);
1550
+ for (let i = 0; i < binary.length; i++) {
1551
+ bytes[i] = binary.charCodeAt(i);
1552
+ }
1553
+ results.push(bytes);
1554
+ }
1555
+ doc.close();
1556
+ return results;
1557
+ }
1558
+ async pdfToSvg(pdf, pages) {
1559
+ const doc = await this.open(pdf);
1560
+ const pageCount = doc.pageCount;
1561
+ const pagesToExport = pages ?? Array.from({ length: pageCount }, (_, i) => i);
1562
+ const results = [];
1563
+ for (const pageIdx of pagesToExport) {
1564
+ if (pageIdx < 0 || pageIdx >= pageCount) continue;
1565
+ const page = doc.getPage(pageIdx);
1566
+ results.push(page.toSvg());
1567
+ }
1568
+ doc.close();
1569
+ return results;
1570
+ }
1571
+ async pdfToText(pdf) {
1572
+ return this.extractText(pdf);
1573
+ }
1574
+ async pdfToHtml(pdf) {
1575
+ const doc = await this.open(pdf);
1576
+ let html = "";
1577
+ for (const page of doc.pages()) {
1578
+ html += page.getText("html") + "\n";
1579
+ }
1580
+ doc.close();
1581
+ return html;
1582
+ }
1583
+ async pdfToJson(pdf) {
1584
+ const doc = await this.open(pdf);
1585
+ const results = [];
1586
+ for (const page of doc.pages()) {
1587
+ const text = page.getText("dict");
1588
+ results.push(text);
1589
+ }
1590
+ doc.close();
1591
+ return results;
1592
+ }
1593
+ async pdfToXml(pdf) {
1594
+ const doc = await this.open(pdf);
1595
+ let xml = '<?xml version="1.0" encoding="UTF-8"?>\n<document>\n';
1596
+ for (const page of doc.pages()) {
1597
+ xml += page.getText("xml") + "\n";
1598
+ }
1599
+ xml += "</document>";
1600
+ doc.close();
1601
+ return xml;
1602
+ }
1603
+ hasRtlCharacters(text) {
1604
+ const rtlPattern = /[\u0590-\u05FF\u0600-\u06FF\u0700-\u074F\u0750-\u077F\u0780-\u07BF\u07C0-\u07FF\u08A0-\u08FF\uFB1D-\uFB4F\uFB50-\uFDFF\uFE70-\uFEFF]/;
1605
+ return rtlPattern.test(text);
1606
+ }
1607
  /**
   * Lay plain text out into a PDF via insert_htmlbox.
   *
   * The text is HTML-escaped, backslash-escaped (it is embedded in a Python
   * triple-quoted string), and newlines become <br>. RTL scripts switch the
   * paragraph to right-to-left.
   *
   * NOTE(review): the while loop feeds the FULL html into every new page and
   * keys continuation on insert_htmlbox's first return value — confirm this
   * actually paginates overflow rather than duplicating content; the
   * max_pages=100 cap suggests it is a safety valve.
   *
   * @param {string} text - Plain text content.
   * @param {{fontSize?: number, pageSize?: string, margins?: number, fontName?: string}} [options]
   * @returns {Promise<Blob>} The generated PDF.
   */
  async textToPdf(text, options) {
    const pyodide = await this.getPyodide();
    const isRtl = this.hasRtlCharacters(text);
    const directionStyle = isRtl ? "direction: rtl; text-align: right;" : "";
    // Escape order matters: HTML entities first, then backslashes for the
    // Python string literal, then newlines to <br>.
    const escapedText = text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#039;").replace(/\\/g, "\\\\").replace(/\n/g, "<br>");
    const fontSize = options?.fontSize ?? 11;
    const pageSize = options?.pageSize ?? "a4";
    const margins = options?.margins ?? 72;
    // Map PDF base-font names to CSS generic families for the HTML layout.
    const fontMap = {
      "helv": "sans-serif",
      "tiro": "serif",
      "cour": "monospace",
      "times": "serif"
    };
    const fontName = options?.fontName ?? "helv";
    const fontFamily = fontMap[fontName] || "sans-serif";
    const result = pyodide.runPython(`
import base64

html_content = '''
<p style="font-family: ${fontFamily}; font-size: ${fontSize}pt; margin: 0; padding: 0; ${directionStyle}">
${escapedText}
</p>
'''

doc = pymupdf.open()
mediabox = pymupdf.paper_rect("${pageSize}")
margin = ${margins}
where = mediabox + (margin, margin, -margin, -margin)

more = True
page_count = 0
max_pages = 100

while more and page_count < max_pages:
    page = doc.new_page(width=mediabox.width, height=mediabox.height)
    more, _ = page.insert_htmlbox(where, html_content, css="* { font-family: ${fontFamily}; font-size: ${fontSize}pt; }")
    page_count += 1

# Subset and embed fonts for PDF/A compatibility
doc.subset_fonts()

pdf_bytes = doc.tobytes(garbage=3, deflate=True)
doc.close()

base64.b64encode(pdf_bytes).decode('ascii')
`);
    const binaryStr = atob(result);
    const bytes = new Uint8Array(binaryStr.length);
    for (let i = 0; i < binaryStr.length; i++) {
      bytes[i] = binaryStr.charCodeAt(i);
    }
    return new Blob([bytes], { type: "application/pdf" });
  }
1661
  /**
   * Render an HTML string to PDF using pymupdf.Story + DocumentWriter, then
   * re-open the result and add URI link annotations wherever the anchor text
   * of an <a href="http..."> element is found on a page.
   *
   * External stylesheet/script references are stripped first (the Story
   * renderer cannot fetch them). NOTE(review): links are matched by searching
   * for the anchor TEXT, so repeated text gets extra link annotations;
   * rectfn's two branches are identical — every page uses the same rect.
   *
   * @param {string} html - HTML source.
   * @param {{css?: string, pageSize?: string, margins?: number|{top,right,bottom,left}}} [options]
   * @returns {Promise<Blob>} The generated PDF.
   */
  async htmlToPdf(html, options) {
    const pyodide = await this.getPyodide();
    // Escape for embedding inside Python triple-quoted strings.
    const escapedHtml = html.replace(/\\/g, "\\\\").replace(/'/g, "\\'").replace(/\n/g, "\\n");
    const escapedCss = options?.css?.replace(/\\/g, "\\\\").replace(/'/g, "\\'").replace(/\n/g, "\\n") ?? "";
    const pageSize = options?.pageSize ?? "a4";
    let margins = { top: 36, right: 36, bottom: 36, left: 36 };
    if (typeof options?.margins === "number") {
      margins = { top: options.margins, right: options.margins, bottom: options.margins, left: options.margins };
    } else if (options?.margins) {
      margins = options.margins;
    }
    const result = pyodide.runPython(`
import base64
import io
import re
import json

html_content = '''${escapedHtml}'''
css_content = '''${escapedCss}'''

# Extract links from HTML before processing
link_pattern = r'<a[^>]*href=["\\'](https?://[^"\\'>]+)["\\'"][^>]*>([^<]+)</a>'
links = re.findall(link_pattern, html_content, re.IGNORECASE)
# links is a list of (url, text) tuples

html_content = re.sub(r'<link[^>]*stylesheet[^>]*>', '', html_content, flags=re.IGNORECASE)
html_content = re.sub(r'<link[^>]*href=[^>]*>', '', html_content, flags=re.IGNORECASE)
html_content = re.sub(r'<script[^>]*src=[^>]*>.*?<\\/script>', '', html_content, flags=re.IGNORECASE|re.DOTALL)
html_content = re.sub(r'<script[^>]*src=[^>]*/>', '', html_content, flags=re.IGNORECASE)

mediabox = pymupdf.paper_rect("${pageSize}")
where = mediabox + (${margins.left}, ${margins.top}, -${margins.right}, -${margins.bottom})

story = pymupdf.Story(html=html_content, user_css=css_content if css_content else None)

buffer = io.BytesIO()
writer = pymupdf.DocumentWriter(buffer)

def rectfn(rect_num, filled):
    if rect_num == 0 or filled == 0:
        return mediabox, where, None
    return mediabox, where, None

story.write(writer, rectfn)
writer.close()

# Now open the PDF and add link annotations
buffer.seek(0)
doc = pymupdf.open("pdf", buffer.read())

# For each link found in HTML, search for the text and add a link annotation
for url, text in links:
    text = text.strip()
    if not text:
        continue
    # Search all pages for this text
    for page_num in range(doc.page_count):
        page = doc[page_num]
        # Search for the link text
        text_instances = page.search_for(text)
        for rect in text_instances:
            # Add a link annotation
            link = page.insert_link({
                "kind": pymupdf.LINK_URI,
                "from": rect,
                "uri": url
            })

# Save the modified PDF
output_buffer = io.BytesIO()
doc.save(output_buffer)
doc.close()

pdf_bytes = output_buffer.getvalue()
base64.b64encode(pdf_bytes).decode('ascii')
`);
    const binaryStr = atob(result);
    const bytes = new Uint8Array(binaryStr.length);
    for (let i = 0; i < binaryStr.length; i++) {
      bytes[i] = binaryStr.charCodeAt(i);
    }
    return new Blob([bytes], { type: "application/pdf" });
  }
1744
  /**
   * Convert a PDF to Markdown with pymupdf4llm.to_markdown.
   *
   * @param {Blob|File} pdf - Source PDF.
   * @param {{includeImages?: boolean, pageBreaks?: boolean, pages?: number[]}} [options]
   *   includeImages embeds images as base64 data URIs; pageBreaks (default
   *   true) requests per-page chunks, which are joined with a "---" rule;
   *   pages restricts extraction to the given 0-based indices.
   * @returns {Promise<string>} Markdown text.
   */
  async pdfToMarkdown(pdf, options) {
    const pyodide = await this.getPyodide();
    const docId = ++this.docCounter;
    const inputPath = `/md_input_${docId}`;
    const buf = await pdf.arrayBuffer();
    pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
    const embedImages = options?.includeImages ? "True" : "False";
    const pageBreaks = options?.pageBreaks !== false ? "True" : "False";
    const pagesArg = options?.pages ? `pages=[${options.pages.join(", ")}]` : "";
    const result = pyodide.runPython(`
import pymupdf4llm

md_text = pymupdf4llm.to_markdown(
    "${inputPath}",
    embed_images=${embedImages},
    page_chunks=${pageBreaks}${pagesArg ? ", " + pagesArg : ""}
)

# page_chunks=True yields a list of per-page dicts; join their text with a
# horizontal rule, otherwise pass the single string through.
if isinstance(md_text, list):
    result = "\\n\\n---\\n\\n".join([chunk.get('text', '') if isinstance(chunk, dict) else str(chunk) for chunk in md_text])
else:
    result = md_text if md_text else ""

result
`);
    try {
      pyodide.FS.unlink(inputPath);
    } catch {
    }
    return result;
  }
1775
  /**
   * Extract the PDF as per-page Markdown chunks suitable for LLM ingestion:
   * [{ text, metadata: { page } }, ...], via pymupdf4llm's page_chunks mode.
   *
   * @param {Blob|File} pdf - Source PDF.
   * @returns {Promise<Array<{text: string, metadata: object}>>}
   */
  async pdfToLlmChunks(pdf) {
    const pyodide = await this.getPyodide();
    const docId = ++this.docCounter;
    const inputPath = `/llm_input_${docId}`;
    const buf = await pdf.arrayBuffer();
    pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
    const result = pyodide.runPython(`
import pymupdf4llm
import json

chunks = pymupdf4llm.to_markdown(
    "${inputPath}",
    page_chunks=True
)

# Normalize: each chunk should be a dict with text + page metadata; anything
# else is stringified with empty metadata.
result = []
for chunk in chunks:
    if isinstance(chunk, dict):
        result.append({
            "text": chunk.get("text", ""),
            "metadata": {
                "page": chunk.get("metadata", {}).get("page", None)
            }
        })
    else:
        result.append({"text": str(chunk), "metadata": {}})

json.dumps(result)
`);
    try {
      pyodide.FS.unlink(inputPath);
    } catch {
    }
    return JSON.parse(result);
  }
1810
+ /**
1811
+ * Extract PDF as LlamaIndex-compatible documents using PyMuPDF4LLM.
1812
+ * Uses to_markdown with page_chunks=True to produce LlamaIndex Document format.
1813
+ * @param pdf The PDF file to extract
1814
+ * @returns Array of LlamaIndex-compatible documents
1815
+ */
1816
+ async pdfToLlamaIndex(pdf) {
1817
+ const pyodide = await this.getPyodide();
1818
+ const docId = ++this.docCounter;
1819
+ const inputPath = `/llama_input_${docId}`;
1820
+ const filename = pdf instanceof File ? pdf.name : "document.pdf";
1821
+ const buf = await pdf.arrayBuffer();
1822
+ pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
1823
+ const result = pyodide.runPython(`
1824
+ import pymupdf4llm
1825
+ import pymupdf
1826
+ import json
1827
+
1828
+ # Use to_markdown with page_chunks=True - same output as LlamaMarkdownReader
1829
+ chunks = pymupdf4llm.to_markdown("${inputPath}", page_chunks=True)
1830
+
1831
+ # Get document metadata
1832
+ doc = pymupdf.open("${inputPath}")
1833
+ doc_meta = doc.metadata
1834
+ page_count = doc.page_count
1835
+ doc.close()
1836
+
1837
+ # Convert to LlamaIndex Document format
1838
+ result = []
1839
+ for chunk in chunks:
1840
+ if isinstance(chunk, dict):
1841
+ doc_dict = {
1842
+ "text": chunk.get("text", ""),
1843
+ "metadata": {
1844
+ "file_name": "${filename.replace(/"/g, '\\"')}",
1845
+ "total_pages": page_count
1846
+ }
1847
+ }
1848
+
1849
+ # Copy chunk metadata
1850
+ chunk_meta = chunk.get("metadata", {})
1851
+ if chunk_meta:
1852
+ if "page" in chunk_meta:
1853
+ doc_dict["metadata"]["page"] = chunk_meta["page"]
1854
+ if "page_count" in chunk_meta:
1855
+ doc_dict["metadata"]["page_count"] = chunk_meta["page_count"]
1856
+ if "file_path" in chunk_meta:
1857
+ doc_dict["metadata"]["file_path"] = chunk_meta["file_path"]
1858
+
1859
+ # Add document-level metadata
1860
+ if doc_meta:
1861
+ for key in ["author", "title", "subject", "keywords", "creator", "producer", "creationDate", "modDate"]:
1862
+ if doc_meta.get(key):
1863
+ doc_dict["metadata"][key] = doc_meta[key]
1864
+
1865
+ # Include tables info if available (convert Rect to list)
1866
+ if "tables" in chunk and chunk["tables"]:
1867
+ tables_serializable = []
1868
+ for t in chunk["tables"]:
1869
+ if isinstance(t, dict):
1870
+ t_copy = dict(t)
1871
+ if "bbox" in t_copy and hasattr(t_copy["bbox"], "__iter__"):
1872
+ t_copy["bbox"] = list(t_copy["bbox"])
1873
+ tables_serializable.append(t_copy)
1874
+ doc_dict["metadata"]["tables"] = tables_serializable
1875
+
1876
+ # Include images info if available (convert Rect to list)
1877
+ if "images" in chunk and chunk["images"]:
1878
+ images_serializable = []
1879
+ for img in chunk["images"]:
1880
+ if isinstance(img, dict):
1881
+ img_copy = dict(img)
1882
+ if "bbox" in img_copy and hasattr(img_copy["bbox"], "__iter__"):
1883
+ img_copy["bbox"] = list(img_copy["bbox"])
1884
+ images_serializable.append(img_copy)
1885
+ doc_dict["metadata"]["images"] = images_serializable
1886
+
1887
+ if "toc_items" in chunk:
1888
+ doc_dict["metadata"]["toc_items"] = chunk["toc_items"]
1889
+
1890
+ result.append(doc_dict)
1891
+ else:
1892
+ result.append({"text": str(chunk), "metadata": {"file_name": "${filename.replace(/"/g, '\\"')}"}})
1893
+
1894
+ json.dumps(result)
1895
+ `);
1896
+ try {
1897
+ pyodide.FS.unlink(inputPath);
1898
+ } catch {
1899
+ }
1900
+ return JSON.parse(result);
1901
+ }
1902
+ /**
1903
+ * Rasterize a PDF - convert all pages to images and create a new PDF from those images.
1904
+ * This flattens all vector graphics, text, and layers into raster images.
1905
+ * Useful for: printing, reducing file complexity, removing selectable text, or creating image-based PDFs.
1906
+ */
1907
+ async rasterizePdf(pdf, options) {
1908
+ const pyodide = await this.getPyodide();
1909
+ const docId = ++this.docCounter;
1910
+ const inputPath = `/rasterize_input_${docId}`;
1911
+ const dpi = options?.dpi ?? 150;
1912
+ const format = options?.format ?? "png";
1913
+ const quality = options?.quality ?? 95;
1914
+ const alpha = options?.alpha ?? false;
1915
+ const pages = options?.pages;
1916
+ const grayscale = options?.grayscale ?? false;
1917
+ const buf = await pdf.arrayBuffer();
1918
+ pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
1919
+ const pagesArg = pages ? `[${pages.join(", ")}]` : "None";
1920
+ const result = pyodide.runPython(`
1921
+ import base64
1922
+
1923
+ src_doc = pymupdf.open("${inputPath}")
1924
+ out_doc = pymupdf.open()
1925
+
1926
+ zoom = ${dpi} / 72.0
1927
+ mat = pymupdf.Matrix(zoom, zoom)
1928
+
1929
+ page_indices = ${pagesArg} if ${pagesArg} is not None else range(src_doc.page_count)
1930
+
1931
+ for page_idx in page_indices:
1932
+ if page_idx < 0 or page_idx >= src_doc.page_count:
1933
+ continue
1934
+
1935
+ page = src_doc[page_idx]
1936
+
1937
+ # Render page to pixmap
1938
+ pix = page.get_pixmap(matrix=mat, alpha=${alpha ? "True" : "False"})
1939
+
1940
+ # Convert to grayscale if requested
1941
+ if ${grayscale ? "True" : "False"}:
1942
+ pix = pymupdf.Pixmap(pymupdf.csGRAY, pix)
1943
+
1944
+ # Get image bytes
1945
+ img_bytes = pix.tobytes("${format}"${format === "jpeg" ? `, jpg_quality=${quality}` : ""})
1946
+
1947
+ # Create new page with same dimensions as rendered image
1948
+ # Scale back to original page size for the PDF
1949
+ orig_rect = page.rect
1950
+ new_page = out_doc.new_page(width=orig_rect.width, height=orig_rect.height)
1951
+
1952
+ # Insert the rasterized image
1953
+ new_page.insert_image(new_page.rect, stream=img_bytes)
1954
+
1955
+ src_doc.close()
1956
+
1957
+ # Save output PDF
1958
+ pdf_bytes = out_doc.tobytes(garbage=3, deflate=True)
1959
+ out_doc.close()
1960
+
1961
+ base64.b64encode(pdf_bytes).decode('ascii')
1962
+ `);
1963
+ try {
1964
+ pyodide.FS.unlink(inputPath);
1965
+ } catch {
1966
+ }
1967
+ const binary = atob(result);
1968
+ const bytes = new Uint8Array(binary.length);
1969
+ for (let i = 0; i < binary.length; i++) {
1970
+ bytes[i] = binary.charCodeAt(i);
1971
+ }
1972
+ return new Blob([bytes], { type: "application/pdf" });
1973
+ }
1974
+ /**
1975
+ * Compress a PDF using multiple optimization techniques.
1976
+ * Combines dead-weight removal, image compression, font subsetting, and advanced save options.
1977
+ * Based on PyMuPDF's optimization capabilities.
1978
+ */
1979
+ async compressPdf(pdf, options) {
1980
+ const pyodide = await this.getPyodide();
1981
+ const docId = ++this.docCounter;
1982
+ const inputPath = `/compress_input_${docId}`;
1983
+ const buf = await pdf.arrayBuffer();
1984
+ const originalSize = buf.byteLength;
1985
+ pyodide.FS.writeFile(inputPath, new Uint8Array(buf));
1986
+ const scrubOpts = options?.scrub ?? {};
1987
+ const scrubMetadata = scrubOpts.metadata !== false;
1988
+ const scrubXmlMetadata = scrubOpts.xmlMetadata !== false;
1989
+ const scrubAttachedFiles = scrubOpts.attachedFiles ?? false;
1990
+ const scrubEmbeddedFiles = scrubOpts.embeddedFiles ?? false;
1991
+ const scrubThumbnails = scrubOpts.thumbnails !== false;
1992
+ const scrubResetFields = scrubOpts.resetFields ?? false;
1993
+ const scrubResetResponses = scrubOpts.resetResponses ?? false;
1994
+ const imageOpts = options?.images ?? {};
1995
+ const compressImages = imageOpts.enabled !== false;
1996
+ const dpiThreshold = imageOpts.dpiThreshold ?? 150;
1997
+ const dpiTarget = imageOpts.dpiTarget ?? 96;
1998
+ const imageQuality = imageOpts.quality ?? 75;
1999
+ const processLossy = imageOpts.lossy !== false;
2000
+ const processLossless = imageOpts.lossless !== false;
2001
+ const processBitonal = imageOpts.bitonal ?? false;
2002
+ const processColor = imageOpts.color !== false;
2003
+ const processGray = imageOpts.gray !== false;
2004
+ const convertToGray = imageOpts.convertToGray ?? false;
2005
+ const subsetFonts = options?.subsetFonts !== false;
2006
+ const saveOpts = options?.save ?? {};
2007
+ const garbage = saveOpts.garbage ?? 4;
2008
+ const deflate = saveOpts.deflate !== false;
2009
+ const clean = saveOpts.clean !== false;
2010
+ const useObjstms = saveOpts.useObjstms !== false;
2011
+ const result = pyodide.runPython(`
2012
+ import base64
2013
+ import json
2014
+
2015
+ doc = pymupdf.open("${inputPath}")
2016
+ original_page_count = doc.page_count
2017
+
2018
+ # 1. Dead-weight removal (scrub)
2019
+ doc.scrub(
2020
+ metadata=${scrubMetadata ? "True" : "False"},
2021
+ xml_metadata=${scrubXmlMetadata ? "True" : "False"},
2022
+ attached_files=${scrubAttachedFiles ? "True" : "False"},
2023
+ embedded_files=${scrubEmbeddedFiles ? "True" : "False"},
2024
+ thumbnails=${scrubThumbnails ? "True" : "False"},
2025
+ reset_fields=${scrubResetFields ? "True" : "False"},
2026
+ reset_responses=${scrubResetResponses ? "True" : "False"},
2027
+ )
2028
+
2029
+ # 2. Image compression
2030
+ if ${compressImages ? "True" : "False"}:
2031
+ doc.rewrite_images(
2032
+ dpi_threshold=${dpiThreshold},
2033
+ dpi_target=${dpiTarget},
2034
+ quality=${imageQuality},
2035
+ lossy=${processLossy ? "True" : "False"},
2036
+ lossless=${processLossless ? "True" : "False"},
2037
+ bitonal=${processBitonal ? "True" : "False"},
2038
+ color=${processColor ? "True" : "False"},
2039
+ gray=${processGray ? "True" : "False"},
2040
+ set_to_gray=${convertToGray ? "True" : "False"},
2041
+ )
2042
+
2043
+ # 3. Font subsetting
2044
+ if ${subsetFonts ? "True" : "False"}:
2045
+ doc.subset_fonts()
2046
+
2047
+ # 4. Save with optimization options
2048
+ pdf_bytes = doc.tobytes(
2049
+ garbage=${garbage},
2050
+ deflate=${deflate ? "True" : "False"},
2051
+ use_objstms=${useObjstms ? "True" : "False"},
2052
+ clean=${clean ? "True" : "False"}
2053
+ )
2054
+
2055
+ compressed_size = len(pdf_bytes)
2056
+ doc.close()
2057
+
2058
+ json.dumps({
2059
+ 'data': base64.b64encode(pdf_bytes).decode('ascii'),
2060
+ 'compressedSize': compressed_size,
2061
+ 'pageCount': original_page_count
2062
+ })
2063
+ `);
2064
+ try {
2065
+ pyodide.FS.unlink(inputPath);
2066
+ } catch {
2067
+ }
2068
+ const parsed = JSON.parse(result);
2069
+ const binary = atob(parsed.data);
2070
+ const bytes = new Uint8Array(binary.length);
2071
+ for (let i = 0; i < binary.length; i++) {
2072
+ bytes[i] = binary.charCodeAt(i);
2073
+ }
2074
+ const compressedSize = parsed.compressedSize;
2075
+ const savings = originalSize - compressedSize;
2076
+ const savingsPercent = originalSize > 0 ? savings / originalSize * 100 : 0;
2077
+ return {
2078
+ blob: new Blob([bytes], { type: "application/pdf" }),
2079
+ originalSize,
2080
+ compressedSize,
2081
+ savings,
2082
+ savingsPercent: Math.round(savingsPercent * 10) / 10,
2083
+ pageCount: parsed.pageCount
2084
+ };
2085
+ }
2086
+ };
2087
// Public API surface: the main wrapper plus its document and page handles.
export {
  PyMuPDF,
  PyMuPDFDocument,
  PyMuPDFPage
};