docxrender 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docxrender/pdf_uno.py ADDED
@@ -0,0 +1,608 @@
1
+ """LibreOffice UNO PDF conversion helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import importlib
6
+ import shutil
7
+ import socket
8
+ import subprocess
9
+ import tempfile
10
+ import time
11
+ from dataclasses import dataclass
12
+ from io import BufferedWriter
13
+ from pathlib import Path
14
+ from typing import Any, Protocol
15
+
16
+ from docxrender.contracts import (
17
+ DocxFieldRefreshOptions,
18
+ DocxToPdfOptions,
19
+ DocxToPdfResult,
20
+ )
21
+
22
# Host used both in the listener's --accept argument and when connecting to it.
LISTENER_HOST = "127.0.0.1"

# Time budget and polling cadence while waiting for the listener socket.
LISTENER_START_TIMEOUT_SECONDS = 15.0
LISTENER_POLL_INTERVAL_SECONDS = 0.1

# Time budget and polling cadence while retrying a document load.
DOCUMENT_LOAD_TIMEOUT_SECONDS = 10.0
DOCUMENT_LOAD_POLL_INTERVAL_SECONDS = 0.2

# Reference links appended to diagnostic error messages.
URL_LIBREOFFICE_PARAMETERS = (
    "https://help.libreoffice.org/latest/en-US/text/shared/guide/start_parameters.html"
)
URL_LIBREOFFICE_API = "https://api.libreoffice.org/"
URL_DEBIAN_PYTHON_UNO = "https://packages.debian.org/bullseye/python3-uno"
32
+
33
+
34
class UnoUpdatable(Protocol):
    """Structural type for an object that exposes a no-argument ``update()``."""

    def update(self) -> None:
        """Update the object; the stub itself does nothing."""
        ...
36
+
37
+
38
class UnoDocumentIndexes(Protocol):
    """Structural type for an index-addressable collection of updatable items."""

    def getCount(self) -> int:
        """Return the number of items in the collection."""
        ...

    def getByIndex(self, index: int) -> UnoUpdatable:
        """Return the item stored at position ``index``."""
        ...
42
+
43
+
44
class UnoTextFields(Protocol):
    """Structural type for a text-field collection that can be refreshed."""

    def refresh(self) -> None:
        """Refresh the collection; the stub itself does nothing."""
        ...
46
+
47
+
48
class UnoDisposable(Protocol):
    """Structural type for an object that can be released via ``dispose()``."""

    def dispose(self) -> None:
        """Release the object; the stub itself does nothing."""
        ...
50
+
51
+
52
class UnoTextDocument(UnoDisposable, Protocol):
    """Structural type for the document operations this module relies on.

    Covers refreshing, link/index/field access, storing, exporting to a URL,
    and closing, in addition to ``dispose()`` from ``UnoDisposable``.
    """

    def updateLinks(self) -> None:
        """Update the document's links."""
        ...

    def refresh(self) -> None:
        """Refresh the document."""
        ...

    def getDocumentIndexes(self) -> UnoDocumentIndexes:
        """Return the document's index collection."""
        ...

    def getTextFields(self) -> UnoTextFields:
        """Return the document's text-field collection."""
        ...

    def store(self) -> None:
        """Store the document."""
        ...

    def storeToURL(self, url: str, properties: tuple[Any, ...]) -> None:
        """Store the document to ``url`` using the given properties."""
        ...

    def close(self, deliver_ownership: bool) -> None:
        """Close the document."""
        ...
66
+
67
+
68
class UnoDesktop(Protocol):
    """Structural type for the desktop object used to load documents."""

    def loadComponentFromURL(
        self,
        url: str,
        target_frame_name: str,
        search_flags: int,
        properties: tuple[Any, ...],
    ) -> UnoTextDocument | None:
        """Load the component at ``url``; ``None`` signals that nothing loaded."""
        ...
76
+
77
+
78
class ListenerProcess(Protocol):
    """Structural type for the process-handle methods this module calls."""

    def poll(self) -> int | None:
        """Return the exit code, or ``None`` when none is available."""
        ...

    def terminate(self) -> None:
        """Ask the process to stop."""
        ...

    def wait(self, timeout: float | None = None) -> int | None:
        """Wait for the process to end, optionally bounded by ``timeout``."""
        ...

    def kill(self) -> None:
        """Forcefully stop the process."""
        ...
86
+
87
+
88
+ @dataclass(frozen=True, slots=True)
89
+ class DocxToPdfState:
90
+ options: DocxToPdfOptions
91
+
92
+
93
def run_docx_to_pdf_pipeline(options: DocxToPdfOptions) -> DocxToPdfResult:
    """Run the full DOCX-to-PDF pipeline for ``options``.

    The steps run in order: build the state, validate the input file,
    convert through UNO, and build the result object.
    """
    pipeline_state = create_docx_to_pdf_state(options)
    # Both steps return the state they were given; only their effects matter.
    validate_docx_input(pipeline_state)
    convert_docx_to_pdf_with_uno(pipeline_state)
    result = create_docx_to_pdf_result(pipeline_state)
    return result
98
+
99
+
100
def create_docx_to_pdf_state(options: DocxToPdfOptions) -> DocxToPdfState:
    """Wrap ``options`` in a fresh ``DocxToPdfState``."""
    initial_state = DocxToPdfState(options=options)
    return initial_state
102
+
103
+
104
def validate_docx_input(state: DocxToPdfState) -> DocxToPdfState:
    """Check that the input DOCX exists, is a non-empty file, and can be opened.

    Returns:
        The ``state`` that was passed in, unchanged.

    Raises:
        FileNotFoundError: The input path does not exist.
        RuntimeError: The input path is not a regular file, or it is empty.
    """
    docx_path = state.options.file_in_docx

    if not docx_path.exists():
        raise FileNotFoundError(f"Input DOCX does not exist: {docx_path.resolve()}")

    if not docx_path.is_file():
        raise RuntimeError(
            f"Input DOCX is not a regular file: {docx_path.resolve()}"
        )

    size_bytes = docx_path.stat().st_size
    if size_bytes == 0:
        raise RuntimeError(f"Input DOCX is empty: {docx_path.resolve()}")

    # Open and immediately close: surfaces read errors before conversion.
    docx_path.open("rb").close()
    return state
117
+
118
+
119
def create_docx_to_pdf_result(state: DocxToPdfState) -> DocxToPdfResult:
    """Build the pipeline result from the output paths held in ``state``."""
    options = state.options
    pdf_path = options.file_out_pdf
    refreshed_docx_path = options.file_out_docx_refreshed
    return DocxToPdfResult(
        file_pdf=pdf_path,
        file_docx_refreshed=refreshed_docx_path,
    )
124
+
125
+
126
def create_libreoffice_listener_command(
    *,
    exe_libreoffice: Path,
    dir_user_profile: Path,
    port: int,
) -> list[str]:
    """Return the argv list that starts a headless LibreOffice socket listener.

    Args:
        exe_libreoffice: Executable placed first in the command.
        dir_user_profile: Directory passed, as a file URI, via
            ``-env:UserInstallation``.
        port: Port number placed in the ``--accept`` argument.
    """
    executable = str(exe_libreoffice)
    accept_argument = f"--accept=socket,host={LISTENER_HOST},port={port};urp;"
    profile_uri = dir_user_profile.resolve().as_uri()
    command = [executable, "--headless", accept_argument]
    command += ["--norestore", "--nodefault"]
    command.append(f"-env:UserInstallation={profile_uri}")
    return command
140
+
141
+
142
def convert_docx_to_pdf_with_uno(state: DocxToPdfState) -> DocxToPdfState:
    """Convert the input DOCX to PDF through a headless LibreOffice UNO listener.

    The input is copied into a temporary staging directory and loaded from
    there. Its fields are refreshed, the staged document is stored, and it is
    exported with the ``writer_pdf_Export`` filter. When
    ``file_out_docx_refreshed`` is set, the stored staged copy is also copied
    to that path.

    Args:
        state: Pipeline state whose ``options`` supply the input and output
            paths, the LibreOffice executable, the user profile directory and
            the optional listener log file.

    Returns:
        The same ``state`` object that was passed in.

    Raises:
        Whatever the staging, listener start-up, connection, load, refresh or
        export helpers raise. The listener process (if started) and the log
        handle (if opened) are still cleaned up in that case.
    """
    options = state.options
    options.dir_user_profile.mkdir(parents=True, exist_ok=True)
    options.file_out_pdf.parent.mkdir(parents=True, exist_ok=True)
    if options.file_out_docx_refreshed is not None:
        options.file_out_docx_refreshed.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(prefix="docxrender-docx-stage-") as dir_stage_tmp:
        file_in_docx_staged = copy_docx_to_stage(
            options.file_in_docx,
            dir_stage=Path(dir_stage_tmp),
        )
        uno_module = import_uno_module()
        port = select_free_port()
        (
            file_listener_log_resolved,
            handle_listener_log,
            stdout_listener,
            stderr_listener,
        ) = open_listener_log_handle(options.file_listener_log)
        # The log handle is already open here, so the listener must be started
        # inside the try block; otherwise a failed launch would leak the handle.
        process_listener: subprocess.Popen[bytes] | None = None
        try:
            process_listener = start_libreoffice_listener(
                exe_libreoffice=options.exe_libreoffice,
                dir_user_profile=options.dir_user_profile,
                port=port,
                stdout=stdout_listener,
                stderr=stderr_listener,
                file_listener_log=file_listener_log_resolved,
            )
            wait_for_listener(port, file_listener_log=file_listener_log_resolved)
            desktop = connect_desktop(uno_module, port)
            doc: UnoTextDocument | None = None
            try:
                doc = load_uno_document_or_raise(
                    uno_module=uno_module,
                    desktop=desktop,
                    file_in_docx_source=options.file_in_docx,
                    file_in_docx_staged=file_in_docx_staged,
                    exe_libreoffice=options.exe_libreoffice,
                    dir_user_profile=options.dir_user_profile,
                    process_listener=process_listener,
                    file_listener_log=file_listener_log_resolved,
                    file_source_lock=find_source_lock_file(options.file_in_docx),
                )
                refresh_uno_document_fields(doc)
                doc.store()
                doc.storeToURL(
                    uno_module.systemPathToFileUrl(str(options.file_out_pdf.resolve())),
                    (
                        create_property("FilterName", "writer_pdf_Export"),
                        create_property("Overwrite", True),
                    ),
                )
                if options.file_out_docx_refreshed is not None:
                    shutil.copy2(file_in_docx_staged, options.file_out_docx_refreshed)
            finally:
                close_document(doc)
        finally:
            # Nested finally: the log handle is closed even if stopping the
            # listener itself raises.
            try:
                if process_listener is not None:
                    terminate_process(process_listener)
            finally:
                if handle_listener_log is not None:
                    handle_listener_log.close()
    return state
204
+
205
+
206
def refresh_docx_with_uno(
    *,
    file_in_docx: Path,
    file_out_docx: Path,
    options: DocxFieldRefreshOptions,
) -> None:
    """Refresh a DOCX's fields through LibreOffice UNO and write the result.

    The input is copied into a temporary staging directory and loaded from
    there. Its fields are refreshed and the staged document is stored. Once
    the listener has been stopped, the staged file is copied to
    ``file_out_docx``.

    Args:
        file_in_docx: Source document; it is only read (copied to the stage).
        file_out_docx: Destination for the refreshed copy.
        options: Supplies the LibreOffice executable, the user profile
            directory and the optional listener log file.

    Raises:
        Whatever the staging, listener start-up, connection, load or refresh
        helpers raise. The listener process (if started) and the log handle
        (if opened) are still cleaned up in that case.
    """
    options.dir_user_profile.mkdir(parents=True, exist_ok=True)
    file_out_docx.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(
        prefix="docxrender-docx-refresh-stage-"
    ) as dir_tmp:
        file_in_docx_staged = copy_docx_to_stage(file_in_docx, dir_stage=Path(dir_tmp))
        uno_module = import_uno_module()
        port = select_free_port()
        (
            file_listener_log_resolved,
            handle_listener_log,
            stdout_listener,
            stderr_listener,
        ) = open_listener_log_handle(options.file_listener_log)
        # The log handle is already open here, so the listener must be started
        # inside the try block; otherwise a failed launch would leak the handle.
        process_listener: subprocess.Popen[bytes] | None = None
        try:
            process_listener = start_libreoffice_listener(
                exe_libreoffice=options.exe_libreoffice,
                dir_user_profile=options.dir_user_profile,
                port=port,
                stdout=stdout_listener,
                stderr=stderr_listener,
                file_listener_log=file_listener_log_resolved,
            )
            wait_for_listener(port, file_listener_log=file_listener_log_resolved)
            desktop = connect_desktop(uno_module, port)
            doc: UnoTextDocument | None = None
            try:
                doc = load_uno_document_or_raise(
                    uno_module=uno_module,
                    desktop=desktop,
                    file_in_docx_source=file_in_docx,
                    file_in_docx_staged=file_in_docx_staged,
                    exe_libreoffice=options.exe_libreoffice,
                    dir_user_profile=options.dir_user_profile,
                    process_listener=process_listener,
                    file_listener_log=file_listener_log_resolved,
                    file_source_lock=find_source_lock_file(file_in_docx),
                )
                refresh_uno_document_fields(doc)
                doc.store()
            finally:
                close_document(doc)
        finally:
            # Nested finally: the log handle is closed even if stopping the
            # listener itself raises.
            try:
                if process_listener is not None:
                    terminate_process(process_listener)
            finally:
                if handle_listener_log is not None:
                    handle_listener_log.close()
        # Only reached when everything above succeeded.
        shutil.copy2(file_in_docx_staged, file_out_docx)
260
+
261
+
262
def import_uno_module() -> Any:
    """Import and return the ``uno`` module.

    Raises:
        RuntimeError: ``uno`` cannot be imported; the message carries an
            error code, a reason and the runtime guidance fields.
    """
    try:
        uno_module = importlib.import_module("uno")
    except ImportError as exc:
        message_lines = [
            "error_code=libreoffice_uno_import_failed",
            "reason=UNO Python bindings are not importable in this Python "
            "environment.",
        ]
        message_lines.extend(create_libreoffice_runtime_guidance_fields())
        raise RuntimeError("\n".join(message_lines)) from exc
    return uno_module
276
+
277
+
278
def create_libreoffice_runtime_guidance_fields() -> list[str]:
    """Return ``key=value`` guidance lines appended to diagnostic errors.

    The lines cover the external runtime dependency, validation commands,
    an install command and documentation links.
    """
    dependency_note = (
        "runtime_dependency=LibreOffice and Python-UNO are external runtime "
        "dependencies; docxrender does not install them through a Python "
        "package extra."
    )
    command_hints = [
        "validate_libreoffice=libreoffice --headless --version",
        'validate_uno=python -c "import uno"',
        "install_debian_ubuntu=sudo apt install libreoffice python3-uno",
    ]
    documentation_links = [
        f"docs_libreoffice_parameters={URL_LIBREOFFICE_PARAMETERS}",
        f"docs_libreoffice_api={URL_LIBREOFFICE_API}",
        f"docs_debian_python_uno={URL_DEBIAN_PYTHON_UNO}",
    ]
    return [dependency_note, *command_hints, *documentation_links]
290
+
291
+
292
def validate_libreoffice_executable(exe_libreoffice: Path) -> None:
    """Ensure ``exe_libreoffice`` exists and is a file.

    Raises:
        FileNotFoundError: The path does not exist.
        RuntimeError: The path exists but is not a file.
    """
    if not exe_libreoffice.exists():
        error_type, error_name = FileNotFoundError, "libreoffice_executable_missing"
    elif not exe_libreoffice.is_file():
        error_type, error_name = RuntimeError, "libreoffice_executable_not_file"
    else:
        return

    message_lines = [
        f"error_code={error_name}",
        f"exe_libreoffice={exe_libreoffice.resolve()}",
        *create_libreoffice_runtime_guidance_fields(),
    ]
    raise error_type("\n".join(message_lines))
313
+
314
+
315
def start_libreoffice_listener(
    *,
    exe_libreoffice: Path,
    dir_user_profile: Path,
    port: int,
    stdout: BufferedWriter | None,
    stderr: BufferedWriter | None,
    file_listener_log: Path | None,
) -> subprocess.Popen[bytes]:
    """Validate the executable and spawn the LibreOffice listener process.

    Args:
        exe_libreoffice: Executable to launch.
        dir_user_profile: Profile directory passed on to the command builder.
        port: Listener port passed on to the command builder.
        stdout: Stream handed to ``Popen`` as the child's stdout.
        stderr: Stream handed to ``Popen`` as the child's stderr.
        file_listener_log: Log path, used only to label error messages.

    Returns:
        The started ``subprocess.Popen`` object.

    Raises:
        RuntimeError: ``Popen`` raised ``FileNotFoundError`` or
            ``PermissionError``.
    """
    validate_libreoffice_executable(exe_libreoffice)
    argv = create_libreoffice_listener_command(
        exe_libreoffice=exe_libreoffice,
        dir_user_profile=dir_user_profile,
        port=port,
    )
    try:
        process = subprocess.Popen(argv, stdout=stdout, stderr=stderr)
    except (FileNotFoundError, PermissionError) as exc:
        message_lines = [
            "error_code=libreoffice_listener_start_failed",
            f"exe_libreoffice={exe_libreoffice.resolve()}",
            f"dir_user_profile={dir_user_profile.resolve()}",
            f"listener_log={listener_log_label(file_listener_log)}",
            f"launch_error={type(exc).__name__}: {exc}",
        ]
        message_lines.extend(create_libreoffice_runtime_guidance_fields())
        raise RuntimeError("\n".join(message_lines)) from exc
    return process
345
+
346
+
347
def select_free_port() -> int:
    """Return a port number the OS assigned when binding to port 0.

    The probing socket is closed before returning, so the port is not
    reserved for the caller.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        # Port 0 lets the OS choose the port.
        probe.bind((LISTENER_HOST, 0))
        bound_address = probe.getsockname()
        return int(bound_address[1])
351
+
352
+
353
def wait_for_listener(port: int, *, file_listener_log: Path | None = None) -> None:
    """Block until a TCP connection to the listener succeeds.

    Args:
        port: Listener port on ``LISTENER_HOST``.
        file_listener_log: Optional log file whose tail is added to the error.

    Raises:
        TimeoutError: No connection succeeded within
            ``LISTENER_START_TIMEOUT_SECONDS``.
    """
    endpoint = (LISTENER_HOST, port)
    deadline = time.monotonic() + LISTENER_START_TIMEOUT_SECONDS
    while True:
        if time.monotonic() >= deadline:
            break
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.settimeout(LISTENER_POLL_INTERVAL_SECONDS)
            connected = probe.connect_ex(endpoint) == 0
        if connected:
            return
        time.sleep(LISTENER_POLL_INTERVAL_SECONDS)

    message_lines = [
        "error_code=libreoffice_uno_listener_timeout",
        f"listener_host={LISTENER_HOST}",
        f"listener_port={port}",
        f"listener_log={listener_log_label(file_listener_log)}",
    ]
    log_tail = read_log_tail(file_listener_log)
    if log_tail:
        message_lines.append(f"listener_log_tail={format_log_field(log_tail)}")
    message_lines += create_libreoffice_runtime_guidance_fields()
    raise TimeoutError("\n".join(message_lines))
372
+
373
+
374
def create_property(name: str, value: object) -> Any:
    """Return a ``com.sun.star.beans.PropertyValue`` with ``Name`` and ``Value`` set."""
    beans = importlib.import_module("com.sun.star.beans")
    property_value = beans.PropertyValue()
    setattr(property_value, "Name", name)
    setattr(property_value, "Value", value)
    return property_value
380
+
381
+
382
def copy_docx_to_stage(file_in_docx: Path, *, dir_stage: Path) -> Path:
    """Copy ``file_in_docx`` into ``dir_stage`` under the same file name.

    Returns:
        The path of the staged copy.
    """
    staged_path = dir_stage.joinpath(file_in_docx.name)
    shutil.copy2(file_in_docx, staged_path)
    return staged_path
386
+
387
+
388
def find_source_lock_file(file_in_docx: Path) -> Path | None:
    """Return the ``.~lock.<name>#`` file beside the input, or ``None`` if absent."""
    lock_name = f".~lock.{file_in_docx.name}#"
    candidate = file_in_docx.parent / lock_name
    return candidate if candidate.exists() else None
393
+
394
+
395
def open_listener_log_handle(
    file_listener_log: Path | None,
) -> tuple[
    Path | None,
    BufferedWriter | None,
    BufferedWriter | None,
    BufferedWriter | None,
]:
    """Open the listener log for binary append, if one was requested.

    Returns:
        ``(resolved_path, handle, stdout, stderr)``. All four are ``None``
        when no log file is given. Otherwise the last three are the same
        open handle, which the caller is responsible for closing.
    """
    if file_listener_log is None:
        return (None,) * 4

    log_path = file_listener_log.resolve()
    log_path.parent.mkdir(parents=True, exist_ok=True)
    log_handle = log_path.open("ab")
    # One handle serves as the owner reference and as both output streams.
    return log_path, log_handle, log_handle, log_handle
414
+
415
+
416
def connect_desktop(uno_module: Any, port: int) -> UnoDesktop:
    """Resolve the remote component context and return its Desktop service.

    Args:
        uno_module: The imported ``uno`` module.
        port: Port of the listener on ``LISTENER_HOST``.
    """
    local_context = uno_module.getComponentContext()
    local_services = local_context.ServiceManager
    resolver = local_services.createInstanceWithContext(
        "com.sun.star.bridge.UnoUrlResolver",
        local_context,
    )
    connection_url = (
        f"uno:socket,host={LISTENER_HOST},port={port};urp;StarOffice.ComponentContext"
    )
    remote_context = resolver.resolve(connection_url)
    remote_services = remote_context.ServiceManager
    return remote_services.createInstanceWithContext(
        "com.sun.star.frame.Desktop",
        remote_context,
    )
429
+
430
+
431
def refresh_uno_document_fields(doc: UnoTextDocument) -> None:
    """Refresh the document, its links, every document index and its text fields.

    The call order is: refresh, update links, update each index in position
    order, refresh text fields, then refresh the document once more.
    """
    doc.refresh()
    doc.updateLinks()

    index_collection = doc.getDocumentIndexes()
    index_count = index_collection.getCount()
    # map() is lazy, so each index is fetched and updated before the next one.
    for index_entry in map(index_collection.getByIndex, range(index_count)):
        index_entry.update()

    text_fields = doc.getTextFields()
    text_fields.refresh()
    doc.refresh()
439
+
440
+
441
def load_uno_document_or_raise(
    *,
    uno_module: Any,
    desktop: UnoDesktop,
    file_in_docx_source: Path,
    file_in_docx_staged: Path,
    exe_libreoffice: Path,
    dir_user_profile: Path,
    process_listener: ListenerProcess,
    file_listener_log: Path | None,
    file_source_lock: Path | None,
) -> UnoTextDocument:
    """Load the staged DOCX through ``desktop`` or raise a diagnostic error.

    An empty Writer document is loaded and closed first as a probe. The
    staged file is then tried with the default properties and, failing
    that, with only ``Hidden`` set.

    Args:
        uno_module: The imported ``uno`` module (used for URL conversion).
        desktop: Desktop object used to load documents.
        file_in_docx_source: Original input path; used only for diagnostics.
        file_in_docx_staged: Staged copy that is actually loaded.
        exe_libreoffice: Used only for diagnostics.
        dir_user_profile: Used only for diagnostics.
        process_listener: Listener process; used only for diagnostics.
        file_listener_log: Used only for diagnostics.
        file_source_lock: Used only for diagnostics.

    Returns:
        The loaded document.

    Raises:
        RuntimeError: Both load attempts returned ``None``; the message is
            the newline-joined output of ``create_load_failure_fields``.
    """
    file_url = uno_module.systemPathToFileUrl(str(file_in_docx_staged.resolve()))
    props_default = (
        create_property("Hidden", True),
        create_property("ReadOnly", False),
        create_property("UpdateDocMode", 1),
    )
    props_hidden_only = (create_property("Hidden", True),)

    # Probe: can the listener create an empty Writer document at all?
    probe_document = load_document_with_retry(
        desktop=desktop,
        url="private:factory/swriter",
        properties=props_hidden_only,
    )
    probe_ok = probe_document is not None
    if probe_ok:
        close_document(probe_document)

    for load_properties in (props_default, props_hidden_only):
        loaded = load_document_with_retry(
            desktop=desktop,
            url=file_url,
            properties=load_properties,
        )
        if loaded is not None:
            return loaded

    # Reaching this point means both staged-file attempts returned None.
    failure_fields = create_load_failure_fields(
        file_in_docx_source=file_in_docx_source,
        file_in_docx_staged=file_in_docx_staged,
        file_url=file_url,
        exe_libreoffice=exe_libreoffice,
        dir_user_profile=dir_user_profile,
        process_listener=process_listener,
        file_listener_log=file_listener_log,
        file_source_lock=file_source_lock,
        probe_ok=probe_ok,
        load_default_ok=False,
        load_hidden_only_ok=False,
    )
    raise RuntimeError("\n".join(failure_fields))
505
+
506
+
507
def load_document_with_retry(
    *,
    desktop: UnoDesktop,
    url: str,
    properties: tuple[Any, ...],
) -> UnoTextDocument | None:
    """Call ``loadComponentFromURL`` until it returns a document or time runs out.

    At least one attempt is always made. Attempts are spaced by
    ``DOCUMENT_LOAD_POLL_INTERVAL_SECONDS`` and stop once
    ``DOCUMENT_LOAD_TIMEOUT_SECONDS`` has elapsed.

    Returns:
        The loaded document, or ``None`` if every attempt returned ``None``.
    """
    deadline = time.monotonic() + DOCUMENT_LOAD_TIMEOUT_SECONDS
    loaded = desktop.loadComponentFromURL(url, "_blank", 0, properties)
    while loaded is None and time.monotonic() < deadline:
        time.sleep(DOCUMENT_LOAD_POLL_INTERVAL_SECONDS)
        loaded = desktop.loadComponentFromURL(url, "_blank", 0, properties)
    return loaded
521
+
522
+
523
def create_load_failure_fields(
    *,
    file_in_docx_source: Path,
    file_in_docx_staged: Path,
    file_url: str,
    exe_libreoffice: Path,
    dir_user_profile: Path,
    process_listener: ListenerProcess,
    file_listener_log: Path | None,
    file_source_lock: Path | None,
    probe_ok: bool,
    load_default_ok: bool,
    load_hidden_only_ok: bool,
) -> list[str]:
    """Build the ``key=value`` lines describing a failed document load.

    The reason code is chosen in priority order: a failed probe, then a
    listener exit code other than ``None`` or ``0``, then both staged
    loads having failed, and otherwise an unknown failure.

    Returns:
        Diagnostic lines followed by the runtime guidance fields.
    """

    def status(ok: bool) -> str:
        return "ok" if ok else "failed"

    exit_code = process_listener.poll()
    log_tail = read_log_tail(file_listener_log)

    reason_code = "unknown_load_failure"
    if not probe_ok:
        reason_code = "uno_writer_probe_failed"
    elif exit_code not in (None, 0):
        reason_code = "listener_exited"
    elif not (load_default_ok or load_hidden_only_ok):
        reason_code = "staged_docx_import_failed"

    lock_present = file_source_lock is not None
    lines = [
        "error_code=libreoffice_uno_load_failed",
        f"reason_code={reason_code}",
        f"file_in_docx={file_in_docx_source.resolve()}",
        f"file_in_docx_staged={file_in_docx_staged.resolve()}",
        f"file_url={file_url}",
        f"exe_libreoffice={exe_libreoffice.resolve()}",
        f"dir_user_profile={dir_user_profile.resolve()}",
        f"listener_exit_code={exit_code}",
        f"listener_log={listener_log_label(file_listener_log)}",
        f"source_lock_file_present={lock_present}",
        f"probe_swriter_factory={status(probe_ok)}",
        f"load_staged_default_props={status(load_default_ok)}",
        f"load_staged_hidden_only={status(load_hidden_only_ok)}",
        f"staged_docx_size_bytes={file_in_docx_staged.stat().st_size}",
    ]
    if lock_present:
        lines.append(f"source_lock_file={file_source_lock.resolve()}")
    if log_tail:
        lines.append(f"listener_log_tail={format_log_field(log_tail)}")
    lines += create_libreoffice_runtime_guidance_fields()
    return lines
571
+
572
+
573
def read_log_tail(file_log: Path | None, *, max_bytes: int = 4000) -> str:
    """Return up to the last ``max_bytes`` bytes of ``file_log`` as stripped text.

    Bytes are decoded as UTF-8 with undecodable sequences replaced. An
    empty string is returned when no path is given or the file is missing.
    """
    if file_log is None:
        return ""
    if not file_log.exists():
        return ""

    with file_log.open("rb") as stream:
        # seek() returns the new absolute position, i.e. the file size here.
        total_bytes = stream.seek(0, 2)
        start_offset = total_bytes - max_bytes if total_bytes > max_bytes else 0
        stream.seek(start_offset)
        raw_tail = stream.read()
    return raw_tail.decode("utf-8", errors="replace").strip()
581
+
582
+
583
def format_log_field(value: object) -> str:
    """Render ``value`` as text with each newline replaced by a literal ``\\n``."""
    segments = str(value).split("\n")
    return r"\n".join(segments)
585
+
586
+
587
def listener_log_label(file_listener_log: Path | None) -> str:
    """Return the log path as text, or ``"stderr"`` when no log file is set."""
    return "stderr" if file_listener_log is None else str(file_listener_log)
591
+
592
+
593
def close_document(doc: UnoTextDocument | None) -> None:
    """Close ``doc`` if it has ``close``, otherwise dispose of it.

    ``None`` is accepted and ignored so the function can be called from a
    ``finally`` block whether or not a document was loaded.
    """
    if doc is not None:
        if not hasattr(doc, "close"):
            doc.dispose()
        else:
            doc.close(True)
600
+
601
+
602
def terminate_process(process: ListenerProcess) -> None:
    """Terminate ``process``, escalating to ``kill`` if it does not exit in time.

    Each wait is bounded to five seconds. A timeout on the wait that follows
    ``kill`` is not caught here.
    """
    grace_seconds = 5
    process.terminate()
    try:
        process.wait(timeout=grace_seconds)
    except subprocess.TimeoutExpired:
        # Termination was not honoured within the grace period; force the exit.
        process.kill()
        process.wait(timeout=grace_seconds)