pdf2docx-plus 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. pdf2docx_plus-0.6.1/.gitignore +32 -0
  2. pdf2docx_plus-0.6.1/LICENSE +7 -0
  3. pdf2docx_plus-0.6.1/LICENSING.md +70 -0
  4. pdf2docx_plus-0.6.1/PKG-INFO +236 -0
  5. pdf2docx_plus-0.6.1/README.md +170 -0
  6. pdf2docx_plus-0.6.1/docs/README.md +34 -0
  7. pdf2docx_plus-0.6.1/pdf2docx_plus/__init__.py +41 -0
  8. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/__init__.py +6 -0
  9. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/__init__.py +3 -0
  10. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/Block.py +144 -0
  11. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/Collection.py +359 -0
  12. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/Element.py +312 -0
  13. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/__init__.py +0 -0
  14. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/algorithm.py +403 -0
  15. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/constants.py +90 -0
  16. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/docx.py +591 -0
  17. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/share.py +310 -0
  18. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/converter.py +481 -0
  19. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/font/Fonts.py +240 -0
  20. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/font/__init__.py +0 -0
  21. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/gui/App.py +37 -0
  22. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/gui/MainFrame.py +147 -0
  23. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/gui/__init__.py +0 -0
  24. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/image/Image.py +94 -0
  25. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/image/ImageBlock.py +81 -0
  26. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/image/ImageSpan.py +27 -0
  27. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/image/ImagesExtractor.py +496 -0
  28. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/image/__init__.py +0 -0
  29. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/Blocks.py +650 -0
  30. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/Column.py +49 -0
  31. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/Layout.py +177 -0
  32. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/Section.py +97 -0
  33. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/Sections.py +91 -0
  34. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/__init__.py +0 -0
  35. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/main.py +135 -0
  36. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/BasePage.py +27 -0
  37. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/Page.py +211 -0
  38. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/Pages.py +90 -0
  39. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/RawPage.py +279 -0
  40. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/RawPageFactory.py +23 -0
  41. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/RawPageFitz.py +164 -0
  42. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/__init__.py +0 -0
  43. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/shape/Path.py +405 -0
  44. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/shape/Paths.py +142 -0
  45. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/shape/Shape.py +365 -0
  46. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/shape/Shapes.py +241 -0
  47. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/shape/__init__.py +0 -0
  48. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/Border.py +419 -0
  49. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/Cell.py +165 -0
  50. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/Cells.py +27 -0
  51. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/Row.py +78 -0
  52. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/Rows.py +25 -0
  53. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/TableBlock.py +174 -0
  54. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/TableStructure.py +634 -0
  55. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/TablesConstructor.py +382 -0
  56. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/__init__.py +0 -0
  57. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/Char.py +65 -0
  58. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/Line.py +179 -0
  59. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/Lines.py +281 -0
  60. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/Spans.py +59 -0
  61. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/TextBlock.py +471 -0
  62. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/TextSpan.py +439 -0
  63. pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/__init__.py +0 -0
  64. pdf2docx_plus-0.6.1/pdf2docx_plus/api.py +870 -0
  65. pdf2docx_plus-0.6.1/pdf2docx_plus/backends/__init__.py +124 -0
  66. pdf2docx_plus-0.6.1/pdf2docx_plus/cli.py +145 -0
  67. pdf2docx_plus-0.6.1/pdf2docx_plus/consolidate.py +73 -0
  68. pdf2docx_plus-0.6.1/pdf2docx_plus/emit/__init__.py +60 -0
  69. pdf2docx_plus-0.6.1/pdf2docx_plus/emit/headers_footers.py +111 -0
  70. pdf2docx_plus-0.6.1/pdf2docx_plus/emit/lists.py +229 -0
  71. pdf2docx_plus-0.6.1/pdf2docx_plus/emit/page_breaks.py +57 -0
  72. pdf2docx_plus-0.6.1/pdf2docx_plus/emit/page_footer.py +259 -0
  73. pdf2docx_plus-0.6.1/pdf2docx_plus/emit/sections.py +252 -0
  74. pdf2docx_plus-0.6.1/pdf2docx_plus/emit/table_fit.py +254 -0
  75. pdf2docx_plus-0.6.1/pdf2docx_plus/emit/tables_cleanup.py +302 -0
  76. pdf2docx_plus-0.6.1/pdf2docx_plus/emit/whitespace.py +55 -0
  77. pdf2docx_plus-0.6.1/pdf2docx_plus/emit/word_spacing.py +119 -0
  78. pdf2docx_plus-0.6.1/pdf2docx_plus/errors.py +53 -0
  79. pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/__init__.py +25 -0
  80. pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/crashguards.py +217 -0
  81. pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/hyperlink.py +56 -0
  82. pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/styles.py +31 -0
  83. pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/text.py +38 -0
  84. pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/tty.py +22 -0
  85. pdf2docx_plus-0.6.1/pdf2docx_plus/hooks/__init__.py +29 -0
  86. pdf2docx_plus-0.6.1/pdf2docx_plus/hooks/formula_ocr.py +82 -0
  87. pdf2docx_plus-0.6.1/pdf2docx_plus/hooks/layout_detection.py +43 -0
  88. pdf2docx_plus-0.6.1/pdf2docx_plus/hooks/ocr.py +38 -0
  89. pdf2docx_plus-0.6.1/pdf2docx_plus/hooks/table_transformer.py +107 -0
  90. pdf2docx_plus-0.6.1/pdf2docx_plus/images/__init__.py +40 -0
  91. pdf2docx_plus-0.6.1/pdf2docx_plus/images/recovery.py +285 -0
  92. pdf2docx_plus-0.6.1/pdf2docx_plus/layout/__init__.py +20 -0
  93. pdf2docx_plus-0.6.1/pdf2docx_plus/layout/hf_detect.py +158 -0
  94. pdf2docx_plus-0.6.1/pdf2docx_plus/layout/lists.py +103 -0
  95. pdf2docx_plus-0.6.1/pdf2docx_plus/layout/scanned.py +76 -0
  96. pdf2docx_plus-0.6.1/pdf2docx_plus/logging.py +43 -0
  97. pdf2docx_plus-0.6.1/pdf2docx_plus/plugins/__init__.py +36 -0
  98. pdf2docx_plus-0.6.1/pdf2docx_plus/plugins/base.py +62 -0
  99. pdf2docx_plus-0.6.1/pdf2docx_plus/plugins/registry.py +45 -0
  100. pdf2docx_plus-0.6.1/pdf2docx_plus/py.typed +0 -0
  101. pdf2docx_plus-0.6.1/pdf2docx_plus/server.py +90 -0
  102. pdf2docx_plus-0.6.1/pdf2docx_plus/styles/__init__.py +144 -0
  103. pdf2docx_plus-0.6.1/pdf2docx_plus/tables/__init__.py +19 -0
  104. pdf2docx_plus-0.6.1/pdf2docx_plus/tables/float_images.py +97 -0
  105. pdf2docx_plus-0.6.1/pdf2docx_plus/tables/stitch.py +219 -0
  106. pdf2docx_plus-0.6.1/pdf2docx_plus/version.py +1 -0
  107. pdf2docx_plus-0.6.1/pyproject.toml +106 -0
@@ -0,0 +1,32 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .eggs/
5
+ dist/
6
+ build/
7
+ .venv/
8
+ venv/
9
+ env/
10
+ .mypy_cache/
11
+ .ruff_cache/
12
+ .pytest_cache/
13
+ .coverage
14
+ htmlcov/
15
+
16
+ # bench outputs
17
+ bench/reports/outputs/
18
+ bench/reports/*.json
19
+ !bench/reports/.gitkeep
20
+
21
+ # legacy upstream patterns
22
+ *.jp*g
23
+ layout.json
24
+ .vscode/
25
+ test/issues/
26
+ test/features/
27
+ test/outputs/
28
+ diff.png
29
+ pdf2docx*.rst
30
+
31
+ .env
32
+ .DS_Store
@@ -0,0 +1,7 @@
1
+ Copyright (c) 2026 Artifex Software, Inc.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,70 @@
1
+ # Licensing
2
+
3
+ `pdf2docx-plus` is MIT-licensed (see `LICENSE`). **However, it depends on PyMuPDF,
4
+ which is AGPL-3.0.** This document describes the practical consequences.
5
+
6
+ ## Dependency license matrix
7
+
8
+ | Package | License | Shipped with | Note |
9
+ |---|---|---|---|
10
+ | pdf2docx-plus (this project) | MIT | core | |
11
+ | pdf2docx (vendored patched upstream) | MIT | core | Artifex / dothinking |
12
+ | PyMuPDF (fitz) | **AGPL-3.0** | core | **See AGPL section below** |
13
+ | python-docx | MIT | core | |
14
+ | fonttools | MIT | core | |
15
+ | numpy | BSD-3-Clause | core | |
16
+ | opencv-python-headless | Apache-2.0 | core | |
17
+ | fire | Apache-2.0 | core | |
18
+ | fastapi / uvicorn | MIT / BSD-3-Clause | `rest` extra | |
19
+ | apted | MIT | `bench` extra | |
20
+ | scikit-image | BSD-3-Clause | `bench` extra | |
21
+ | Table Transformer weights | MIT | `ml-tables` extra | |
22
+ | pix2tex / LaTeX-OCR | MIT | `ml-formula` extra | |
23
+ | PaddleOCR | Apache-2.0 | `ml-ocr` extra | |
24
+ | UniMERNet | Apache-2.0 | (optional, manual) | |
25
+
26
+ ## AGPL implications (PyMuPDF)
27
+
28
+ PyMuPDF is distributed under **AGPL-3.0**. When `pdf2docx-plus` is redistributed
29
+ or offered as a network service, the AGPL copyleft reaches through to the
30
+ consumer of that service:
31
+
32
+ - If you **ship pdf2docx-plus inside a closed-source product**, you need a
33
+ commercial PyMuPDF license from Artifex.
34
+ - If you **offer pdf2docx-plus as a SaaS/network service** to third parties,
35
+ the AGPL requires you to make the corresponding source (including your app)
36
+ available to those users.
37
+ - **Internal use** inside a single organisation is typically fine under AGPL.
38
+
39
+ ## Migrating away from PyMuPDF (future work)
40
+
41
+ The parse layer is isolated behind the `pdf2docx_plus.backends` abstraction so
42
+ the fitz dependency can be swapped for an Apache-2.0 / MIT alternative:
43
+
44
+ - **`pypdfium2`** (Apache-2.0): Google PDFium bindings. Exposes text with
45
+ positioning and page rendering but does *not* provide the rich
46
+ block/line/span extraction or path extraction that the current pipeline
47
+ relies on. A swap requires re-implementing that extraction logic (an
48
+ estimated 3-4 weeks of work) using `pypdfium2`, plus `pdfplumber` (MIT) for ruling-line tables.
49
+ - **`pdfminer.six`** (MIT): slower but full text/layout extraction. Could be
50
+ a drop-in for many text paths.
51
+
52
+ The `pdf2docx_plus.backends.Backend` Protocol is the seam. When a permissive
53
+ backend is implemented, the same high-level API keeps working and AGPL falls
54
+ away from the default distribution.
55
+
56
+ ## OCR / ML model weights
57
+
58
+ Some ML models that users may wire into the optional integrations carry
59
+ **non-commercial or research-only** weight licenses:
60
+
61
+ - **LayoutLMv3 weights**: CC-BY-NC-SA-4.0 — **not safe for commercial use**.
62
+ `pdf2docx-plus` does NOT ship or auto-download these.
63
+ - **Nougat (Meta) weights**: CC-BY-NC-4.0 — **not safe for commercial use**.
64
+ - **Surya / Marker weights**: OpenRAIL-M with a revenue cap. Safe up to the
65
+ cap; verify before relying on them in production.
66
+
67
+ The default `ml-*` extras pin only permissively-licensed models
68
+ (Table Transformer, pix2tex, PaddleOCR, UniMERNet). Users who wire in their
69
+ own detectors via the plugin API are responsible for their own weight
70
+ licensing.
@@ -0,0 +1,236 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdf2docx-plus
3
+ Version: 0.6.1
4
+ Summary: Hardened PDF->DOCX converter. Fork of pdf2docx with stability fixes, typed API, plugin architecture, and optional ML layout/OCR/table backends.
5
+ Project-URL: Homepage, https://github.com/mithunvoe/pdf2docx-plus
6
+ Project-URL: Issues, https://github.com/mithunvoe/pdf2docx-plus/issues
7
+ Project-URL: Upstream, https://github.com/ArtifexSoftware/pdf2docx
8
+ Author: pdf2docx-plus maintainers
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: convert,docx,ocr,pdf,table,word
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Office/Business
18
+ Classifier: Topic :: Text Processing :: Markup
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: fire>=0.5.0
21
+ Requires-Dist: fonttools>=4.24.0
22
+ Requires-Dist: numpy>=1.24.0
23
+ Requires-Dist: opencv-python-headless>=4.8
24
+ Requires-Dist: pymupdf>=1.24.0
25
+ Requires-Dist: python-docx>=1.1.0
26
+ Requires-Dist: typing-extensions>=4.10
27
+ Provides-Extra: all
28
+ Requires-Dist: apted>=1.0.3; extra == 'all'
29
+ Requires-Dist: fastapi>=0.110; extra == 'all'
30
+ Requires-Dist: pillow>=10.0; extra == 'all'
31
+ Requires-Dist: python-multipart>=0.0.9; extra == 'all'
32
+ Requires-Dist: scikit-image>=0.22; extra == 'all'
33
+ Requires-Dist: scipy>=1.11; extra == 'all'
34
+ Requires-Dist: uvicorn[standard]>=0.27; extra == 'all'
35
+ Provides-Extra: bench
36
+ Requires-Dist: apted>=1.0.3; extra == 'bench'
37
+ Requires-Dist: pillow>=10.0; extra == 'bench'
38
+ Requires-Dist: scikit-image>=0.22; extra == 'bench'
39
+ Requires-Dist: scipy>=1.11; extra == 'bench'
40
+ Provides-Extra: dev
41
+ Requires-Dist: mypy>=1.10; extra == 'dev'
42
+ Requires-Dist: pre-commit>=3.6; extra == 'dev'
43
+ Requires-Dist: pytest-cov>=4.1; extra == 'dev'
44
+ Requires-Dist: pytest-timeout>=2.2; extra == 'dev'
45
+ Requires-Dist: pytest>=8.0; extra == 'dev'
46
+ Requires-Dist: ruff>=0.6; extra == 'dev'
47
+ Requires-Dist: types-setuptools; extra == 'dev'
48
+ Provides-Extra: ml-formula
49
+ Requires-Dist: pix2tex>=0.1.4; extra == 'ml-formula'
50
+ Requires-Dist: torch>=2.2; extra == 'ml-formula'
51
+ Provides-Extra: ml-layout
52
+ Requires-Dist: torch>=2.2; extra == 'ml-layout'
53
+ Requires-Dist: transformers>=4.40; extra == 'ml-layout'
54
+ Provides-Extra: ml-ocr
55
+ Requires-Dist: paddleocr>=2.7; extra == 'ml-ocr'
56
+ Requires-Dist: paddlepaddle>=2.6; extra == 'ml-ocr'
57
+ Provides-Extra: ml-tables
58
+ Requires-Dist: timm>=0.9; extra == 'ml-tables'
59
+ Requires-Dist: torch>=2.2; extra == 'ml-tables'
60
+ Requires-Dist: transformers>=4.40; extra == 'ml-tables'
61
+ Provides-Extra: rest
62
+ Requires-Dist: fastapi>=0.110; extra == 'rest'
63
+ Requires-Dist: python-multipart>=0.0.9; extra == 'rest'
64
+ Requires-Dist: uvicorn[standard]>=0.27; extra == 'rest'
65
+ Description-Content-Type: text/markdown
66
+
67
+ # pdf2docx-plus
68
+
69
+ Hardened fork of [pdf2docx](https://github.com/ArtifexSoftware/pdf2docx) — a
70
+ Python PDF → DOCX converter that actually writes editable Word documents
71
+ (not Markdown, not HTML).
72
+
73
+ **What's different from upstream**
74
+
75
+ | | upstream `pdf2docx` | `pdf2docx-plus` |
76
+ |---|---|---|
77
+ | Python support | 3.10+ | **3.11 / 3.12 / 3.13** |
78
+ | Hyperlink OOXML | nested inside `<w:r>` (invalid) | paragraph-level `<w:hyperlink>` (valid) |
79
+ | NULL-byte / control chars | sometimes leak into `<w:t>`, corrupting the DOCX | stripped at run insertion |
80
+ | Errors | single `ConversionException` | `InputError` / `ParseError` / `MakeDocxError` / `PasswordRequired` / `TimeoutExceeded` |
81
+ | Typed API | no | `py.typed`, dataclasses, `Protocol`-based plugins |
82
+ | Return value | `None` | `ConversionResult` with per-page accounting |
83
+ | Timeout | none (can hang forever) | `timeout_s=` watchdog |
84
+ | Plugin architecture | no | swap table / layout / OCR / formula backends |
85
+ | REST server | no | `pdf2docx-plus serve` (FastAPI, optional) |
86
+ | ML hooks (opt-in) | no | Table Transformer, Granite-Docling, PaddleOCR, pix2tex |
87
+ | Tables → CSV | no | `--tables-csv DIR` |
88
+ | Structured logging | hijacks root logger | scoped `pdf2docx_plus` logger |
89
+
90
+ ## Install
91
+
92
+ ```bash
93
+ pip install pdf2docx-plus # core
94
+ pip install 'pdf2docx-plus[rest]' # + FastAPI server
95
+ pip install 'pdf2docx-plus[bench]' # + evaluation harness
96
+ pip install 'pdf2docx-plus[ml-tables]' # + Table Transformer (torch)
97
+ pip install 'pdf2docx-plus[ml-ocr]' # + PaddleOCR
98
+ ```
99
+
100
+ ## Quick start
101
+
102
+ ```python
103
+ from pdf2docx_plus import convert
104
+
105
+ result = convert("in.pdf", "out.docx", timeout_s=120)
106
+ print(result.pages_ok, "/", result.pages_total, "pages in", result.elapsed_s, "s")
107
+ ```
108
+
109
+ Or with more control:
110
+
111
+ ```python
112
+ from pdf2docx_plus import Converter, PluginRegistry
113
+ from pdf2docx_plus.hooks import TableTransformerDetector
114
+
115
+ plugins = PluginRegistry()
116
+ plugins.add_table_detector(TableTransformerDetector(device="cuda"))
117
+
118
+ with Converter("in.pdf", password="s3cret") as cv:
119
+ result = cv.convert(
120
+ "out.docx",
121
+ pages=[0, 1, 2],
122
+ profile="fidelity", # "fast" | "fidelity" | "semantic"
123
+ timeout_s=60,
124
+ continue_on_error=True,
125
+ )
126
+ for p in result.page_results:
127
+ if not p.ok:
128
+ print(f"page {p.page_index}: {p.error}")
129
+ ```
130
+
131
+ ## CLI
132
+
133
+ ```
134
+ pdf2docx-plus convert in.pdf out.docx --timeout 120 --profile fidelity
135
+ pdf2docx-plus convert in.pdf --pages 0,2,5 --tables-csv tables/
136
+ pdf2docx-plus extract-tables in.pdf --out tables.json
137
+ pdf2docx-plus serve --host 0.0.0.0 --port 8000
138
+ pdf2docx-plus version
139
+ ```
140
+
141
+ ## REST server
142
+
143
+ ```bash
144
+ pip install 'pdf2docx-plus[rest]'
145
+ pdf2docx-plus serve --port 8000
146
+ # in another shell:
147
+ curl -F file=@in.pdf -F profile=fidelity http://localhost:8000/convert -o out.docx
148
+ ```
149
+
150
+ Endpoints:
151
+
152
+ | Method | Path | Body | Returns |
153
+ |---|---|---|---|
154
+ | POST | `/convert` | multipart `file`, optional `password`, `profile`, `timeout_s` | DOCX bytes + `X-Pages-Ok` / `X-Pages-Failed` / `X-Elapsed-Seconds` headers |
155
+ | POST | `/extract-tables` | multipart `file`, optional `password` | JSON `{"tables": [...]}` |
156
+ | GET | `/healthz` | — | `{"status": "ok"}` |
157
+ | GET | `/version` | — | `{"version": "..."}` |
158
+
159
+ ## Plugin architecture
160
+
161
+ Four extension points, all `Protocol`-based:
162
+
163
+ ```python
164
+ from pdf2docx_plus.plugins import (
165
+ TableDetector, LayoutDetector, OcrEngine, FormulaRecognizer
166
+ )
167
+ ```
168
+
169
+ Register any implementation on `PluginRegistry` and pass it to `Converter`.
170
+ Plugins never kill a conversion — exceptions raised inside a plugin are
171
+ logged and skipped.
172
+
173
+ Built-in ML hooks (opt-in extras):
174
+
175
+ | Hook | Backend | Extra | Weights license |
176
+ |---|---|---|---|
177
+ | `TableTransformerDetector` | HuggingFace `microsoft/table-transformer-*` | `ml-tables` | MIT |
178
+ | `GraniteDoclingLayoutDetector` | `ibm-granite/granite-docling-258M` | `ml-layout` | Apache-2.0 |
179
+ | `PaddleOcrEngine` | PaddleOCR | `ml-ocr` | Apache-2.0 |
180
+ | `Pix2TexFormulaRecognizer` | pix2tex | `ml-formula` | MIT |
181
+ | `UniMERNetFormulaRecognizer` | UniMERNet (bring weights) | manual | Apache-2.0 |
182
+
183
+ ## Benchmark
184
+
185
+ ```bash
186
+ pip install 'pdf2docx-plus[bench]'
187
+ python -m bench.run --corpus bench/corpus --out bench/reports/latest.json
188
+ ```
189
+
190
+ Metrics implemented: text F1, TEDS (`apted`), reading-order Kendall-tau,
191
+ rendered SSIM (via LibreOffice + scikit-image), and editability ratio.
192
+
193
+ Seed corpus in this repo: 3 financial fund PDFs (born-digital). Drop more
194
+ under `bench/corpus/<name>/input.pdf` and, optionally, `expected_text.txt`,
195
+ `expected_tables.json`, `expected_order.json` for scoring.
196
+
197
+ Current baseline on the seed corpus (76 pages, CPU):
198
+
199
+ ```
200
+ awhkef 9 pages 0 failed 7.1 s 74 KB
201
+ first_sentier 58 pages 0 failed 15.8 s 155 KB
202
+ kfs_bosera 9 pages 0 failed 4.3 s 87 KB
203
+ TOTAL 76 pages 0 failed 27.7 s 2.75 pg/s
204
+ ```
205
+
206
+ ## Licensing
207
+
208
+ `pdf2docx-plus` is MIT, but **depends on PyMuPDF (AGPL-3.0)** — this
209
+ propagates to you if you redistribute it or expose it as a network service. See
210
+ [LICENSING.md](LICENSING.md) for the full dependency matrix, AGPL
211
+ implications, and the future pypdfium2 migration path.
212
+
213
+ ## What's NOT done yet (roadmap)
214
+
215
+ This fork covers **Phase 0** (foundation) and most of **Phase 1** (stability
216
+ + typed API) from the original 21-week
217
+ [`PDF2DOCX_FORK_PLAN.md`](../PDF2DOCX_FORK_PLAN.md). Phases 2–5 are scaffolded
218
+ via the plugin architecture but the ML-backed hooks need real integration
219
+ work to reach the v1.0 success criteria in the plan (TEDS ≥ 0.90, text F1 ≥
220
+ 0.98, reading-order Kendall-tau ≥ 0.90).
221
+
222
+ Specifically, still open:
223
+
224
+ - Train / evaluate Table Transformer + Granite-Docling against an annotated
225
+ corpus (plan §K).
226
+ - Cross-page table stitching heuristic (§B.7).
227
+ - Header/footer → `w:hdr` / `w:ftr` emission (§C.13).
228
+ - Math recognition pipeline wiring (§F.24).
229
+ - Scanned-PDF OCR routing + auto-detect (§G.25).
230
+ - `styles.xml` rewrite (§H.27) — currently we still use python-docx defaults.
231
+ - pypdfium2 backend for permissive licensing (§6).
232
+
233
+ ## Credits
234
+
235
+ Forked from [ArtifexSoftware/pdf2docx](https://github.com/ArtifexSoftware/pdf2docx)
236
+ (originally by [@dothinking](https://github.com/dothinking)). MIT.
@@ -0,0 +1,170 @@
1
+ # pdf2docx-plus
2
+
3
+ Hardened fork of [pdf2docx](https://github.com/ArtifexSoftware/pdf2docx) — a
4
+ Python PDF → DOCX converter that actually writes editable Word documents
5
+ (not Markdown, not HTML).
6
+
7
+ **What's different from upstream**
8
+
9
+ | | upstream `pdf2docx` | `pdf2docx-plus` |
10
+ |---|---|---|
11
+ | Python support | 3.10+ | **3.11 / 3.12 / 3.13** |
12
+ | Hyperlink OOXML | nested inside `<w:r>` (invalid) | paragraph-level `<w:hyperlink>` (valid) |
13
+ | NULL-byte / control chars | sometimes leak into `<w:t>`, corrupting the DOCX | stripped at run insertion |
14
+ | Errors | single `ConversionException` | `InputError` / `ParseError` / `MakeDocxError` / `PasswordRequired` / `TimeoutExceeded` |
15
+ | Typed API | no | `py.typed`, dataclasses, `Protocol`-based plugins |
16
+ | Return value | `None` | `ConversionResult` with per-page accounting |
17
+ | Timeout | none (can hang forever) | `timeout_s=` watchdog |
18
+ | Plugin architecture | no | swap table / layout / OCR / formula backends |
19
+ | REST server | no | `pdf2docx-plus serve` (FastAPI, optional) |
20
+ | ML hooks (opt-in) | no | Table Transformer, Granite-Docling, PaddleOCR, pix2tex |
21
+ | Tables → CSV | no | `--tables-csv DIR` |
22
+ | Structured logging | hijacks root logger | scoped `pdf2docx_plus` logger |
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ pip install pdf2docx-plus # core
28
+ pip install 'pdf2docx-plus[rest]' # + FastAPI server
29
+ pip install 'pdf2docx-plus[bench]' # + evaluation harness
30
+ pip install 'pdf2docx-plus[ml-tables]' # + Table Transformer (torch)
31
+ pip install 'pdf2docx-plus[ml-ocr]' # + PaddleOCR
32
+ ```
33
+
34
+ ## Quick start
35
+
36
+ ```python
37
+ from pdf2docx_plus import convert
38
+
39
+ result = convert("in.pdf", "out.docx", timeout_s=120)
40
+ print(result.pages_ok, "/", result.pages_total, "pages in", result.elapsed_s, "s")
41
+ ```
42
+
43
+ Or with more control:
44
+
45
+ ```python
46
+ from pdf2docx_plus import Converter, PluginRegistry
47
+ from pdf2docx_plus.hooks import TableTransformerDetector
48
+
49
+ plugins = PluginRegistry()
50
+ plugins.add_table_detector(TableTransformerDetector(device="cuda"))
51
+
52
+ with Converter("in.pdf", password="s3cret") as cv:
53
+ result = cv.convert(
54
+ "out.docx",
55
+ pages=[0, 1, 2],
56
+ profile="fidelity", # "fast" | "fidelity" | "semantic"
57
+ timeout_s=60,
58
+ continue_on_error=True,
59
+ )
60
+ for p in result.page_results:
61
+ if not p.ok:
62
+ print(f"page {p.page_index}: {p.error}")
63
+ ```
64
+
65
+ ## CLI
66
+
67
+ ```
68
+ pdf2docx-plus convert in.pdf out.docx --timeout 120 --profile fidelity
69
+ pdf2docx-plus convert in.pdf --pages 0,2,5 --tables-csv tables/
70
+ pdf2docx-plus extract-tables in.pdf --out tables.json
71
+ pdf2docx-plus serve --host 0.0.0.0 --port 8000
72
+ pdf2docx-plus version
73
+ ```
74
+
75
+ ## REST server
76
+
77
+ ```bash
78
+ pip install 'pdf2docx-plus[rest]'
79
+ pdf2docx-plus serve --port 8000
80
+ # in another shell:
81
+ curl -F file=@in.pdf -F profile=fidelity http://localhost:8000/convert -o out.docx
82
+ ```
83
+
84
+ Endpoints:
85
+
86
+ | Method | Path | Body | Returns |
87
+ |---|---|---|---|
88
+ | POST | `/convert` | multipart `file`, optional `password`, `profile`, `timeout_s` | DOCX bytes + `X-Pages-Ok` / `X-Pages-Failed` / `X-Elapsed-Seconds` headers |
89
+ | POST | `/extract-tables` | multipart `file`, optional `password` | JSON `{"tables": [...]}` |
90
+ | GET | `/healthz` | — | `{"status": "ok"}` |
91
+ | GET | `/version` | — | `{"version": "..."}` |
92
+
93
+ ## Plugin architecture
94
+
95
+ Four extension points, all `Protocol`-based:
96
+
97
+ ```python
98
+ from pdf2docx_plus.plugins import (
99
+ TableDetector, LayoutDetector, OcrEngine, FormulaRecognizer
100
+ )
101
+ ```
102
+
103
+ Register any implementation on `PluginRegistry` and pass it to `Converter`.
104
+ Plugins never kill a conversion — exceptions raised inside a plugin are
105
+ logged and skipped.
106
+
107
+ Built-in ML hooks (opt-in extras):
108
+
109
+ | Hook | Backend | Extra | Weights license |
110
+ |---|---|---|---|
111
+ | `TableTransformerDetector` | HuggingFace `microsoft/table-transformer-*` | `ml-tables` | MIT |
112
+ | `GraniteDoclingLayoutDetector` | `ibm-granite/granite-docling-258M` | `ml-layout` | Apache-2.0 |
113
+ | `PaddleOcrEngine` | PaddleOCR | `ml-ocr` | Apache-2.0 |
114
+ | `Pix2TexFormulaRecognizer` | pix2tex | `ml-formula` | MIT |
115
+ | `UniMERNetFormulaRecognizer` | UniMERNet (bring weights) | manual | Apache-2.0 |
116
+
117
+ ## Benchmark
118
+
119
+ ```bash
120
+ pip install 'pdf2docx-plus[bench]'
121
+ python -m bench.run --corpus bench/corpus --out bench/reports/latest.json
122
+ ```
123
+
124
+ Metrics implemented: text F1, TEDS (`apted`), reading-order Kendall-tau,
125
+ rendered SSIM (via LibreOffice + scikit-image), and editability ratio.
126
+
127
+ Seed corpus in this repo: 3 financial fund PDFs (born-digital). Drop more
128
+ under `bench/corpus/<name>/input.pdf` and, optionally, `expected_text.txt`,
129
+ `expected_tables.json`, `expected_order.json` for scoring.
130
+
131
+ Current baseline on the seed corpus (76 pages, CPU):
132
+
133
+ ```
134
+ awhkef 9 pages 0 failed 7.1 s 74 KB
135
+ first_sentier 58 pages 0 failed 15.8 s 155 KB
136
+ kfs_bosera 9 pages 0 failed 4.3 s 87 KB
137
+ TOTAL 76 pages 0 failed 27.7 s 2.75 pg/s
138
+ ```
139
+
140
+ ## Licensing
141
+
142
+ `pdf2docx-plus` is MIT, but **depends on PyMuPDF (AGPL-3.0)** — this
143
+ propagates to you if you redistribute it or expose it as a network service. See
144
+ [LICENSING.md](LICENSING.md) for the full dependency matrix, AGPL
145
+ implications, and the future pypdfium2 migration path.
146
+
147
+ ## What's NOT done yet (roadmap)
148
+
149
+ This fork covers **Phase 0** (foundation) and most of **Phase 1** (stability
150
+ + typed API) from the original 21-week
151
+ [`PDF2DOCX_FORK_PLAN.md`](../PDF2DOCX_FORK_PLAN.md). Phases 2–5 are scaffolded
152
+ via the plugin architecture but the ML-backed hooks need real integration
153
+ work to reach the v1.0 success criteria in the plan (TEDS ≥ 0.90, text F1 ≥
154
+ 0.98, reading-order Kendall-tau ≥ 0.90).
155
+
156
+ Specifically, still open:
157
+
158
+ - Train / evaluate Table Transformer + Granite-Docling against an annotated
159
+ corpus (plan §K).
160
+ - Cross-page table stitching heuristic (§B.7).
161
+ - Header/footer → `w:hdr` / `w:ftr` emission (§C.13).
162
+ - Math recognition pipeline wiring (§F.24).
163
+ - Scanned-PDF OCR routing + auto-detect (§G.25).
164
+ - `styles.xml` rewrite (§H.27) — currently we still use python-docx defaults.
165
+ - pypdfium2 backend for permissive licensing (§6).
166
+
167
+ ## Credits
168
+
169
+ Forked from [ArtifexSoftware/pdf2docx](https://github.com/ArtifexSoftware/pdf2docx)
170
+ (originally by [@dothinking](https://github.com/dothinking)). MIT.
@@ -0,0 +1,34 @@
1
+ # pdf2docx documentation
2
+
3
+ Welcome to the **pdf2docx** documentation. This documentation relies on [Sphinx](https://www.sphinx-doc.org/en/master/) to publish HTML docs from source files written in [reStructuredText](https://en.wikipedia.org/wiki/ReStructuredText) (RST).
4
+
5
+
6
+ ## Sphinx version
7
+
8
+ This README assumes you have [Sphinx v5.0.2 installed](https://www.sphinx-doc.org/en/master/usage/installation.html) on your system.
9
+
10
+
11
+ ## Updating the documentation
12
+
13
+ Within `docs`, update the associated reStructuredText (`.rst`) files. These files represent the corresponding document pages.
14
+
15
+
16
+ ## Building HTML documentation
17
+
18
+ - Ensure you have the `furo` theme installed:
19
+
20
+ `pip install furo`
21
+
22
+ Furo theme, Copyright (c) 2020 Pradyun Gedam <mail@pradyunsg.me>, thank you to:
23
+
24
+ https://github.com/pradyunsg/furo/blob/main/LICENSE
25
+
26
+ - From the "docs" location run:
27
+
28
+ `sphinx-build -b html . build/html`
29
+
30
+ This then creates the HTML documentation within `build/html`.
31
+
32
+ > Use: `sphinx-build -a -b html . build/html` to build all, including the assets in `_static` (important if you have updated CSS).
33
+
34
+ For full details see: [Using Sphinx](https://www.sphinx-doc.org/en/master/usage/index.html)
@@ -0,0 +1,41 @@
1
+ """pdf2docx-plus: hardened PDF -> DOCX converter.
2
+
3
+ Public API:
4
+
5
+ from pdf2docx_plus import Converter, convert, ConversionResult
6
+
7
+ result = convert("in.pdf", "out.docx", timeout_s=60)
8
+ print(result.pages_ok, result.pages_failed, result.elapsed_s)
9
+
10
+ Lower-level facade:
11
+
12
+ with Converter("in.pdf") as cv:
13
+ cv.convert("out.docx", pages=[0, 1, 2])
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from .api import ConversionResult, Converter, convert, extract_tables
19
+ from .errors import (
20
+ ConversionError,
21
+ InputError,
22
+ MakeDocxError,
23
+ ParseError,
24
+ PasswordRequired,
25
+ TimeoutExceeded,
26
+ )
27
+ from .version import __version__
28
+
29
+ __all__ = [
30
+ "ConversionError",
31
+ "ConversionResult",
32
+ "Converter",
33
+ "InputError",
34
+ "MakeDocxError",
35
+ "ParseError",
36
+ "PasswordRequired",
37
+ "TimeoutExceeded",
38
+ "__version__",
39
+ "convert",
40
+ "extract_tables",
41
+ ]
@@ -0,0 +1,6 @@
1
+ """Vendored third-party packages.
2
+
3
+ These packages are shipped inside pdf2docx_plus to isolate them from
4
+ whatever else the user has installed. Application code should not import
5
+ from this package directly; use the public pdf2docx_plus API instead.
6
+ """
@@ -0,0 +1,3 @@
1
+ from .converter import Converter
2
+ from .page.Page import Page
3
+ from .main import parse