pdf2docx-plus 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf2docx_plus-0.6.1/.gitignore +32 -0
- pdf2docx_plus-0.6.1/LICENSE +7 -0
- pdf2docx_plus-0.6.1/LICENSING.md +70 -0
- pdf2docx_plus-0.6.1/PKG-INFO +236 -0
- pdf2docx_plus-0.6.1/README.md +170 -0
- pdf2docx_plus-0.6.1/docs/README.md +34 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/__init__.py +41 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/__init__.py +6 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/__init__.py +3 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/Block.py +144 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/Collection.py +359 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/Element.py +312 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/__init__.py +0 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/algorithm.py +403 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/constants.py +90 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/docx.py +591 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/common/share.py +310 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/converter.py +481 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/font/Fonts.py +240 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/font/__init__.py +0 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/gui/App.py +37 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/gui/MainFrame.py +147 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/gui/__init__.py +0 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/image/Image.py +94 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/image/ImageBlock.py +81 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/image/ImageSpan.py +27 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/image/ImagesExtractor.py +496 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/image/__init__.py +0 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/Blocks.py +650 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/Column.py +49 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/Layout.py +177 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/Section.py +97 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/Sections.py +91 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/layout/__init__.py +0 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/main.py +135 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/BasePage.py +27 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/Page.py +211 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/Pages.py +90 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/RawPage.py +279 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/RawPageFactory.py +23 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/RawPageFitz.py +164 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/page/__init__.py +0 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/shape/Path.py +405 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/shape/Paths.py +142 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/shape/Shape.py +365 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/shape/Shapes.py +241 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/shape/__init__.py +0 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/Border.py +419 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/Cell.py +165 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/Cells.py +27 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/Row.py +78 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/Rows.py +25 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/TableBlock.py +174 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/TableStructure.py +634 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/TablesConstructor.py +382 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/table/__init__.py +0 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/Char.py +65 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/Line.py +179 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/Lines.py +281 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/Spans.py +59 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/TextBlock.py +471 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/TextSpan.py +439 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/_vendored/pdf2docx/text/__init__.py +0 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/api.py +870 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/backends/__init__.py +124 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/cli.py +145 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/consolidate.py +73 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/emit/__init__.py +60 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/emit/headers_footers.py +111 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/emit/lists.py +229 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/emit/page_breaks.py +57 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/emit/page_footer.py +259 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/emit/sections.py +252 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/emit/table_fit.py +254 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/emit/tables_cleanup.py +302 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/emit/whitespace.py +55 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/emit/word_spacing.py +119 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/errors.py +53 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/__init__.py +25 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/crashguards.py +217 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/hyperlink.py +56 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/styles.py +31 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/text.py +38 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/fidelity/tty.py +22 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/hooks/__init__.py +29 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/hooks/formula_ocr.py +82 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/hooks/layout_detection.py +43 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/hooks/ocr.py +38 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/hooks/table_transformer.py +107 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/images/__init__.py +40 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/images/recovery.py +285 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/layout/__init__.py +20 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/layout/hf_detect.py +158 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/layout/lists.py +103 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/layout/scanned.py +76 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/logging.py +43 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/plugins/__init__.py +36 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/plugins/base.py +62 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/plugins/registry.py +45 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/py.typed +0 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/server.py +90 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/styles/__init__.py +144 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/tables/__init__.py +19 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/tables/float_images.py +97 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/tables/stitch.py +219 -0
- pdf2docx_plus-0.6.1/pdf2docx_plus/version.py +1 -0
- pdf2docx_plus-0.6.1/pyproject.toml +106 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*.egg-info/
|
|
4
|
+
.eggs/
|
|
5
|
+
dist/
|
|
6
|
+
build/
|
|
7
|
+
.venv/
|
|
8
|
+
venv/
|
|
9
|
+
env/
|
|
10
|
+
.mypy_cache/
|
|
11
|
+
.ruff_cache/
|
|
12
|
+
.pytest_cache/
|
|
13
|
+
.coverage
|
|
14
|
+
htmlcov/
|
|
15
|
+
|
|
16
|
+
# bench outputs
|
|
17
|
+
bench/reports/outputs/
|
|
18
|
+
bench/reports/*.json
|
|
19
|
+
!bench/reports/.gitkeep
|
|
20
|
+
|
|
21
|
+
# legacy upstream patterns
|
|
22
|
+
*.jp*g
|
|
23
|
+
layout.json
|
|
24
|
+
.vscode/
|
|
25
|
+
test/issues/
|
|
26
|
+
test/features/
|
|
27
|
+
test/outputs/
|
|
28
|
+
diff.png
|
|
29
|
+
pdf2docx*.rst
|
|
30
|
+
|
|
31
|
+
.env
|
|
32
|
+
.DS_Store
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright (c) 2026 Artifex Software, Inc.
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Licensing
|
|
2
|
+
|
|
3
|
+
`pdf2docx-plus` is MIT-licensed (see `LICENSE`). **However, it depends on PyMuPDF,
|
|
4
|
+
which is AGPL-3.0.** This document describes the practical consequences.
|
|
5
|
+
|
|
6
|
+
## Dependency license matrix
|
|
7
|
+
|
|
8
|
+
| Package | License | Shipped with | Note |
|
|
9
|
+
|---|---|---|---|
|
|
10
|
+
| pdf2docx-plus (this project) | MIT | core | |
|
|
11
|
+
| pdf2docx (vendored patched upstream) | MIT | core | Artifex / dothinking |
|
|
12
|
+
| PyMuPDF (fitz) | **AGPL-3.0** | core | **See AGPL section below** |
|
|
13
|
+
| python-docx | MIT | core | |
|
|
14
|
+
| fonttools | MIT | core | |
|
|
15
|
+
| numpy | BSD-3-Clause | core | |
|
|
16
|
+
| opencv-python-headless | Apache-2.0 | core | |
|
|
17
|
+
| fire | Apache-2.0 | core | |
|
|
18
|
+
| fastapi / uvicorn | MIT / BSD-3 | `rest` extra | |
|
|
19
|
+
| apted | MIT | `bench` extra | |
|
|
20
|
+
| scikit-image | BSD-3-Clause | `bench` extra | |
|
|
21
|
+
| Table Transformer weights | MIT | `ml-tables` extra | |
|
|
22
|
+
| pix2tex / LaTeX-OCR | MIT | `ml-formula` extra | |
|
|
23
|
+
| PaddleOCR | Apache-2.0 | `ml-ocr` extra | |
|
|
24
|
+
| UniMERNet | Apache-2.0 | (optional, manual) | |
|
|
25
|
+
|
|
26
|
+
## AGPL implications (PyMuPDF)
|
|
27
|
+
|
|
28
|
+
PyMuPDF is distributed under **AGPL-3.0**. When `pdf2docx-plus` is redistributed
|
|
29
|
+
or offered as a network service, the AGPL copyleft reaches through to the
|
|
30
|
+
consumer of that service:
|
|
31
|
+
|
|
32
|
+
- If you **ship pdf2docx-plus inside a closed-source product**, you need a
|
|
33
|
+
commercial PyMuPDF license from Artifex.
|
|
34
|
+
- If you **offer pdf2docx-plus as a SaaS/network service** to third parties,
|
|
35
|
+
the AGPL requires you to make the corresponding source (including your app)
|
|
36
|
+
available to those users.
|
|
37
|
+
- **Internal use** inside a single organisation is typically fine under AGPL.
|
|
38
|
+
|
|
39
|
+
## Migrating away from PyMuPDF (future work)
|
|
40
|
+
|
|
41
|
+
The parse layer is isolated behind the `pdf2docx_plus.backends` abstraction so
|
|
42
|
+
the fitz dependency can be swapped for an Apache-2.0 / MIT alternative:
|
|
43
|
+
|
|
44
|
+
- **`pypdfium2`** (Apache-2.0): Google PDFium bindings. Exposes text with
|
|
45
|
+
positioning and page rendering but does *not* provide the rich
|
|
46
|
+
block/line/span extraction or path extraction that the current pipeline
|
|
47
|
+
relies on. A swap requires an estimated 3-4 weeks of re-implementing extraction logic
|
|
48
|
+
using `pypdfium2` + `pdfplumber` (MIT) for ruling-line tables.
|
|
49
|
+
- **`pdfminer.six`** (MIT): slower but full text/layout extraction. Could be
|
|
50
|
+
a drop-in for many text paths.
|
|
51
|
+
|
|
52
|
+
The `pdf2docx_plus.backends.Backend` Protocol is the seam. When a permissive
|
|
53
|
+
backend is implemented, the same high-level API keeps working and AGPL falls
|
|
54
|
+
away from the default distribution.
|
|
55
|
+
|
|
56
|
+
## OCR / ML model weights
|
|
57
|
+
|
|
58
|
+
Some ML models that users may wire in through the plugin API carry **non-commercial
|
|
59
|
+
or research-only** weights:
|
|
60
|
+
|
|
61
|
+
- **LayoutLMv3 weights**: CC-BY-NC-SA-4.0 — **not safe for commercial use**.
|
|
62
|
+
`pdf2docx-plus` does NOT ship or auto-download these.
|
|
63
|
+
- **Nougat (Meta) weights**: CC-BY-NC-4.0 — **not safe for commercial use**.
|
|
64
|
+
- **Surya / Marker weights**: OpenRAIL-M with a revenue cap. Safe up to the
|
|
65
|
+
cap; verify before relying on them in production.
|
|
66
|
+
|
|
67
|
+
The default `ml-*` extras pin only permissively-licensed models
|
|
68
|
+
(Table Transformer, pix2tex, PaddleOCR, UniMERNet). Users who wire in their
|
|
69
|
+
own detectors via the plugin API are responsible for their own weight
|
|
70
|
+
licensing.
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdf2docx-plus
|
|
3
|
+
Version: 0.6.1
|
|
4
|
+
Summary: Hardened PDF->DOCX converter. Fork of pdf2docx with stability fixes, typed API, plugin architecture, and optional ML layout/OCR/table backends.
|
|
5
|
+
Project-URL: Homepage, https://github.com/mithunvoe/pdf2docx-plus
|
|
6
|
+
Project-URL: Issues, https://github.com/mithunvoe/pdf2docx-plus/issues
|
|
7
|
+
Project-URL: Upstream, https://github.com/ArtifexSoftware/pdf2docx
|
|
8
|
+
Author: pdf2docx-plus maintainers
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: convert,docx,ocr,pdf,table,word
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Office/Business
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Requires-Dist: fire>=0.5.0
|
|
21
|
+
Requires-Dist: fonttools>=4.24.0
|
|
22
|
+
Requires-Dist: numpy>=1.24.0
|
|
23
|
+
Requires-Dist: opencv-python-headless>=4.8
|
|
24
|
+
Requires-Dist: pymupdf>=1.24.0
|
|
25
|
+
Requires-Dist: python-docx>=1.1.0
|
|
26
|
+
Requires-Dist: typing-extensions>=4.10
|
|
27
|
+
Provides-Extra: all
|
|
28
|
+
Requires-Dist: apted>=1.0.3; extra == 'all'
|
|
29
|
+
Requires-Dist: fastapi>=0.110; extra == 'all'
|
|
30
|
+
Requires-Dist: pillow>=10.0; extra == 'all'
|
|
31
|
+
Requires-Dist: python-multipart>=0.0.9; extra == 'all'
|
|
32
|
+
Requires-Dist: scikit-image>=0.22; extra == 'all'
|
|
33
|
+
Requires-Dist: scipy>=1.11; extra == 'all'
|
|
34
|
+
Requires-Dist: uvicorn[standard]>=0.27; extra == 'all'
|
|
35
|
+
Provides-Extra: bench
|
|
36
|
+
Requires-Dist: apted>=1.0.3; extra == 'bench'
|
|
37
|
+
Requires-Dist: pillow>=10.0; extra == 'bench'
|
|
38
|
+
Requires-Dist: scikit-image>=0.22; extra == 'bench'
|
|
39
|
+
Requires-Dist: scipy>=1.11; extra == 'bench'
|
|
40
|
+
Provides-Extra: dev
|
|
41
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
42
|
+
Requires-Dist: pre-commit>=3.6; extra == 'dev'
|
|
43
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
44
|
+
Requires-Dist: pytest-timeout>=2.2; extra == 'dev'
|
|
45
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
46
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
47
|
+
Requires-Dist: types-setuptools; extra == 'dev'
|
|
48
|
+
Provides-Extra: ml-formula
|
|
49
|
+
Requires-Dist: pix2tex>=0.1.4; extra == 'ml-formula'
|
|
50
|
+
Requires-Dist: torch>=2.2; extra == 'ml-formula'
|
|
51
|
+
Provides-Extra: ml-layout
|
|
52
|
+
Requires-Dist: torch>=2.2; extra == 'ml-layout'
|
|
53
|
+
Requires-Dist: transformers>=4.40; extra == 'ml-layout'
|
|
54
|
+
Provides-Extra: ml-ocr
|
|
55
|
+
Requires-Dist: paddleocr>=2.7; extra == 'ml-ocr'
|
|
56
|
+
Requires-Dist: paddlepaddle>=2.6; extra == 'ml-ocr'
|
|
57
|
+
Provides-Extra: ml-tables
|
|
58
|
+
Requires-Dist: timm>=0.9; extra == 'ml-tables'
|
|
59
|
+
Requires-Dist: torch>=2.2; extra == 'ml-tables'
|
|
60
|
+
Requires-Dist: transformers>=4.40; extra == 'ml-tables'
|
|
61
|
+
Provides-Extra: rest
|
|
62
|
+
Requires-Dist: fastapi>=0.110; extra == 'rest'
|
|
63
|
+
Requires-Dist: python-multipart>=0.0.9; extra == 'rest'
|
|
64
|
+
Requires-Dist: uvicorn[standard]>=0.27; extra == 'rest'
|
|
65
|
+
Description-Content-Type: text/markdown
|
|
66
|
+
|
|
67
|
+
# pdf2docx-plus
|
|
68
|
+
|
|
69
|
+
Hardened fork of [pdf2docx](https://github.com/ArtifexSoftware/pdf2docx) — a
|
|
70
|
+
Python PDF → DOCX converter that actually writes editable Word documents
|
|
71
|
+
(not Markdown, not HTML).
|
|
72
|
+
|
|
73
|
+
**What's different from upstream**
|
|
74
|
+
|
|
75
|
+
| | upstream `pdf2docx` | `pdf2docx-plus` |
|
|
76
|
+
|---|---|---|
|
|
77
|
+
| Python support | 3.10+ | **3.11 / 3.12 / 3.13** |
|
|
78
|
+
| Hyperlink OOXML | nested inside `<w:r>` (invalid) | paragraph-level `<w:hyperlink>` (valid) |
|
|
79
|
+
| NULL bytes / control chars | sometimes leak into `<w:t>` and corrupt the DOCX | stripped at run insertion |
|
|
80
|
+
| Errors | single `ConversionException` | `InputError` / `ParseError` / `MakeDocxError` / `PasswordRequired` / `TimeoutExceeded` |
|
|
81
|
+
| Typed API | no | `py.typed`, dataclasses, `Protocol`-based plugins |
|
|
82
|
+
| Return value | `None` | `ConversionResult` with per-page accounting |
|
|
83
|
+
| Timeout | none (can hang forever) | `timeout_s=` watchdog |
|
|
84
|
+
| Plugin architecture | no | swap table / layout / OCR / formula backends |
|
|
85
|
+
| REST server | no | `pdf2docx-plus serve` (FastAPI, optional) |
|
|
86
|
+
| ML hooks (opt-in) | no | Table Transformer, Granite-Docling, PaddleOCR, pix2tex |
|
|
87
|
+
| Tables → CSV | no | `--tables-csv DIR` |
|
|
88
|
+
| Structured logging | hijacks root logger | scoped `pdf2docx_plus` logger |
|
|
89
|
+
|
|
90
|
+
## Install
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pip install pdf2docx-plus # core
|
|
94
|
+
pip install 'pdf2docx-plus[rest]' # + FastAPI server
|
|
95
|
+
pip install 'pdf2docx-plus[bench]' # + evaluation harness
|
|
96
|
+
pip install 'pdf2docx-plus[ml-tables]' # + Table Transformer (torch)
|
|
97
|
+
pip install 'pdf2docx-plus[ml-ocr]' # + PaddleOCR
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Quick start
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
from pdf2docx_plus import convert
|
|
104
|
+
|
|
105
|
+
result = convert("in.pdf", "out.docx", timeout_s=120)
|
|
106
|
+
print(result.pages_ok, "/", result.pages_total, "pages in", result.elapsed_s, "s")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Or with more control:
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from pdf2docx_plus import Converter, PluginRegistry
|
|
113
|
+
from pdf2docx_plus.hooks import TableTransformerDetector
|
|
114
|
+
|
|
115
|
+
plugins = PluginRegistry()
|
|
116
|
+
plugins.add_table_detector(TableTransformerDetector(device="cuda"))
|
|
117
|
+
|
|
118
|
+
with Converter("in.pdf", password="s3cret") as cv:
|
|
119
|
+
result = cv.convert(
|
|
120
|
+
"out.docx",
|
|
121
|
+
pages=[0, 1, 2],
|
|
122
|
+
profile="fidelity", # "fast" | "fidelity" | "semantic"
|
|
123
|
+
timeout_s=60,
|
|
124
|
+
continue_on_error=True,
|
|
125
|
+
)
|
|
126
|
+
for p in result.page_results:
|
|
127
|
+
if not p.ok:
|
|
128
|
+
print(f"page {p.page_index}: {p.error}")
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## CLI
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
pdf2docx-plus convert in.pdf out.docx --timeout 120 --profile fidelity
|
|
135
|
+
pdf2docx-plus convert in.pdf --pages 0,2,5 --tables-csv tables/
|
|
136
|
+
pdf2docx-plus extract-tables in.pdf --out tables.json
|
|
137
|
+
pdf2docx-plus serve --host 0.0.0.0 --port 8000
|
|
138
|
+
pdf2docx-plus version
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## REST server
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
pip install 'pdf2docx-plus[rest]'
|
|
145
|
+
pdf2docx-plus serve --port 8000
|
|
146
|
+
# in another shell:
|
|
147
|
+
curl -F file=@in.pdf -F profile=fidelity http://localhost:8000/convert -o out.docx
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Endpoints:
|
|
151
|
+
|
|
152
|
+
| Method | Path | Body | Returns |
|
|
153
|
+
|---|---|---|---|
|
|
154
|
+
| POST | `/convert` | multipart `file`, optional `password`, `profile`, `timeout_s` | DOCX bytes + `X-Pages-Ok` / `X-Pages-Failed` / `X-Elapsed-Seconds` headers |
|
|
155
|
+
| POST | `/extract-tables` | multipart `file`, optional `password` | JSON `{"tables": [...]}` |
|
|
156
|
+
| GET | `/healthz` | — | `{"status": "ok"}` |
|
|
157
|
+
| GET | `/version` | — | `{"version": "..."}` |
|
|
158
|
+
|
|
159
|
+
## Plugin architecture
|
|
160
|
+
|
|
161
|
+
Four extension points, all `Protocol`-based:
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from pdf2docx_plus.plugins import (
|
|
165
|
+
TableDetector, LayoutDetector, OcrEngine, FormulaRecognizer
|
|
166
|
+
)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Register any implementation on `PluginRegistry` and pass it to `Converter`.
|
|
170
|
+
Plugins never kill a conversion — exceptions raised inside a plugin are
|
|
171
|
+
logged and skipped.
|
|
172
|
+
|
|
173
|
+
Built-in ML hooks (opt-in extras):
|
|
174
|
+
|
|
175
|
+
| Hook | Backend | Extra | Weights license |
|
|
176
|
+
|---|---|---|---|
|
|
177
|
+
| `TableTransformerDetector` | HuggingFace `microsoft/table-transformer-*` | `ml-tables` | MIT |
|
|
178
|
+
| `GraniteDoclingLayoutDetector` | `ibm-granite/granite-docling-258M` | `ml-layout` | Apache-2.0 |
|
|
179
|
+
| `PaddleOcrEngine` | PaddleOCR | `ml-ocr` | Apache-2.0 |
|
|
180
|
+
| `Pix2TexFormulaRecognizer` | pix2tex | `ml-formula` | MIT |
|
|
181
|
+
| `UniMERNetFormulaRecognizer` | UniMERNet (bring weights) | manual | Apache-2.0 |
|
|
182
|
+
|
|
183
|
+
## Benchmark
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
pip install 'pdf2docx-plus[bench]'
|
|
187
|
+
python -m bench.run --corpus bench/corpus --out bench/reports/latest.json
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
Metrics implemented: text F1, TEDS (`apted`), reading-order Kendall-tau,
|
|
191
|
+
rendered SSIM (via LibreOffice + scikit-image), and editability ratio.
|
|
192
|
+
|
|
193
|
+
Seed corpus in this repo: 3 financial fund PDFs (born-digital). Drop more
|
|
194
|
+
under `bench/corpus/<name>/input.pdf` and, optionally, `expected_text.txt`,
|
|
195
|
+
`expected_tables.json`, `expected_order.json` for scoring.
|
|
196
|
+
|
|
197
|
+
Current baseline on the seed corpus (76 pages, CPU):
|
|
198
|
+
|
|
199
|
+
```
|
|
200
|
+
awhkef 9 pages 0 failed 7.1 s 74 KB
|
|
201
|
+
first_sentier 58 pages 0 failed 15.8 s 155 KB
|
|
202
|
+
kfs_bosera 9 pages 0 failed 4.3 s 87 KB
|
|
203
|
+
TOTAL 76 pages 0 failed 27.7 s 2.75 pg/s
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## Licensing
|
|
207
|
+
|
|
208
|
+
`pdf2docx-plus` is MIT, but **depends on PyMuPDF (AGPL-3.0)** — this
|
|
209
|
+
propagates to you if you redistribute it or expose it as a network service. See
|
|
210
|
+
[LICENSING.md](LICENSING.md) for the full dependency matrix, AGPL
|
|
211
|
+
implications, and the future pypdfium2 migration path.
|
|
212
|
+
|
|
213
|
+
## What's NOT done yet (roadmap)
|
|
214
|
+
|
|
215
|
+
This fork covers **Phase 0** (foundation) and most of **Phase 1** (stability
|
|
216
|
+
and typed API) from the original 21-week
|
|
217
|
+
[`PDF2DOCX_FORK_PLAN.md`](../PDF2DOCX_FORK_PLAN.md). Phases 2–5 are scaffolded
|
|
218
|
+
via the plugin architecture but the ML-backed hooks need real integration
|
|
219
|
+
work to reach the v1.0 success criteria in the plan (TEDS ≥ 0.90, text F1 ≥
|
|
220
|
+
0.98, reading-order Kendall-tau ≥ 0.90).
|
|
221
|
+
|
|
222
|
+
Specifically, still open:
|
|
223
|
+
|
|
224
|
+
- Train / evaluate Table Transformer + Granite-Docling against an annotated
|
|
225
|
+
corpus (plan §K).
|
|
226
|
+
- Cross-page table stitching heuristic (§B.7).
|
|
227
|
+
- Header/footer → `w:hdr` / `w:ftr` emission (§C.13).
|
|
228
|
+
- Math recognition pipeline wiring (§F.24).
|
|
229
|
+
- Scanned-PDF OCR routing + auto-detect (§G.25).
|
|
230
|
+
- `styles.xml` rewrite (§H.27) — currently we still use python-docx defaults.
|
|
231
|
+
- pypdfium2 backend for permissive licensing (§6).
|
|
232
|
+
|
|
233
|
+
## Credits
|
|
234
|
+
|
|
235
|
+
Forked from [ArtifexSoftware/pdf2docx](https://github.com/ArtifexSoftware/pdf2docx)
|
|
236
|
+
(originally by [@dothinking](https://github.com/dothinking)). MIT.
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# pdf2docx-plus
|
|
2
|
+
|
|
3
|
+
Hardened fork of [pdf2docx](https://github.com/ArtifexSoftware/pdf2docx) — a
|
|
4
|
+
Python PDF → DOCX converter that actually writes editable Word documents
|
|
5
|
+
(not Markdown, not HTML).
|
|
6
|
+
|
|
7
|
+
**What's different from upstream**
|
|
8
|
+
|
|
9
|
+
| | upstream `pdf2docx` | `pdf2docx-plus` |
|
|
10
|
+
|---|---|---|
|
|
11
|
+
| Python support | 3.10+ | **3.11 / 3.12 / 3.13** |
|
|
12
|
+
| Hyperlink OOXML | nested inside `<w:r>` (invalid) | paragraph-level `<w:hyperlink>` (valid) |
|
|
13
|
+
| NULL bytes / control chars | sometimes leak into `<w:t>` and corrupt the DOCX | stripped at run insertion |
|
|
14
|
+
| Errors | single `ConversionException` | `InputError` / `ParseError` / `MakeDocxError` / `PasswordRequired` / `TimeoutExceeded` |
|
|
15
|
+
| Typed API | no | `py.typed`, dataclasses, `Protocol`-based plugins |
|
|
16
|
+
| Return value | `None` | `ConversionResult` with per-page accounting |
|
|
17
|
+
| Timeout | none (can hang forever) | `timeout_s=` watchdog |
|
|
18
|
+
| Plugin architecture | no | swap table / layout / OCR / formula backends |
|
|
19
|
+
| REST server | no | `pdf2docx-plus serve` (FastAPI, optional) |
|
|
20
|
+
| ML hooks (opt-in) | no | Table Transformer, Granite-Docling, PaddleOCR, pix2tex |
|
|
21
|
+
| Tables → CSV | no | `--tables-csv DIR` |
|
|
22
|
+
| Structured logging | hijacks root logger | scoped `pdf2docx_plus` logger |
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install pdf2docx-plus # core
|
|
28
|
+
pip install 'pdf2docx-plus[rest]' # + FastAPI server
|
|
29
|
+
pip install 'pdf2docx-plus[bench]' # + evaluation harness
|
|
30
|
+
pip install 'pdf2docx-plus[ml-tables]' # + Table Transformer (torch)
|
|
31
|
+
pip install 'pdf2docx-plus[ml-ocr]' # + PaddleOCR
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Quick start
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from pdf2docx_plus import convert
|
|
38
|
+
|
|
39
|
+
result = convert("in.pdf", "out.docx", timeout_s=120)
|
|
40
|
+
print(result.pages_ok, "/", result.pages_total, "pages in", result.elapsed_s, "s")
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Or with more control:
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from pdf2docx_plus import Converter, PluginRegistry
|
|
47
|
+
from pdf2docx_plus.hooks import TableTransformerDetector
|
|
48
|
+
|
|
49
|
+
plugins = PluginRegistry()
|
|
50
|
+
plugins.add_table_detector(TableTransformerDetector(device="cuda"))
|
|
51
|
+
|
|
52
|
+
with Converter("in.pdf", password="s3cret") as cv:
|
|
53
|
+
result = cv.convert(
|
|
54
|
+
"out.docx",
|
|
55
|
+
pages=[0, 1, 2],
|
|
56
|
+
profile="fidelity", # "fast" | "fidelity" | "semantic"
|
|
57
|
+
timeout_s=60,
|
|
58
|
+
continue_on_error=True,
|
|
59
|
+
)
|
|
60
|
+
for p in result.page_results:
|
|
61
|
+
if not p.ok:
|
|
62
|
+
print(f"page {p.page_index}: {p.error}")
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## CLI
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
pdf2docx-plus convert in.pdf out.docx --timeout 120 --profile fidelity
|
|
69
|
+
pdf2docx-plus convert in.pdf --pages 0,2,5 --tables-csv tables/
|
|
70
|
+
pdf2docx-plus extract-tables in.pdf --out tables.json
|
|
71
|
+
pdf2docx-plus serve --host 0.0.0.0 --port 8000
|
|
72
|
+
pdf2docx-plus version
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## REST server
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
pip install 'pdf2docx-plus[rest]'
|
|
79
|
+
pdf2docx-plus serve --port 8000
|
|
80
|
+
# in another shell:
|
|
81
|
+
curl -F file=@in.pdf -F profile=fidelity http://localhost:8000/convert -o out.docx
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Endpoints:
|
|
85
|
+
|
|
86
|
+
| Method | Path | Body | Returns |
|
|
87
|
+
|---|---|---|---|
|
|
88
|
+
| POST | `/convert` | multipart `file`, optional `password`, `profile`, `timeout_s` | DOCX bytes + `X-Pages-Ok` / `X-Pages-Failed` / `X-Elapsed-Seconds` headers |
|
|
89
|
+
| POST | `/extract-tables` | multipart `file`, optional `password` | JSON `{"tables": [...]}` |
|
|
90
|
+
| GET | `/healthz` | — | `{"status": "ok"}` |
|
|
91
|
+
| GET | `/version` | — | `{"version": "..."}` |
|
|
92
|
+
|
|
93
|
+
## Plugin architecture
|
|
94
|
+
|
|
95
|
+
Four extension points, all `Protocol`-based:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from pdf2docx_plus.plugins import (
|
|
99
|
+
TableDetector, LayoutDetector, OcrEngine, FormulaRecognizer
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Register any implementation on `PluginRegistry` and pass it to `Converter`.
|
|
104
|
+
Plugins never kill a conversion — exceptions raised inside a plugin are
|
|
105
|
+
logged and skipped.
|
|
106
|
+
|
|
107
|
+
Built-in ML hooks (opt-in extras):
|
|
108
|
+
|
|
109
|
+
| Hook | Backend | Extra | Weights license |
|
|
110
|
+
|---|---|---|---|
|
|
111
|
+
| `TableTransformerDetector` | HuggingFace `microsoft/table-transformer-*` | `ml-tables` | MIT |
|
|
112
|
+
| `GraniteDoclingLayoutDetector` | `ibm-granite/granite-docling-258M` | `ml-layout` | Apache-2.0 |
|
|
113
|
+
| `PaddleOcrEngine` | PaddleOCR | `ml-ocr` | Apache-2.0 |
|
|
114
|
+
| `Pix2TexFormulaRecognizer` | pix2tex | `ml-formula` | MIT |
|
|
115
|
+
| `UniMERNetFormulaRecognizer` | UniMERNet (bring weights) | manual | Apache-2.0 |
|
|
116
|
+
|
|
117
|
+
## Benchmark
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
pip install 'pdf2docx-plus[bench]'
|
|
121
|
+
python -m bench.run --corpus bench/corpus --out bench/reports/latest.json
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Metrics implemented: text F1, TEDS (`apted`), reading-order Kendall-tau,
|
|
125
|
+
rendered SSIM (via LibreOffice + scikit-image), and editability ratio.
|
|
126
|
+
|
|
127
|
+
Seed corpus in this repo: 3 financial fund PDFs (born-digital). Drop more
|
|
128
|
+
under `bench/corpus/<name>/input.pdf` and, optionally, `expected_text.txt`,
|
|
129
|
+
`expected_tables.json`, `expected_order.json` for scoring.
|
|
130
|
+
|
|
131
|
+
Current baseline on the seed corpus (76 pages, CPU):
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
awhkef 9 pages 0 failed 7.1 s 74 KB
|
|
135
|
+
first_sentier 58 pages 0 failed 15.8 s 155 KB
|
|
136
|
+
kfs_bosera 9 pages 0 failed 4.3 s 87 KB
|
|
137
|
+
TOTAL 76 pages 0 failed 27.7 s 2.75 pg/s
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Licensing
|
|
141
|
+
|
|
142
|
+
`pdf2docx-plus` is MIT, but **depends on PyMuPDF (AGPL-3.0)** — this
|
|
143
|
+
propagates to you if you redistribute it or expose it as a network service. See
|
|
144
|
+
[LICENSING.md](LICENSING.md) for the full dependency matrix, AGPL
|
|
145
|
+
implications, and the future pypdfium2 migration path.
|
|
146
|
+
|
|
147
|
+
## What's NOT done yet (roadmap)
|
|
148
|
+
|
|
149
|
+
This fork covers **Phase 0** (foundation) and most of **Phase 1** (stability
|
|
150
|
+
and typed API) from the original 21-week
|
|
151
|
+
[`PDF2DOCX_FORK_PLAN.md`](../PDF2DOCX_FORK_PLAN.md). Phases 2–5 are scaffolded
|
|
152
|
+
via the plugin architecture but the ML-backed hooks need real integration
|
|
153
|
+
work to reach the v1.0 success criteria in the plan (TEDS ≥ 0.90, text F1 ≥
|
|
154
|
+
0.98, reading-order Kendall-tau ≥ 0.90).
|
|
155
|
+
|
|
156
|
+
Specifically, still open:
|
|
157
|
+
|
|
158
|
+
- Train / evaluate Table Transformer + Granite-Docling against an annotated
|
|
159
|
+
corpus (plan §K).
|
|
160
|
+
- Cross-page table stitching heuristic (§B.7).
|
|
161
|
+
- Header/footer → `w:hdr` / `w:ftr` emission (§C.13).
|
|
162
|
+
- Math recognition pipeline wiring (§F.24).
|
|
163
|
+
- Scanned-PDF OCR routing + auto-detect (§G.25).
|
|
164
|
+
- `styles.xml` rewrite (§H.27) — currently we still use python-docx defaults.
|
|
165
|
+
- pypdfium2 backend for permissive licensing (§6).
|
|
166
|
+
|
|
167
|
+
## Credits
|
|
168
|
+
|
|
169
|
+
Forked from [ArtifexSoftware/pdf2docx](https://github.com/ArtifexSoftware/pdf2docx)
|
|
170
|
+
(originally by [@dothinking](https://github.com/dothinking)). MIT.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# pdf2docx documentation
|
|
2
|
+
|
|
3
|
+
Welcome to the **pdf2docx** documentation. This documentation relies on [Sphinx](https://www.sphinx-doc.org/en/master/) to publish HTML docs from source files written in [reStructuredText](https://en.wikipedia.org/wiki/ReStructuredText) (RST).
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
## Sphinx version
|
|
7
|
+
|
|
8
|
+
This README assumes you have [Sphinx v5.0.2 installed](https://www.sphinx-doc.org/en/master/usage/installation.html) on your system.
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
## Updating the documentation
|
|
12
|
+
|
|
13
|
+
Within `docs` update the associated reStructuredText (`.rst`) files. These files represent the corresponding document pages.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
## Building HTML documentation
|
|
17
|
+
|
|
18
|
+
- Ensure you have the `furo` theme installed:
|
|
19
|
+
|
|
20
|
+
`pip install furo`
|
|
21
|
+
|
|
22
|
+
Furo theme, Copyright (c) 2020 Pradyun Gedam <mail@pradyunsg.me>, thank you to:
|
|
23
|
+
|
|
24
|
+
https://github.com/pradyunsg/furo/blob/main/LICENSE
|
|
25
|
+
|
|
26
|
+
- From the "docs" location run:
|
|
27
|
+
|
|
28
|
+
`sphinx-build -b html . build/html`
|
|
29
|
+
|
|
30
|
+
This then creates the HTML documentation within `build/html`.
|
|
31
|
+
|
|
32
|
+
> Use: `sphinx-build -a -b html . build/html` to build all, including the assets in `_static` (important if you have updated CSS).
|
|
33
|
+
|
|
34
|
+
For full details see: [Using Sphinx](https://www.sphinx-doc.org/en/master/usage/index.html)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""pdf2docx-plus: hardened PDF -> DOCX converter.
|
|
2
|
+
|
|
3
|
+
Public API:
|
|
4
|
+
|
|
5
|
+
from pdf2docx_plus import Converter, convert, ConversionResult
|
|
6
|
+
|
|
7
|
+
result = convert("in.pdf", "out.docx", timeout_s=60)
|
|
8
|
+
print(result.pages_ok, result.pages_failed, result.elapsed_s)
|
|
9
|
+
|
|
10
|
+
Lower-level facade:
|
|
11
|
+
|
|
12
|
+
with Converter("in.pdf") as cv:
|
|
13
|
+
cv.convert("out.docx", pages=[0, 1, 2])
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from .api import ConversionResult, Converter, convert, extract_tables
|
|
19
|
+
from .errors import (
|
|
20
|
+
ConversionError,
|
|
21
|
+
InputError,
|
|
22
|
+
MakeDocxError,
|
|
23
|
+
ParseError,
|
|
24
|
+
PasswordRequired,
|
|
25
|
+
TimeoutExceeded,
|
|
26
|
+
)
|
|
27
|
+
from .version import __version__
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"ConversionError",
|
|
31
|
+
"ConversionResult",
|
|
32
|
+
"Converter",
|
|
33
|
+
"InputError",
|
|
34
|
+
"MakeDocxError",
|
|
35
|
+
"ParseError",
|
|
36
|
+
"PasswordRequired",
|
|
37
|
+
"TimeoutExceeded",
|
|
38
|
+
"__version__",
|
|
39
|
+
"convert",
|
|
40
|
+
"extract_tables",
|
|
41
|
+
]
|