docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
from io import BytesIO
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Optional, Union
|
|
6
|
+
|
|
7
|
+
import pypdfium2 as pdfium
|
|
8
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
9
|
+
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
|
10
|
+
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
|
11
|
+
from PIL import Image
|
|
12
|
+
from pypdfium2 import PdfPage
|
|
13
|
+
|
|
14
|
+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
15
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
16
|
+
from docling.datamodel.base_models import Size
|
|
17
|
+
from docling.utils.locks import pypdfium2_lock
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from docling.datamodel.document import InputDocument
|
|
21
|
+
|
|
22
|
+
_log = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DoclingParseV4PageBackend(PdfPageBackend):
    """Page backend backed by a docling-parse document and a pypdfium2 page.

    Text cells and bitmap resources are read from the docling-parse
    ``PdfDocument`` (parsed lazily on first access), while the page size and
    rendered images are taken from the pypdfium2 ``PdfPage``.
    """

    def __init__(
        self,
        *,
        dp_doc: PdfDocument,
        page_obj: PdfPage,
        page_no: int,
        create_words: bool = True,
        create_textlines: bool = True,
        keep_chars: bool = False,
        keep_lines: bool = False,
        keep_images: bool = True,
    ):
        """Store the page handles and parsing flags; nothing is parsed yet.

        Args:
            dp_doc: docling-parse document the page is read from.
            page_obj: pypdfium2 page used for size queries and rendering.
            page_no: Page index as used by the caller; ``page_no + 1`` is
                what gets passed to docling-parse.
            create_words: Forwarded to ``dp_doc.get_page``.
            create_textlines: Forwarded to ``dp_doc.get_page``.
            keep_chars: Forwarded to ``dp_doc.get_page``.
            keep_lines: Forwarded to ``dp_doc.get_page``.
            keep_images: Forwarded to ``dp_doc.get_page`` as ``keep_bitmaps``.
        """
        self._ppage = page_obj
        self._dp_doc = dp_doc
        self._page_no = page_no

        self._create_words = create_words
        self._create_textlines = create_textlines

        self._keep_chars = keep_chars
        self._keep_lines = keep_lines
        self._keep_images = keep_images

        # Parsed page, filled in lazily by _ensure_parsed().
        self._dpage: Optional[SegmentedPdfPage] = None
        self._unloaded = False
        self.valid = (self._ppage is not None) and (self._dp_doc is not None)

    def _ensure_parsed(self) -> None:
        """Parse the page with docling-parse once and cache the result."""
        if self._dpage is not None:
            return

        seg_page = self._dp_doc.get_page(
            self._page_no + 1,
            keep_chars=self._keep_chars,
            keep_lines=self._keep_lines,
            keep_bitmaps=self._keep_images,
            create_words=self._create_words,
            create_textlines=self._create_textlines,
            enforce_same_font=True,
        )

        # In Docling, all TextCell instances are expected with top-left origin.
        # to_top_left_origin() is called only for its effect on each cell (the
        # return value is never used), so plain loops are used instead of
        # building throwaway lists.
        page_height = seg_page.dimension.height
        for cells in (
            seg_page.textline_cells,
            seg_page.char_cells,
            seg_page.word_cells,
        ):
            for tc in cells:
                tc.to_top_left_origin(page_height)

        self._dpage = seg_page

    def is_valid(self) -> bool:
        """Return whether both page handles were present at construction."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text of all text lines mostly contained in ``bbox``.

        A text line is included when more than half of its own area overlaps
        ``bbox``; the texts are joined with single spaces in page order.
        """
        self._ensure_parsed()
        assert self._dpage is not None

        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()

        scale = 1  # FIX - Replace with param in get_text_in_rect across backends (optional)

        for cell in self._dpage.textline_cells:
            cell_bbox = (
                cell.rect.to_bounding_box()
                .to_top_left_origin(page_height=page_size.height)
                .scaled(scale)
            )

            overlap_frac = cell_bbox.intersection_over_self(bbox)

            if overlap_frac > 0.5:
                # A separator is only added once some text has been collected,
                # so leading empty cells do not produce leading spaces.
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell.text

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return the parsed page, parsing it first if necessary."""
        self._ensure_parsed()
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the text-line cells of the parsed page."""
        self._ensure_parsed()
        assert self._dpage is not None

        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the scaled, top-left-origin bounding box of each bitmap.

        Bitmaps whose area is not greater than ``AREA_THRESHOLD`` are skipped.
        """
        self._ensure_parsed()
        assert self._dpage is not None

        AREA_THRESHOLD = 0  # 32 * 32

        images = self._dpage.bitmap_resources

        # get_size() takes the pypdfium2 lock; query it once, not per image.
        page_height = self.get_size().height

        for img in images:
            cropbox = img.rect.to_bounding_box().to_top_left_origin(page_height)

            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page (or a crop of it) to a PIL image.

        Args:
            scale: Output scale relative to the page size.
            cropbox: Region to render; the full page when omitted.

        Returns:
            An image of ``cropbox.width * scale`` by ``cropbox.height * scale``
            pixels (rounded).
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Turn the crop region into per-side margins: the right and top
            # edges become distances from the right and top page borders.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        with pypdfium2_lock:
            image = (
                self._ppage.render(
                    scale=scale * 1.5,
                    rotation=0,  # no additional rotation
                    crop=padbox.as_tuple(),
                )
                .to_pil()
                .resize(
                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
                )
            )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page width and height as reported by pypdfium2."""
        with pypdfium2_lock:
            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

        # TODO: Take width and height from docling-parse.
        # return Size(
        #     width=self._dpage.dimension.width,
        #     height=self._dpage.dimension.height,
        # )

    def unload(self):
        """Release the docling-parse page (at most once) and drop all handles."""
        if not self._unloaded and self._dp_doc is not None:
            # NOTE(review): presumably a (start, end) page range covering only
            # this page - confirm against the docling-parse API.
            self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
            self._unloaded = True

        self._ppage = None
        self._dpage = None
        self._dp_doc = None
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
    """Document backend that opens a PDF with both pypdfium2 and docling-parse."""

    def __init__(
        self,
        in_doc: "InputDocument",
        path_or_stream: Union[BytesIO, Path],
        # NOTE(review): this default instance is created once, at definition
        # time, and is therefore shared by every call that omits `options`.
        options: PdfBackendOptions = PdfBackendOptions(),
    ):
        """Open the document with pypdfium2 and load it with docling-parse.

        Raises:
            RuntimeError: If docling-parse returns no document.
            Exception: Whatever pypdfium2 or docling-parse raise while opening
                the input; the pypdfium2 handle is closed before re-raising.
        """
        super().__init__(in_doc, path_or_stream, options)

        password = (
            self.options.password.get_secret_value() if self.options.password else None
        )
        with pypdfium2_lock:
            self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
        self.parser = DoclingPdfParser(loglevel="fatal")
        try:
            self.dp_doc: PdfDocument = self.parser.load(
                path_or_stream=self.path_or_stream, password=password
            )
        except Exception:
            # Do not leak the already opened pypdfium2 document.
            self._close_pdoc()
            raise
        success = self.dp_doc is not None

        if not success:
            self._close_pdoc()
            raise RuntimeError(
                f"docling-parse v4 could not load document {self.document_hash}."
            )

    def _close_pdoc(self) -> None:
        """Close the pypdfium2 document under the lock; errors are only logged."""
        if self._pdoc is None:
            return
        with pypdfium2_lock:
            try:
                self._pdoc.close()
            except Exception:
                # Cleanup is best-effort, but leave a trace for debugging.
                _log.debug("Ignoring error while closing pypdfium2 document", exc_info=True)
        self._pdoc = None

    def page_count(self) -> int:
        """Return the page count reported by docling-parse.

        An error is logged when pypdfium2 reports a different number.
        """
        # return len(self._pdoc)  # To be replaced with docling-parse API

        len_1 = len(self._pdoc)
        len_2 = self.dp_doc.number_of_pages()

        if len_1 != len_2:
            _log.error("Inconsistent number of pages: %s!=%s", len_1, len_2)

        return len_2

    def load_page(
        self, page_no: int, create_words: bool = True, create_textlines: bool = True
    ) -> DoclingParseV4PageBackend:
        """Create the page backend for ``page_no`` (index into the pypdfium2 document)."""
        with pypdfium2_lock:
            ppage = self._pdoc[page_no]

        return DoclingParseV4PageBackend(
            dp_doc=self.dp_doc,
            page_obj=ppage,
            page_no=page_no,
            create_words=create_words,
            create_textlines=create_textlines,
        )

    def is_valid(self) -> bool:
        """Return True when the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Release the docling-parse document, then close the pypdfium2 document."""
        super().unload()
        # Unload docling-parse document first
        if self.dp_doc is not None:
            self.dp_doc.unload()
            self.dp_doc = None

        # Then close pypdfium2 document with proper locking
        self._close_pdoc()
|
|
File without changes
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import subprocess
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from tempfile import mkdtemp
|
|
6
|
+
from typing import Callable, Optional
|
|
7
|
+
|
|
8
|
+
import pypdfium2
|
|
9
|
+
from docx.document import Document
|
|
10
|
+
from PIL import Image, ImageChops
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
    """Locate the LibreOffice executable.

    The lookup tries ``libreoffice`` and then ``soffice`` on the PATH, and
    finally the default macOS application bundle location.

    Args:
        raise_if_unavailable: When true, raise instead of returning ``None``
            and additionally run ``<cmd> -h`` to verify the command works.

    Returns:
        The command (path) to use, or ``None`` if nothing was found.

    Raises:
        RuntimeError: If no command was found and ``raise_if_unavailable``.
        subprocess.CalledProcessError: If the ``-h`` probe exits non-zero.
        OSError: If the ``-h`` probe cannot be started.
    """
    macos_binary = "/Applications/LibreOffice.app/Contents/MacOS/soffice"

    command = shutil.which("libreoffice")
    if not command:
        command = shutil.which("soffice")
    if not command:
        command = macos_binary if os.path.isfile(macos_binary) else None

    if not raise_if_unavailable:
        return command

    if command is None:
        raise RuntimeError("Libreoffice not found")

    # The following test will raise if the command cannot be used
    probe = [command, "-h"]
    subprocess.run(
        probe,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    return command
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_docx_to_pdf_converter() -> Optional[Callable]:
    """
    Detects the best available DOCX to PDF tool and returns a conversion function.
    The returned function accepts (input_path, output_path), each given as a
    string or an ``os.PathLike`` object.
    Returns None if no tool is available.
    """

    # Try LibreOffice
    libreoffice_cmd = get_libreoffice_cmd()

    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            """Convert ``input_path`` to PDF and place the result at ``output_path``."""
            # Normalise to plain strings so the path comparison below is
            # meaningful even when callers pass pathlib.Path objects
            # (a str never compares equal to a Path).
            src = os.fspath(input_path)
            dst = os.fspath(output_path)
            # An empty dirname means "current directory"; LibreOffice needs an
            # actual directory for --outdir.
            out_dir = os.path.dirname(dst) or os.curdir

            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    out_dir,
                    src,
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )

            # LibreOffice names the output after the input file's stem.
            expected_output = os.path.join(
                out_dir,
                os.path.splitext(os.path.basename(src))[0] + ".pdf",
            )
            if os.path.normpath(expected_output) != os.path.normpath(dst):
                # os.replace overwrites an existing destination on every platform.
                os.replace(expected_output, dst)

        return convert_with_libreoffice

    ## Space for other DOCX to PDF converters if available

    # No tools found
    return None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Crop away the uniform background surrounding the image content.

    Args:
        image: Image to crop.
        bg_color: Background colour; defaults to the colour of pixel (0, 0).
        padding: Number of pixels to keep around the detected content,
            clamped to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when it does not differ from
        the background anywhere.
    """
    background_color = image.getpixel((0, 0)) if bg_color is None else bg_color

    background = Image.new(image.mode, image.size, background_color)
    content_box = ImageChops.difference(image, background).getbbox()

    # Nothing differs from the background: return the input untouched.
    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    crop_region = (
        max(0, x0 - padding),
        max(0, y0 - padding),
        min(image.width, x1 + padding),
        min(image.height, y1 + padding),
    )
    return image.crop(crop_region)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_pil_from_dml_docx(
    docx: Document, converter: Optional[Callable]
) -> Optional[Image.Image]:
    """Render the first page of ``docx`` to a whitespace-cropped PIL image.

    The document is saved to a temporary directory, converted to PDF with
    ``converter(input_path, output_path)`` and its first page is rendered at
    scale 2. The temporary directory and the PDF handles are released even
    when one of the steps raises.

    Args:
        docx: Document to render.
        converter: DOCX-to-PDF conversion callable, or ``None``.

    Returns:
        The cropped page image, or ``None`` when no converter is given.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Load PDF as PNG
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                image = crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            pdf.close()
    finally:
        # Always remove the scratch directory, also on failure.
        shutil.rmtree(temp_dir, ignore_errors=True)

    return image
|
|
File without changes
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
|
3
|
+
On 23/01/2025
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
# Single characters listed individually; presumably the ones needing special
# treatment in LaTeX output - confirm against the consumer of this module.
CHARS = tuple("{}_^#&$%~")

# Small string building blocks.
BLANK = ""
BACKSLASH = chr(0x5C)  # a single backslash character
ALN = "&"
|
|
11
|
+
|
|
12
|
+
# Maps a Unicode accent / grouping character to a LaTeX template.
# Every value is a ``str.format`` template with exactly one positional field
# ``{0}``; literal braces are therefore doubled (``{{`` / ``}}``), so
# ``"\\hat{{{0}}}".format("x")`` yields ``\hat{x}``.
CHR = {
    # Unicode : Latex Math Symbols
    # Top accents
    "\u0300": "\\grave{{{0}}}",
    "\u0301": "\\acute{{{0}}}",
    "\u0302": "\\hat{{{0}}}",
    "\u0303": "\\tilde{{{0}}}",
    "\u0304": "\\bar{{{0}}}",
    "\u0305": "\\overbar{{{0}}}",
    "\u0306": "\\breve{{{0}}}",
    "\u0307": "\\dot{{{0}}}",
    "\u0308": "\\ddot{{{0}}}",
    "\u0309": "\\ovhook{{{0}}}",
    # The next two templates used to end in an extra "}", which made
    # str.format raise "Single '}' encountered in format string".
    "\u030a": "\\ocirc{{{0}}}",
    "\u030c": "\\check{{{0}}}",
    "\u0310": "\\candra{{{0}}}",
    "\u0312": "\\oturnedcomma{{{0}}}",
    "\u0315": "\\ocommatopright{{{0}}}",
    "\u031a": "\\droang{{{0}}}",
    "\u0338": "\\not{{{0}}}",
    "\u20d0": "\\leftharpoonaccent{{{0}}}",
    "\u20d1": "\\rightharpoonaccent{{{0}}}",
    "\u20d2": "\\vertoverlay{{{0}}}",
    "\u20d6": "\\overleftarrow{{{0}}}",
    "\u20d7": "\\vec{{{0}}}",
    "\u20db": "\\dddot{{{0}}}",
    "\u20dc": "\\ddddot{{{0}}}",
    "\u20e1": "\\overleftrightarrow{{{0}}}",
    "\u20e7": "\\annuity{{{0}}}",
    "\u20e9": "\\widebridgeabove{{{0}}}",
    "\u20f0": "\\asteraccent{{{0}}}",
    # Bottom accents
    "\u0330": "\\wideutilde{{{0}}}",
    "\u0331": "\\underbar{{{0}}}",
    "\u20e8": "\\threeunderdot{{{0}}}",
    "\u20ec": "\\underrightharpoondown{{{0}}}",
    "\u20ed": "\\underleftharpoondown{{{0}}}",
    "\u20ee": "\\underleftarrow{{{0}}}",  # was misspelled "underledtarrow"
    "\u20ef": "\\underrightarrow{{{0}}}",
    # Over | group
    "\u23b4": "\\overbracket{{{0}}}",
    "\u23dc": "\\overparen{{{0}}}",
    "\u23de": "\\overbrace{{{0}}}",
    # Under| group
    "\u23b5": "\\underbracket{{{0}}}",
    "\u23dd": "\\underparen{{{0}}}",
    "\u23df": "\\underbrace{{{0}}}",
}
|
|
60
|
+
|
|
61
|
+
# Big (n-ary) operators: Unicode character -> LaTeX command.
# Unlike CHR, these values contain no format fields.
CHR_BO = {
    "\u2140": r"\Bbbsum",  # double-struck n-ary summation
    "\u220f": r"\prod",  # n-ary product
    "\u2210": r"\coprod",  # n-ary coproduct
    "\u2211": r"\sum",  # n-ary summation
    "\u222b": r"\int",  # integral
    "\u222c": r"\iint",  # double integral
    "\u222d": r"\iiint",  # triple integral
    "\u222e": r"\oint",  # contour integral
    "\u222f": r"\oiint",  # surface integral
    "\u2230": r"\oiiint",  # volume integral
    "\u22c0": r"\bigwedge",  # n-ary logical and
    "\u22c1": r"\bigvee",  # n-ary logical or
    "\u22c2": r"\bigcap",  # n-ary intersection
    "\u22c3": r"\bigcup",  # n-ary union
    "\u2a00": r"\bigodot",  # n-ary circled dot operator
    "\u2a01": r"\bigoplus",  # n-ary circled plus operator
    "\u2a02": r"\bigotimes",  # n-ary circled times operator
}
|
|
81
|
+
|
|
82
|
+
# Plain text translation table: Unicode character -> LaTeX replacement.
# LaTeX commands carry a trailing space so that following text is not
# absorbed into the command name.
T = {
    # Greek letters (Mathematical Italic Small Alpha .. Pi Symbol)
    "\U0001d6fc": "\\alpha ",
    "\U0001d6fd": "\\beta ",
    "\U0001d6fe": "\\gamma ",
    "\U0001d6ff": "\\delta ",  # U+1D6FF is italic delta (was "\\theta ")
    "\U0001d700": "\\epsilon ",
    "\U0001d701": "\\zeta ",
    "\U0001d702": "\\eta ",
    "\U0001d703": "\\theta ",
    "\U0001d704": "\\iota ",
    "\U0001d705": "\\kappa ",
    "\U0001d706": "\\lambda ",
    "\U0001d707": "\\mu ",  # was the non-existent "\\m "
    "\U0001d708": "\\nu ",  # was the non-existent "\\n "
    "\U0001d709": "\\xi ",
    "\U0001d70a": "\\omicron ",
    "\U0001d70b": "\\pi ",
    "\U0001d70c": "\\rho ",
    "\U0001d70d": "\\varsigma ",
    "\U0001d70e": "\\sigma ",
    "\U0001d70f": "\\tau ",  # was the non-existent "\\ta "
    "\U0001d710": "\\upsilon ",
    "\U0001d711": "\\phi ",
    "\U0001d712": "\\chi ",
    "\U0001d713": "\\psi ",
    "\U0001d714": "\\omega ",
    "\U0001d715": "\\partial ",
    "\U0001d716": "\\varepsilon ",
    "\U0001d717": "\\vartheta ",
    "\U0001d718": "\\varkappa ",
    "\U0001d719": "\\varphi ",
    "\U0001d71a": "\\varrho ",
    "\U0001d71b": "\\varpi ",
    # Relation symbols
    "\u2190": "\\leftarrow ",
    "\u2191": "\\uparrow ",
    "\u2192": "\\rightarrow ",
    "\u2193": "\\downarrow ",  # U+2193 is the downwards arrow (was "\\downright ")
    "\u2194": "\\leftrightarrow ",
    "\u2195": "\\updownarrow ",
    "\u2196": "\\nwarrow ",
    "\u2197": "\\nearrow ",
    "\u2198": "\\searrow ",
    "\u2199": "\\swarrow ",
    "\u22ee": "\\vdots ",
    "\u22ef": "\\cdots ",
    "\u22f0": "\\adots ",
    "\u22f1": "\\ddots ",
    "\u2260": "\\ne ",
    "\u2264": "\\leq ",
    "\u2265": "\\geq ",
    "\u2266": "\\leqq ",
    "\u2267": "\\geqq ",
    "\u2268": "\\lneqq ",
    "\u2269": "\\gneqq ",
    "\u226a": "\\ll ",
    "\u226b": "\\gg ",
    "\u2208": "\\in ",
    "\u2209": "\\notin ",
    "\u220b": "\\ni ",
    "\u220c": "\\nni ",
    # Ordinary symbols
    "\u221e": "\\infty ",
    # Binary relations
    "\u00b1": "\\pm ",
    "\u2213": "\\mp ",
    # Italic, Latin, uppercase
    "\U0001d434": "A",
    "\U0001d435": "B",
    "\U0001d436": "C",
    "\U0001d437": "D",
    "\U0001d438": "E",
    "\U0001d439": "F",
    "\U0001d43a": "G",
    "\U0001d43b": "H",
    "\U0001d43c": "I",
    "\U0001d43d": "J",
    "\U0001d43e": "K",
    "\U0001d43f": "L",
    "\U0001d440": "M",
    "\U0001d441": "N",
    "\U0001d442": "O",
    "\U0001d443": "P",
    "\U0001d444": "Q",
    "\U0001d445": "R",
    "\U0001d446": "S",
    "\U0001d447": "T",
    "\U0001d448": "U",
    "\U0001d449": "V",
    "\U0001d44a": "W",
    "\U0001d44b": "X",
    "\U0001d44c": "Y",
    "\U0001d44d": "Z",
    # Italic, Latin, lowercase
    # (U+1D455 is unassigned in Unicode, hence no entry between g and i.)
    "\U0001d44e": "a",
    "\U0001d44f": "b",
    "\U0001d450": "c",
    "\U0001d451": "d",
    "\U0001d452": "e",
    "\U0001d453": "f",
    "\U0001d454": "g",
    "\U0001d456": "i",
    "\U0001d457": "j",
    "\U0001d458": "k",
    "\U0001d459": "l",
    "\U0001d45a": "m",
    "\U0001d45b": "n",
    "\U0001d45c": "o",
    "\U0001d45d": "p",
    "\U0001d45e": "q",
    "\U0001d45f": "r",
    "\U0001d460": "s",
    "\U0001d461": "t",
    "\U0001d462": "u",
    "\U0001d463": "v",
    "\U0001d464": "w",
    "\U0001d465": "x",
    "\U0001d466": "y",
    "\U0001d467": "z",
}
|
|
203
|
+
|
|
204
|
+
# Function name -> LaTeX template; "{fe}" (see FUNC_PLACE) marks where the
# function argument goes.
FUNC = {
    "sin": r"\sin({fe})",
    "cos": r"\cos({fe})",
    "tan": r"\tan({fe})",
    "arcsin": r"\arcsin({fe})",
    "arccos": r"\arccos({fe})",
    "arctan": r"\arctan({fe})",
    "arccot": r"\arccot({fe})",
    "sinh": r"\sinh({fe})",
    "cosh": r"\cosh({fe})",
    "tanh": r"\tanh({fe})",
    "coth": r"\coth({fe})",
    "sec": r"\sec({fe})",
    "csc": r"\csc({fe})",
    "mod": r"\mod {fe}",
    "max": r"\max({fe})",
    "min": r"\min({fe})",
}

FUNC_PLACE = "{fe}"

# Two backslashes, i.e. a LaTeX line break.
BRK = r"\\"

# Fallback accent template.
CHR_DEFAULT = {
    "ACC_VAL": r"\hat{{{0}}}",
}

# Bar position -> template.
POS = {
    "top": r"\overline{{{0}}}",  # not sure
    "bot": r"\underline{{{0}}}",
}

POS_DEFAULT = {
    "BAR_VAL": r"\overline{{{0}}}",
}

# Subscript / superscript templates.
SUB = "_{{{0}}}"

SUP = "^{{{0}}}"

# Fraction type -> template with "num" and "den" fields.
F = {
    "bar": r"\frac{{{num}}}{{{den}}}",
    "skw": "^{{{num}}}/_{{{den}}}",
    "noBar": r"\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}",
    "lin": "{{{num}}}/{{{den}}}",
}
F_DEFAULT = r"\frac{{{num}}}{{{den}}}"

# Delimiter template and its default delimiter characters.
D = r"\left{left}{text}\right{right}"

D_DEFAULT = {
    "left": "(",
    "right": ")",
    "null": ".",
}

# Radical templates, with and without an explicit degree.
RAD = r"\sqrt[{deg}]{{{text}}}"
RAD_DEFAULT = r"\sqrt{{{text}}}"
ARR = "{text}"

# Limit-style functions; "lim" is the expression placed underneath.
LIM_FUNC = {
    "lim": r"\lim_{{{lim}}}",
    "max": r"\max_{{{lim}}}",
    "min": r"\min_{{{lim}}}",
}

LIM_TO = (r"\rightarrow", r"\to")

LIM_UPP = r"\overset{{{lim}}}{{{text}}}"

M = r"\begin{{matrix}}{text}\end{{matrix}}"
|