docling-2.69.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling might be problematic.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
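
The listing above covers the full converter stack: format backends (including the MS Word backend whose diff follows), pipelines, datamodels, and the top-level DocumentConverter. For orientation, here is a minimal usage sketch based on docling's documented converter API; the input path is a placeholder, and a .docx input is routed to MsWordDocumentBackend by the default format options.

from docling.document_converter import DocumentConverter

# Placeholder input path; any local .docx works the same way.
source = "report.docx"

converter = DocumentConverter()               # default options; DOCX is handled by MsWordDocumentBackend
result = converter.convert(source)            # conversion result wrapping a DoclingDocument
print(result.document.export_to_markdown())   # serialize the parsed document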
|
@@ -0,0 +1,1663 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from copy import deepcopy
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Callable, Final, Optional, Union
|
|
7
|
+
|
|
8
|
+
from docling_core.types.doc import (
|
|
9
|
+
ContentLayer,
|
|
10
|
+
DocItemLabel,
|
|
11
|
+
DoclingDocument,
|
|
12
|
+
DocumentOrigin,
|
|
13
|
+
GroupLabel,
|
|
14
|
+
ImageRef,
|
|
15
|
+
ListGroup,
|
|
16
|
+
NodeItem,
|
|
17
|
+
RefItem,
|
|
18
|
+
RichTableCell,
|
|
19
|
+
TableCell,
|
|
20
|
+
TableData,
|
|
21
|
+
TableItem,
|
|
22
|
+
)
|
|
23
|
+
from docling_core.types.doc.document import Formatting, Script
|
|
24
|
+
from docx import Document
|
|
25
|
+
from docx.document import Document as DocxDocument
|
|
26
|
+
from docx.oxml.table import CT_Tc
|
|
27
|
+
from docx.oxml.xmlchemy import BaseOxmlElement
|
|
28
|
+
from docx.styles.style import ParagraphStyle
|
|
29
|
+
from docx.table import Table, _Cell
|
|
30
|
+
from docx.text.hyperlink import Hyperlink
|
|
31
|
+
from docx.text.paragraph import Paragraph
|
|
32
|
+
from docx.text.run import Run
|
|
33
|
+
from lxml import etree
|
|
34
|
+
from PIL import Image, UnidentifiedImageError
|
|
35
|
+
from pydantic import AnyUrl
|
|
36
|
+
from typing_extensions import override
|
|
37
|
+
|
|
38
|
+
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
|
39
|
+
from docling.backend.docx.drawingml.utils import (
|
|
40
|
+
get_docx_to_pdf_converter,
|
|
41
|
+
get_pil_from_dml_docx,
|
|
42
|
+
)
|
|
43
|
+
from docling.backend.docx.latex.omml import oMath2Latex
|
|
44
|
+
from docling.datamodel.base_models import InputFormat
|
|
45
|
+
from docling.datamodel.document import InputDocument
|
|
46
|
+
|
|
47
|
+
_log = logging.getLogger(__name__)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
51
|
+
_BLIP_NAMESPACES: Final = {
|
|
52
|
+
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
|
53
|
+
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
|
54
|
+
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
|
55
|
+
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
|
|
56
|
+
"mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
|
|
57
|
+
"v": "urn:schemas-microsoft-com:vml",
|
|
58
|
+
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
|
|
59
|
+
"w10": "urn:schemas-microsoft-com:office:word",
|
|
60
|
+
"a14": "http://schemas.microsoft.com/office/drawing/2010/main",
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
@override
|
|
64
|
+
def __init__(
|
|
65
|
+
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
|
66
|
+
) -> None:
|
|
67
|
+
super().__init__(in_doc, path_or_stream)
|
|
68
|
+
self.XML_KEY = (
|
|
69
|
+
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
|
70
|
+
)
|
|
71
|
+
self.xml_namespaces = {
|
|
72
|
+
"w": "http://schemas.microsoft.com/office/word/2003/wordml"
|
|
73
|
+
}
|
|
74
|
+
self.blip_xpath_expr = etree.XPath(
|
|
75
|
+
".//a:blip", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
|
|
76
|
+
)
|
|
77
|
+
# self.initialise(path_or_stream)
|
|
78
|
+
# Word file:
|
|
79
|
+
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
|
80
|
+
self.valid: bool = False
|
|
81
|
+
# Initialise the parents for the hierarchy
|
|
82
|
+
self.max_levels: int = 10
|
|
83
|
+
self.level_at_new_list: Optional[int] = None
|
|
84
|
+
self.parents: dict[int, Optional[NodeItem]] = {}
|
|
85
|
+
self.numbered_headers: dict[int, int] = {}
|
|
86
|
+
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
|
87
|
+
# Track processed textbox elements to avoid duplication
|
|
88
|
+
self.processed_textbox_elements: list[int] = []
|
|
89
|
+
self.docx_to_pdf_converter: Optional[Callable] = None
|
|
90
|
+
self.docx_to_pdf_converter_init = False
|
|
91
|
+
self.display_drawingml_warning = True
|
|
92
|
+
|
|
93
|
+
for i in range(-1, self.max_levels):
|
|
94
|
+
self.parents[i] = None
|
|
95
|
+
|
|
96
|
+
self.level = 0
|
|
97
|
+
self.listIter = 0
|
|
98
|
+
# Track list counters per numId and ilvl
|
|
99
|
+
self.list_counters: dict[tuple[int, int], int] = {}
|
|
100
|
+
# Set starting content layer
|
|
101
|
+
self.content_layer = ContentLayer.BODY
|
|
102
|
+
|
|
103
|
+
self.history: dict[str, Any] = {
|
|
104
|
+
"names": [None],
|
|
105
|
+
"levels": [None],
|
|
106
|
+
"numids": [None],
|
|
107
|
+
"indents": [None],
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
self.docx_obj = self.load_msword_file(
|
|
111
|
+
path_or_stream=self.path_or_stream, document_hash=self.document_hash
|
|
112
|
+
)
|
|
113
|
+
if self.docx_obj:
|
|
114
|
+
self.valid = True
|
|
115
|
+
|
|
116
|
+
@override
|
|
117
|
+
def is_valid(self) -> bool:
|
|
118
|
+
return self.valid
|
|
119
|
+
|
|
120
|
+
@classmethod
|
|
121
|
+
@override
|
|
122
|
+
def supports_pagination(cls) -> bool:
|
|
123
|
+
return False
|
|
124
|
+
|
|
125
|
+
@override
|
|
126
|
+
def unload(self):
|
|
127
|
+
if isinstance(self.path_or_stream, BytesIO):
|
|
128
|
+
self.path_or_stream.close()
|
|
129
|
+
|
|
130
|
+
self.path_or_stream = None
|
|
131
|
+
|
|
132
|
+
@classmethod
|
|
133
|
+
@override
|
|
134
|
+
def supported_formats(cls) -> set[InputFormat]:
|
|
135
|
+
return {InputFormat.DOCX}
|
|
136
|
+
|
|
137
|
+
@override
|
|
138
|
+
def convert(self) -> DoclingDocument:
|
|
139
|
+
"""Parses the DOCX into a structured document model.
|
|
140
|
+
|
|
141
|
+
Returns:
|
|
142
|
+
The parsed document.
|
|
143
|
+
"""
|
|
144
|
+
|
|
145
|
+
origin = DocumentOrigin(
|
|
146
|
+
filename=self.file.name or "file",
|
|
147
|
+
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
148
|
+
binary_hash=self.document_hash,
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
|
152
|
+
if self.is_valid():
|
|
153
|
+
assert self.docx_obj is not None
|
|
154
|
+
doc, _ = self._walk_linear(self.docx_obj.element.body, doc)
|
|
155
|
+
self._add_header_footer(self.docx_obj, doc)
|
|
156
|
+
|
|
157
|
+
return doc
|
|
158
|
+
else:
|
|
159
|
+
raise RuntimeError(
|
|
160
|
+
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
@staticmethod
|
|
164
|
+
def load_msword_file(
|
|
165
|
+
path_or_stream: Union[BytesIO, Path], document_hash: str
|
|
166
|
+
) -> DocxDocument:
|
|
167
|
+
try:
|
|
168
|
+
if isinstance(path_or_stream, BytesIO):
|
|
169
|
+
return Document(path_or_stream)
|
|
170
|
+
elif isinstance(path_or_stream, Path):
|
|
171
|
+
return Document(str(path_or_stream))
|
|
172
|
+
else:
|
|
173
|
+
return None
|
|
174
|
+
except Exception as e:
|
|
175
|
+
raise RuntimeError(
|
|
176
|
+
f"MsWordDocumentBackend could not load document with hash {document_hash}"
|
|
177
|
+
) from e
|
|
178
|
+
|
|
179
|
+
def _update_history(
|
|
180
|
+
self,
|
|
181
|
+
name: str,
|
|
182
|
+
level: Optional[int],
|
|
183
|
+
numid: Optional[int],
|
|
184
|
+
ilevel: Optional[int],
|
|
185
|
+
):
|
|
186
|
+
self.history["names"].append(name)
|
|
187
|
+
self.history["levels"].append(level)
|
|
188
|
+
|
|
189
|
+
self.history["numids"].append(numid)
|
|
190
|
+
self.history["indents"].append(ilevel)
|
|
191
|
+
|
|
192
|
+
def _prev_name(self) -> Optional[str]:
|
|
193
|
+
return self.history["names"][-1]
|
|
194
|
+
|
|
195
|
+
def _prev_level(self) -> Optional[int]:
|
|
196
|
+
return self.history["levels"][-1]
|
|
197
|
+
|
|
198
|
+
def _prev_numid(self) -> Optional[int]:
|
|
199
|
+
return self.history["numids"][-1]
|
|
200
|
+
|
|
201
|
+
def _prev_indent(self) -> Optional[int]:
|
|
202
|
+
return self.history["indents"][-1]
|
|
203
|
+
|
|
204
|
+
def _get_level(self) -> int:
|
|
205
|
+
"""Return the first None index."""
|
|
206
|
+
for k, v in self.parents.items():
|
|
207
|
+
if k >= 0 and v is None:
|
|
208
|
+
return k
|
|
209
|
+
return 0
|
|
210
|
+
|
|
211
|
+
def _walk_linear(
|
|
212
|
+
self,
|
|
213
|
+
body: BaseOxmlElement,
|
|
214
|
+
doc: DoclingDocument,
|
|
215
|
+
# parent:
|
|
216
|
+
) -> tuple[DoclingDocument, list[RefItem]]:
|
|
217
|
+
added_elements = []
|
|
218
|
+
for element in body:
|
|
219
|
+
tag_name = etree.QName(element).localname
|
|
220
|
+
# Check for Inline Images (blip elements)
|
|
221
|
+
drawing_blip = self.blip_xpath_expr(element)
|
|
222
|
+
drawingml_els = element.findall(
|
|
223
|
+
".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
# Check for textbox content - check multiple textbox formats
|
|
227
|
+
# Only process if the element hasn't been processed before
|
|
228
|
+
element_id = id(element)
|
|
229
|
+
if element_id not in self.processed_textbox_elements:
|
|
230
|
+
# Modern Word textboxes
|
|
231
|
+
txbx_xpath = etree.XPath(
|
|
232
|
+
".//w:txbxContent|.//v:textbox//w:p",
|
|
233
|
+
namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
|
|
234
|
+
)
|
|
235
|
+
textbox_elements = txbx_xpath(element)
|
|
236
|
+
|
|
237
|
+
# No modern textboxes found, check for alternate/legacy textbox formats
|
|
238
|
+
if not textbox_elements and tag_name in ["drawing", "pict"]:
|
|
239
|
+
# Additional checks for textboxes in DrawingML and VML formats
|
|
240
|
+
alt_txbx_xpath = etree.XPath(
|
|
241
|
+
".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
|
|
242
|
+
namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
|
|
243
|
+
)
|
|
244
|
+
textbox_elements = alt_txbx_xpath(element)
|
|
245
|
+
|
|
246
|
+
# Check for shape text that's not in a standard textbox
|
|
247
|
+
if not textbox_elements:
|
|
248
|
+
shape_text_xpath = etree.XPath(
|
|
249
|
+
".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
|
|
250
|
+
namespaces=MsWordDocumentBackend._BLIP_NAMESPACES,
|
|
251
|
+
)
|
|
252
|
+
shape_text_elements = shape_text_xpath(element)
|
|
253
|
+
if shape_text_elements:
|
|
254
|
+
# Create custom text elements from shape text
|
|
255
|
+
text_content = " ".join(
|
|
256
|
+
[t.text for t in shape_text_elements if t.text]
|
|
257
|
+
)
|
|
258
|
+
if text_content.strip():
|
|
259
|
+
_log.debug(f"Found shape text: {text_content[:50]}...")
|
|
260
|
+
# Create a paragraph-like element to process with standard handler
|
|
261
|
+
level = self._get_level()
|
|
262
|
+
shape_group = doc.add_group(
|
|
263
|
+
label=GroupLabel.SECTION,
|
|
264
|
+
parent=self.parents[level - 1],
|
|
265
|
+
name="shape-text",
|
|
266
|
+
content_layer=self.content_layer,
|
|
267
|
+
)
|
|
268
|
+
added_elements.append(shape_group.get_ref())
|
|
269
|
+
doc.add_text(
|
|
270
|
+
label=DocItemLabel.TEXT,
|
|
271
|
+
parent=shape_group,
|
|
272
|
+
text=text_content,
|
|
273
|
+
content_layer=self.content_layer,
|
|
274
|
+
)
|
|
275
|
+
|
|
276
|
+
if textbox_elements:
|
|
277
|
+
# Mark the parent element as processed
|
|
278
|
+
self.processed_textbox_elements.append(element_id)
|
|
279
|
+
# Also mark all found textbox elements as processed
|
|
280
|
+
for tb_element in textbox_elements:
|
|
281
|
+
self.processed_textbox_elements.append(id(tb_element))
|
|
282
|
+
|
|
283
|
+
_log.debug(
|
|
284
|
+
f"Found textbox content with {len(textbox_elements)} elements"
|
|
285
|
+
)
|
|
286
|
+
tbc = self._handle_textbox_content(textbox_elements, doc)
|
|
287
|
+
added_elements.extend(tbc)
|
|
288
|
+
|
|
289
|
+
# Check for Tables
|
|
290
|
+
if tag_name == "tbl":
|
|
291
|
+
try:
|
|
292
|
+
t = self._handle_tables(element, doc)
|
|
293
|
+
added_elements.extend(t)
|
|
294
|
+
except Exception:
|
|
295
|
+
_log.debug("could not parse a table, broken docx table")
|
|
296
|
+
# Check for Image
|
|
297
|
+
elif drawing_blip:
|
|
298
|
+
pics = self._handle_pictures(drawing_blip, doc)
|
|
299
|
+
added_elements.extend(pics)
|
|
300
|
+
# Check for Text after the Image
|
|
301
|
+
if (
|
|
302
|
+
tag_name == "p"
|
|
303
|
+
and element.find(
|
|
304
|
+
".//w:t", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
|
|
305
|
+
)
|
|
306
|
+
is not None
|
|
307
|
+
):
|
|
308
|
+
te1 = self._handle_text_elements(element, doc)
|
|
309
|
+
added_elements.extend(te1)
|
|
310
|
+
# Check for DrawingML elements
|
|
311
|
+
elif drawingml_els:
|
|
312
|
+
if (
|
|
313
|
+
self.docx_to_pdf_converter is None
|
|
314
|
+
and self.docx_to_pdf_converter_init is False
|
|
315
|
+
):
|
|
316
|
+
self.docx_to_pdf_converter = get_docx_to_pdf_converter()
|
|
317
|
+
self.docx_to_pdf_converter_init = True
|
|
318
|
+
|
|
319
|
+
if self.docx_to_pdf_converter is None:
|
|
320
|
+
if self.display_drawingml_warning:
|
|
321
|
+
if self.docx_to_pdf_converter is None:
|
|
322
|
+
_log.warning(
|
|
323
|
+
"Found DrawingML elements in document, but no DOCX to PDF converters. "
|
|
324
|
+
"If you want these exported, make sure you have "
|
|
325
|
+
"LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
|
|
326
|
+
)
|
|
327
|
+
self.display_drawingml_warning = False
|
|
328
|
+
else:
|
|
329
|
+
self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
|
|
330
|
+
# Check for the sdt containers, like table of contents
|
|
331
|
+
elif tag_name == "sdt":
|
|
332
|
+
sdt_content = element.find(
|
|
333
|
+
".//w:sdtContent", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
|
|
334
|
+
)
|
|
335
|
+
if sdt_content is not None:
|
|
336
|
+
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
|
|
337
|
+
paragraphs = sdt_content.findall(
|
|
338
|
+
".//w:p", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
|
|
339
|
+
)
|
|
340
|
+
for p in paragraphs:
|
|
341
|
+
te = self._handle_text_elements(p, doc)
|
|
342
|
+
added_elements.extend(te)
|
|
343
|
+
# Check for Text
|
|
344
|
+
elif tag_name == "p":
|
|
345
|
+
# "tcPr", "sectPr"
|
|
346
|
+
te = self._handle_text_elements(element, doc)
|
|
347
|
+
added_elements.extend(te)
|
|
348
|
+
else:
|
|
349
|
+
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
|
350
|
+
|
|
351
|
+
return doc, added_elements
|
|
352
|
+
|
|
353
|
+
def _str_to_int(
|
|
354
|
+
self, s: Optional[str], default: Optional[int] = 0
|
|
355
|
+
) -> Optional[int]:
|
|
356
|
+
if s is None:
|
|
357
|
+
return None
|
|
358
|
+
try:
|
|
359
|
+
return int(s)
|
|
360
|
+
except ValueError:
|
|
361
|
+
return default
|
|
362
|
+
|
|
363
|
+
def _split_text_and_number(self, input_string: str) -> list[str]:
|
|
364
|
+
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
|
365
|
+
if match:
|
|
366
|
+
parts = list(filter(None, match.groups()))
|
|
367
|
+
return parts
|
|
368
|
+
else:
|
|
369
|
+
return [input_string]
|
|
370
|
+
|
|
371
|
+
def _get_numId_and_ilvl(
|
|
372
|
+
self, paragraph: Paragraph
|
|
373
|
+
) -> tuple[Optional[int], Optional[int]]:
|
|
374
|
+
# Access the XML element of the paragraph
|
|
375
|
+
numPr = paragraph._element.find(
|
|
376
|
+
".//w:numPr", namespaces=paragraph._element.nsmap
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
if numPr is not None:
|
|
380
|
+
# Get the numId element and extract the value
|
|
381
|
+
numId_elem = numPr.find("w:numId", namespaces=paragraph._element.nsmap)
|
|
382
|
+
ilvl_elem = numPr.find("w:ilvl", namespaces=paragraph._element.nsmap)
|
|
383
|
+
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
|
384
|
+
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
|
385
|
+
|
|
386
|
+
return self._str_to_int(numId, None), self._str_to_int(ilvl, None)
|
|
387
|
+
|
|
388
|
+
return None, None # If the paragraph is not part of a list
|
|
389
|
+
|
|
390
|
+
def _get_list_counter(self, numid: int, ilvl: int) -> int:
|
|
391
|
+
"""Get and increment the counter for a specific numId and ilvl combination."""
|
|
392
|
+
key = (numid, ilvl)
|
|
393
|
+
if key not in self.list_counters:
|
|
394
|
+
self.list_counters[key] = 0
|
|
395
|
+
self.list_counters[key] += 1
|
|
396
|
+
return self.list_counters[key]
|
|
397
|
+
|
|
398
|
+
def _reset_list_counters_for_new_sequence(self, numid: int):
|
|
399
|
+
"""Reset counters when starting a new numbering sequence."""
|
|
400
|
+
# Reset all counters for this numid
|
|
401
|
+
keys_to_reset = [key for key in self.list_counters.keys() if key[0] == numid]
|
|
402
|
+
for key in keys_to_reset:
|
|
403
|
+
self.list_counters[key] = 0
|
|
404
|
+
|
|
405
|
+
def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
|
|
406
|
+
"""Check if a list is numbered based on its numFmt value."""
|
|
407
|
+
try:
|
|
408
|
+
# Access the numbering part of the document
|
|
409
|
+
if not hasattr(self.docx_obj, "part") or not hasattr(
|
|
410
|
+
self.docx_obj.part, "package"
|
|
411
|
+
):
|
|
412
|
+
return False
|
|
413
|
+
|
|
414
|
+
numbering_part = None
|
|
415
|
+
# Find the numbering part
|
|
416
|
+
for part in self.docx_obj.part.package.parts:
|
|
417
|
+
if "numbering" in part.partname:
|
|
418
|
+
numbering_part = part
|
|
419
|
+
break
|
|
420
|
+
|
|
421
|
+
if numbering_part is None:
|
|
422
|
+
return False
|
|
423
|
+
|
|
424
|
+
# Parse the numbering XML
|
|
425
|
+
numbering_root = numbering_part.element
|
|
426
|
+
namespaces = {
|
|
427
|
+
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
# Find the numbering definition with the given numId
|
|
431
|
+
num_xpath = f".//w:num[@w:numId='{numId}']"
|
|
432
|
+
num_element = numbering_root.find(num_xpath, namespaces=namespaces)
|
|
433
|
+
|
|
434
|
+
if num_element is None:
|
|
435
|
+
return False
|
|
436
|
+
|
|
437
|
+
# Get the abstractNumId from the num element
|
|
438
|
+
abstract_num_id_elem = num_element.find(
|
|
439
|
+
".//w:abstractNumId", namespaces=namespaces
|
|
440
|
+
)
|
|
441
|
+
if abstract_num_id_elem is None:
|
|
442
|
+
return False
|
|
443
|
+
|
|
444
|
+
abstract_num_id = abstract_num_id_elem.get(
|
|
445
|
+
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
|
446
|
+
)
|
|
447
|
+
if abstract_num_id is None:
|
|
448
|
+
return False
|
|
449
|
+
|
|
450
|
+
# Find the abstract numbering definition
|
|
451
|
+
abstract_num_xpath = (
|
|
452
|
+
f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']"
|
|
453
|
+
)
|
|
454
|
+
abstract_num_element = numbering_root.find(
|
|
455
|
+
abstract_num_xpath, namespaces=namespaces
|
|
456
|
+
)
|
|
457
|
+
|
|
458
|
+
if abstract_num_element is None:
|
|
459
|
+
return False
|
|
460
|
+
|
|
461
|
+
# Find the level definition for the given ilvl
|
|
462
|
+
lvl_xpath = f".//w:lvl[@w:ilvl='{ilvl}']"
|
|
463
|
+
lvl_element = abstract_num_element.find(lvl_xpath, namespaces=namespaces)
|
|
464
|
+
|
|
465
|
+
if lvl_element is None:
|
|
466
|
+
return False
|
|
467
|
+
|
|
468
|
+
# Get the numFmt element
|
|
469
|
+
num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
|
|
470
|
+
if num_fmt_element is None:
|
|
471
|
+
return False
|
|
472
|
+
|
|
473
|
+
num_fmt = num_fmt_element.get(
|
|
474
|
+
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
|
475
|
+
)
|
|
476
|
+
|
|
477
|
+
# Numbered formats include: decimal, lowerRoman, upperRoman, lowerLetter, upperLetter
|
|
478
|
+
# Bullet formats include: bullet
|
|
479
|
+
numbered_formats = {
|
|
480
|
+
"decimal",
|
|
481
|
+
"lowerRoman",
|
|
482
|
+
"upperRoman",
|
|
483
|
+
"lowerLetter",
|
|
484
|
+
"upperLetter",
|
|
485
|
+
"decimalZero",
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
return num_fmt in numbered_formats
|
|
489
|
+
|
|
490
|
+
except Exception as e:
|
|
491
|
+
_log.debug(f"Error determining if list is numbered: {e}")
|
|
492
|
+
return False
|
|
493
|
+
|
|
494
|
+
def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
|
|
495
|
+
parts = self._split_text_and_number(style_label)
|
|
496
|
+
|
|
497
|
+
if len(parts) == 2:
|
|
498
|
+
parts.sort()
|
|
499
|
+
label_str: str = ""
|
|
500
|
+
label_level: Optional[int] = 0
|
|
501
|
+
if parts[0].strip().lower() == "heading":
|
|
502
|
+
label_str = "Heading"
|
|
503
|
+
label_level = self._str_to_int(parts[1], None)
|
|
504
|
+
if parts[1].strip().lower() == "heading":
|
|
505
|
+
label_str = "Heading"
|
|
506
|
+
label_level = self._str_to_int(parts[0], None)
|
|
507
|
+
return label_str, label_level
|
|
508
|
+
|
|
509
|
+
return style_label, None
|
|
510
|
+
|
|
511
|
+
def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
|
|
512
|
+
if paragraph.style is None:
|
|
513
|
+
return "Normal", None
|
|
514
|
+
|
|
515
|
+
label: str = paragraph.style.style_id
|
|
516
|
+
name: str = paragraph.style.name or ""
|
|
517
|
+
base_style_label: Optional[str] = None
|
|
518
|
+
base_style_name: Optional[str] = None
|
|
519
|
+
if isinstance(
|
|
520
|
+
base_style := getattr(paragraph.style, "base_style", None), ParagraphStyle
|
|
521
|
+
):
|
|
522
|
+
base_style_label = base_style.style_id
|
|
523
|
+
base_style_name = base_style.name
|
|
524
|
+
|
|
525
|
+
if not label:
|
|
526
|
+
return "Normal", None
|
|
527
|
+
|
|
528
|
+
if ":" in label:
|
|
529
|
+
parts = label.split(":")
|
|
530
|
+
if len(parts) == 2:
|
|
531
|
+
return parts[0], self._str_to_int(parts[1], None)
|
|
532
|
+
|
|
533
|
+
if "heading" in label.lower():
|
|
534
|
+
return self._get_heading_and_level(label)
|
|
535
|
+
if "heading" in name.lower():
|
|
536
|
+
return self._get_heading_and_level(name)
|
|
537
|
+
if base_style_label and "heading" in base_style_label.lower():
|
|
538
|
+
return self._get_heading_and_level(base_style_label)
|
|
539
|
+
if base_style_name and "heading" in base_style_name.lower():
|
|
540
|
+
return self._get_heading_and_level(base_style_name)
|
|
541
|
+
|
|
542
|
+
return label, None
|
|
543
|
+
|
|
544
|
+
@classmethod
|
|
545
|
+
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
|
|
546
|
+
# The .bold and .italic properties are booleans, but .underline can be an enum
|
|
547
|
+
# like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
|
|
548
|
+
is_bold = run.bold or False
|
|
549
|
+
is_italic = run.italic or False
|
|
550
|
+
is_strikethrough = run.font.strike or False
|
|
551
|
+
# Convert any non-None underline value to True
|
|
552
|
+
is_underline = bool(run.underline is not None and run.underline)
|
|
553
|
+
is_sub = run.font.subscript or False
|
|
554
|
+
is_sup = run.font.superscript or False
|
|
555
|
+
script = Script.SUB if is_sub else Script.SUPER if is_sup else Script.BASELINE
|
|
556
|
+
|
|
557
|
+
return Formatting(
|
|
558
|
+
bold=is_bold,
|
|
559
|
+
italic=is_italic,
|
|
560
|
+
underline=is_underline,
|
|
561
|
+
strikethrough=is_strikethrough,
|
|
562
|
+
script=script,
|
|
563
|
+
)
|
|
564
|
+
|
|
565
|
+
def _get_paragraph_elements(self, paragraph: Paragraph):
|
|
566
|
+
"""
|
|
567
|
+
Extract paragraph elements along with their formatting and hyperlink
|
|
568
|
+
"""
|
|
569
|
+
|
|
570
|
+
# for now retain empty paragraphs for backwards compatibility:
|
|
571
|
+
if paragraph.text.strip() == "":
|
|
572
|
+
return [("", None, None)]
|
|
573
|
+
|
|
574
|
+
paragraph_elements: list[
|
|
575
|
+
tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]]
|
|
576
|
+
] = []
|
|
577
|
+
group_text = ""
|
|
578
|
+
previous_format = None
|
|
579
|
+
|
|
580
|
+
# Iterate over the runs of the paragraph and group them by format
|
|
581
|
+
for c in paragraph.iter_inner_content():
|
|
582
|
+
if isinstance(c, Hyperlink):
|
|
583
|
+
text = c.text
|
|
584
|
+
hyperlink = Path(c.address)
|
|
585
|
+
format = (
|
|
586
|
+
self._get_format_from_run(c.runs[0])
|
|
587
|
+
if c.runs and len(c.runs) > 0
|
|
588
|
+
else None
|
|
589
|
+
)
|
|
590
|
+
elif isinstance(c, Run):
|
|
591
|
+
text = c.text
|
|
592
|
+
hyperlink = None
|
|
593
|
+
format = self._get_format_from_run(c)
|
|
594
|
+
else:
|
|
595
|
+
continue
|
|
596
|
+
|
|
597
|
+
if (len(text.strip()) and format != previous_format) or (
|
|
598
|
+
hyperlink is not None
|
|
599
|
+
):
|
|
600
|
+
# If the style changes for a non empty text, add the previous group
|
|
601
|
+
if len(group_text.strip()) > 0:
|
|
602
|
+
paragraph_elements.append(
|
|
603
|
+
(group_text.strip(), previous_format, None)
|
|
604
|
+
)
|
|
605
|
+
group_text = ""
|
|
606
|
+
|
|
607
|
+
# If there is a hyperlink, add it immediately
|
|
608
|
+
if hyperlink is not None:
|
|
609
|
+
paragraph_elements.append((text.strip(), format, hyperlink))
|
|
610
|
+
text = ""
|
|
611
|
+
else:
|
|
612
|
+
previous_format = format
|
|
613
|
+
|
|
614
|
+
group_text += text
|
|
615
|
+
|
|
616
|
+
# Format the last group
|
|
617
|
+
if len(group_text.strip()) > 0:
|
|
618
|
+
paragraph_elements.append((group_text.strip(), format, None))
|
|
619
|
+
|
|
620
|
+
return paragraph_elements
|
|
621
|
+
|
|
622
|
+
def _get_paragraph_position(self, paragraph_element):
|
|
623
|
+
"""Extract vertical position information from paragraph element."""
|
|
624
|
+
# First try to directly get the index from w:p element that has an order-related attribute
|
|
625
|
+
if (
|
|
626
|
+
hasattr(paragraph_element, "getparent")
|
|
627
|
+
and paragraph_element.getparent() is not None
|
|
628
|
+
):
|
|
629
|
+
parent = paragraph_element.getparent()
|
|
630
|
+
# Get all paragraph siblings
|
|
631
|
+
paragraphs = [
|
|
632
|
+
p for p in parent.getchildren() if etree.QName(p).localname == "p"
|
|
633
|
+
]
|
|
634
|
+
# Find index of current paragraph within its siblings
|
|
635
|
+
try:
|
|
636
|
+
paragraph_index = paragraphs.index(paragraph_element)
|
|
637
|
+
return paragraph_index # Use index as position for consistent ordering
|
|
638
|
+
except ValueError:
|
|
639
|
+
pass
|
|
640
|
+
|
|
641
|
+
# Look for position hints in element attributes and ancestor elements
|
|
642
|
+
for elem in (*[paragraph_element], *paragraph_element.iterancestors()):
|
|
643
|
+
# Check for direct position attributes
|
|
644
|
+
for attr_name in ["y", "top", "positionY", "y-position", "position"]:
|
|
645
|
+
value = elem.get(attr_name)
|
|
646
|
+
if value:
|
|
647
|
+
try:
|
|
648
|
+
# Remove any non-numeric characters (like 'pt', 'px', etc.)
|
|
649
|
+
clean_value = re.sub(r"[^0-9.]", "", value)
|
|
650
|
+
if clean_value:
|
|
651
|
+
return float(clean_value)
|
|
652
|
+
except (ValueError, TypeError):
|
|
653
|
+
pass
|
|
654
|
+
|
|
655
|
+
# Check for position in transform attribute
|
|
656
|
+
transform = elem.get("transform")
|
|
657
|
+
if transform:
|
|
658
|
+
# Extract translation component from transform matrix
|
|
659
|
+
match = re.search(r"translate\([^,]+,\s*([0-9.]+)", transform)
|
|
660
|
+
if match:
|
|
661
|
+
try:
|
|
662
|
+
return float(match.group(1))
|
|
663
|
+
except ValueError:
|
|
664
|
+
pass
|
|
665
|
+
|
|
666
|
+
# Check for anchors or relative position indicators in Word format
|
|
667
|
+
# 'dist' attributes can indicate relative positioning
|
|
668
|
+
for attr_name in ["distT", "distB", "anchor", "relativeFrom"]:
|
|
669
|
+
if elem.get(attr_name) is not None:
|
|
670
|
+
return elem.sourceline # Use the XML source line number as fallback
|
|
671
|
+
|
|
672
|
+
# For VML shapes, look for specific attributes
|
|
673
|
+
for ns_uri in paragraph_element.nsmap.values():
|
|
674
|
+
if "vml" in ns_uri:
|
|
675
|
+
# Try to extract position from style attribute
|
|
676
|
+
style = paragraph_element.get("style")
|
|
677
|
+
if style:
|
|
678
|
+
match = re.search(r"top:([0-9.]+)pt", style)
|
|
679
|
+
if match:
|
|
680
|
+
try:
|
|
681
|
+
return float(match.group(1))
|
|
682
|
+
except ValueError:
|
|
683
|
+
pass
|
|
684
|
+
|
|
685
|
+
# If no better position indicator found, use XML source line number as proxy for order
|
|
686
|
+
return (
|
|
687
|
+
paragraph_element.sourceline
|
|
688
|
+
if hasattr(paragraph_element, "sourceline")
|
|
689
|
+
else None
|
|
690
|
+
)
|
|
691
|
+
|
|
692
|
+
def _collect_textbox_paragraphs(self, textbox_elements):
|
|
693
|
+
"""Collect and organize paragraphs from textbox elements."""
|
|
694
|
+
processed_paragraphs = []
|
|
695
|
+
container_paragraphs = {}
|
|
696
|
+
|
|
697
|
+
for element in textbox_elements:
|
|
698
|
+
element_id = id(element)
|
|
699
|
+
# Skip if we've already processed this exact element
|
|
700
|
+
if element_id in processed_paragraphs:
|
|
701
|
+
continue
|
|
702
|
+
|
|
703
|
+
tag_name = etree.QName(element).localname
|
|
704
|
+
processed_paragraphs.append(element_id)
|
|
705
|
+
|
|
706
|
+
# Handle paragraphs directly found (VML textboxes)
|
|
707
|
+
if tag_name == "p":
|
|
708
|
+
# Find the containing textbox or shape element
|
|
709
|
+
container_id = None
|
|
710
|
+
for ancestor in element.iterancestors():
|
|
711
|
+
if any(ns in ancestor.tag for ns in ["textbox", "shape", "txbx"]):
|
|
712
|
+
container_id = id(ancestor)
|
|
713
|
+
break
|
|
714
|
+
|
|
715
|
+
if container_id not in container_paragraphs:
|
|
716
|
+
container_paragraphs[container_id] = []
|
|
717
|
+
container_paragraphs[container_id].append(
|
|
718
|
+
(element, self._get_paragraph_position(element))
|
|
719
|
+
)
|
|
720
|
+
|
|
721
|
+
# Handle txbxContent elements (Word DrawingML textboxes)
|
|
722
|
+
elif tag_name == "txbxContent":
|
|
723
|
+
paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
|
|
724
|
+
container_id = id(element)
|
|
725
|
+
if container_id not in container_paragraphs:
|
|
726
|
+
container_paragraphs[container_id] = []
|
|
727
|
+
|
|
728
|
+
for p in paragraphs:
|
|
729
|
+
p_id = id(p)
|
|
730
|
+
if p_id not in processed_paragraphs:
|
|
731
|
+
processed_paragraphs.append(p_id)
|
|
732
|
+
container_paragraphs[container_id].append(
|
|
733
|
+
(p, self._get_paragraph_position(p))
|
|
734
|
+
)
|
|
735
|
+
else:
|
|
736
|
+
# Try to extract any paragraphs from unknown elements
|
|
737
|
+
paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
|
|
738
|
+
container_id = id(element)
|
|
739
|
+
if container_id not in container_paragraphs:
|
|
740
|
+
container_paragraphs[container_id] = []
|
|
741
|
+
|
|
742
|
+
for p in paragraphs:
|
|
743
|
+
p_id = id(p)
|
|
744
|
+
if p_id not in processed_paragraphs:
|
|
745
|
+
processed_paragraphs.append(p_id)
|
|
746
|
+
container_paragraphs[container_id].append(
|
|
747
|
+
(p, self._get_paragraph_position(p))
|
|
748
|
+
)
|
|
749
|
+
|
|
750
|
+
return container_paragraphs
|
|
751
|
+
|
|
752
|
+
def _handle_textbox_content(
|
|
753
|
+
self,
|
|
754
|
+
textbox_elements: list,
|
|
755
|
+
doc: DoclingDocument,
|
|
756
|
+
) -> list[RefItem]:
|
|
757
|
+
elem_ref: list[RefItem] = []
|
|
758
|
+
"""Process textbox content and add it to the document structure."""
|
|
759
|
+
level = self._get_level()
|
|
760
|
+
# Create a textbox group to contain all text from the textbox
|
|
761
|
+
textbox_group = doc.add_group(
|
|
762
|
+
label=GroupLabel.SECTION,
|
|
763
|
+
parent=self.parents[level - 1],
|
|
764
|
+
name="textbox",
|
|
765
|
+
content_layer=self.content_layer,
|
|
766
|
+
)
|
|
767
|
+
elem_ref.append(textbox_group.get_ref())
|
|
768
|
+
# Set this as the current parent to ensure textbox content
|
|
769
|
+
# is properly nested in document structure
|
|
770
|
+
original_parent = self.parents[level]
|
|
771
|
+
self.parents[level] = textbox_group
|
|
772
|
+
|
|
773
|
+
# Collect and organize paragraphs
|
|
774
|
+
container_paragraphs = self._collect_textbox_paragraphs(textbox_elements)
|
|
775
|
+
|
|
776
|
+
# Process all paragraphs
|
|
777
|
+
all_paragraphs = []
|
|
778
|
+
|
|
779
|
+
# Sort paragraphs within each container, then process containers
|
|
780
|
+
for paragraphs in container_paragraphs.values():
|
|
781
|
+
# Sort by vertical position within each container
|
|
782
|
+
sorted_container_paragraphs = sorted(
|
|
783
|
+
paragraphs,
|
|
784
|
+
key=lambda x: (
|
|
785
|
+
x[1] is None,
|
|
786
|
+
x[1] if x[1] is not None else float("inf"),
|
|
787
|
+
),
|
|
788
|
+
)
|
|
789
|
+
|
|
790
|
+
# Add the sorted paragraphs to our processing list
|
|
791
|
+
all_paragraphs.extend(sorted_container_paragraphs)
|
|
792
|
+
|
|
793
|
+
# Track processed paragraphs to avoid duplicates (same content and position)
|
|
794
|
+
processed_paragraphs = set()
|
|
795
|
+
|
|
796
|
+
# Process all the paragraphs
|
|
797
|
+
for p, position in all_paragraphs:
|
|
798
|
+
# Create paragraph object to get text content
|
|
799
|
+
paragraph = Paragraph(p, self.docx_obj)
|
|
800
|
+
text_content = paragraph.text
|
|
801
|
+
|
|
802
|
+
# Create a unique identifier based on content and position
|
|
803
|
+
paragraph_id = (text_content, position)
|
|
804
|
+
|
|
805
|
+
# Skip if this paragraph (same content and position) was already processed
|
|
806
|
+
if paragraph_id in processed_paragraphs:
|
|
807
|
+
_log.debug(
|
|
808
|
+
f"Skipping duplicate paragraph: content='{text_content[:50]}...', position={position}"
|
|
809
|
+
)
|
|
810
|
+
continue
|
|
811
|
+
|
|
812
|
+
# Mark this paragraph as processed
|
|
813
|
+
processed_paragraphs.add(paragraph_id)
|
|
814
|
+
|
|
815
|
+
elem_ref.extend(self._handle_text_elements(p, doc))
|
|
816
|
+
|
|
817
|
+
# Restore original parent
|
|
818
|
+
self.parents[level] = original_parent
|
|
819
|
+
return elem_ref
|
|
820
|
+
|
|
821
|
+
def _handle_equations_in_text(self, element, text):
|
|
822
|
+
only_texts = []
|
|
823
|
+
only_equations = []
|
|
824
|
+
texts_and_equations = []
|
|
825
|
+
for subt in element.iter():
|
|
826
|
+
tag_name = etree.QName(subt).localname
|
|
827
|
+
if tag_name == "t" and "math" not in subt.tag:
|
|
828
|
+
if isinstance(subt.text, str):
|
|
829
|
+
only_texts.append(subt.text)
|
|
830
|
+
texts_and_equations.append(subt.text)
|
|
831
|
+
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
|
|
832
|
+
latex_equation = str(oMath2Latex(subt)).strip()
|
|
833
|
+
if len(latex_equation) > 0:
|
|
834
|
+
only_equations.append(
|
|
835
|
+
self.equation_bookends.format(EQ=latex_equation)
|
|
836
|
+
)
|
|
837
|
+
texts_and_equations.append(
|
|
838
|
+
self.equation_bookends.format(EQ=latex_equation)
|
|
839
|
+
)
|
|
840
|
+
|
|
841
|
+
if len(only_equations) < 1:
|
|
842
|
+
return text, []
|
|
843
|
+
|
|
844
|
+
if (
|
|
845
|
+
re.sub(r"\s+", "", "".join(only_texts)).strip()
|
|
846
|
+
!= re.sub(r"\s+", "", text).strip()
|
|
847
|
+
):
|
|
848
|
+
# If we are not able to reconstruct the initial raw text
|
|
849
|
+
# do not try to parse equations and return the original
|
|
850
|
+
return text, []
|
|
851
|
+
|
|
852
|
+
# Insert equations into original text
|
|
853
|
+
# This is done to preserve white space structure
|
|
854
|
+
output_text = text[:]
|
|
855
|
+
init_i = 0
|
|
856
|
+
for i_substr, substr in enumerate(texts_and_equations):
|
|
857
|
+
if len(substr) == 0:
|
|
858
|
+
continue
|
|
859
|
+
|
|
860
|
+
if substr in output_text[init_i:]:
|
|
861
|
+
init_i += output_text[init_i:].find(substr) + len(substr)
|
|
862
|
+
else:
|
|
863
|
+
if i_substr > 0:
|
|
864
|
+
output_text = output_text[:init_i] + substr + output_text[init_i:]
|
|
865
|
+
init_i += len(substr)
|
|
866
|
+
else:
|
|
867
|
+
output_text = substr + output_text
|
|
868
|
+
|
|
869
|
+
return output_text, only_equations
|
|
870
|
+
|
|
871
|
+
def _create_or_reuse_parent(
|
|
872
|
+
self,
|
|
873
|
+
*,
|
|
874
|
+
doc: DoclingDocument,
|
|
875
|
+
prev_parent: Optional[NodeItem],
|
|
876
|
+
paragraph_elements: list,
|
|
877
|
+
) -> Optional[NodeItem]:
|
|
878
|
+
return (
|
|
879
|
+
doc.add_inline_group(parent=prev_parent, content_layer=self.content_layer)
|
|
880
|
+
if len(paragraph_elements) > 1
|
|
881
|
+
else prev_parent
|
|
882
|
+
)
|
|
883
|
+
|
|
884
|
+
def _handle_text_elements(
|
|
885
|
+
self,
|
|
886
|
+
element: BaseOxmlElement,
|
|
887
|
+
doc: DoclingDocument,
|
|
888
|
+
) -> list[RefItem]:
|
|
889
|
+
elem_ref: list[RefItem] = []
|
|
890
|
+
paragraph = Paragraph(element, self.docx_obj)
|
|
891
|
+
paragraph_elements = self._get_paragraph_elements(paragraph)
|
|
892
|
+
text, equations = self._handle_equations_in_text(
|
|
893
|
+
element=element, text=paragraph.text
|
|
894
|
+
)
|
|
895
|
+
|
|
896
|
+
if text is None:
|
|
897
|
+
return elem_ref
|
|
898
|
+
text = text.strip()
|
|
899
|
+
|
|
900
|
+
# Common styles for bullet and numbered lists.
|
|
901
|
+
# "List Bullet", "List Number", "List Paragraph"
|
|
902
|
+
# Identify whether list is a numbered list or not
|
|
903
|
+
p_style_id, p_level = self._get_label_and_level(paragraph)
|
|
904
|
+
numid, ilevel = self._get_numId_and_ilvl(paragraph)
|
|
905
|
+
|
|
906
|
+
if numid == 0:
|
|
907
|
+
numid = None
|
|
908
|
+
|
|
909
|
+
# Handle lists
|
|
910
|
+
if (
|
|
911
|
+
numid is not None
|
|
912
|
+
and ilevel is not None
|
|
913
|
+
and p_style_id not in ["Title", "Heading"]
|
|
914
|
+
):
|
|
915
|
+
# Check if this is actually a numbered list by examining the numFmt
|
|
916
|
+
is_numbered = self._is_numbered_list(numid, ilevel)
|
|
917
|
+
|
|
918
|
+
li = self._add_list_item(
|
|
919
|
+
doc=doc,
|
|
920
|
+
numid=numid,
|
|
921
|
+
ilevel=ilevel,
|
|
922
|
+
elements=paragraph_elements,
|
|
923
|
+
is_numbered=is_numbered,
|
|
924
|
+
)
|
|
925
|
+
elem_ref.extend(li) # MUST BE REF!!!
|
|
926
|
+
self._update_history(p_style_id, p_level, numid, ilevel)
|
|
927
|
+
return elem_ref
|
|
928
|
+
elif (
|
|
929
|
+
numid is None
|
|
930
|
+
and self._prev_numid() is not None
|
|
931
|
+
and p_style_id not in ["Title", "Heading"]
|
|
932
|
+
): # Close list
|
|
933
|
+
if self.level_at_new_list:
|
|
934
|
+
for key in range(len(self.parents)):
|
|
935
|
+
if key >= self.level_at_new_list:
|
|
936
|
+
self.parents[key] = None
|
|
937
|
+
self.level = self.level_at_new_list - 1
|
|
938
|
+
self.level_at_new_list = None
|
|
939
|
+
else:
|
|
940
|
+
for key in range(len(self.parents)):
|
|
941
|
+
self.parents[key] = None
|
|
942
|
+
self.level = 0
|
|
943
|
+
|
|
944
|
+
if p_style_id in ["Title"]:
|
|
945
|
+
for key in range(len(self.parents)):
|
|
946
|
+
self.parents[key] = None
|
|
947
|
+
te = doc.add_text(
|
|
948
|
+
parent=None,
|
|
949
|
+
label=DocItemLabel.TITLE,
|
|
950
|
+
text=text,
|
|
951
|
+
content_layer=self.content_layer,
|
|
952
|
+
)
|
|
953
|
+
self.parents[0] = te
|
|
954
|
+
elem_ref.append(te.get_ref())
|
|
955
|
+
elif "Heading" in p_style_id:
|
|
956
|
+
style_element = getattr(paragraph.style, "element", None)
|
|
957
|
+
if style_element is not None:
|
|
958
|
+
is_numbered_style = (
|
|
959
|
+
"<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
|
|
960
|
+
)
|
|
961
|
+
else:
|
|
962
|
+
is_numbered_style = False
|
|
963
|
+
h1 = self._add_heading(doc, p_level, text, is_numbered_style)
|
|
964
|
+
elem_ref.extend(h1)
|
|
965
|
+
|
|
966
|
+
elif len(equations) > 0:
|
|
967
|
+
if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
|
|
968
|
+
text
|
|
969
|
+
) > 0:
|
|
970
|
+
# Standalone equation
|
|
971
|
+
level = self._get_level()
|
|
972
|
+
t1 = doc.add_text(
|
|
973
|
+
label=DocItemLabel.FORMULA,
|
|
974
|
+
parent=self.parents[level - 1],
|
|
975
|
+
text=text.replace("<eq>", "").replace("</eq>", ""),
|
|
976
|
+
content_layer=self.content_layer,
|
|
977
|
+
)
|
|
978
|
+
elem_ref.append(t1.get_ref())
|
|
979
|
+
else:
|
|
980
|
+
# Inline equation
|
|
981
|
+
level = self._get_level()
|
|
982
|
+
inline_equation = doc.add_inline_group(
|
|
983
|
+
parent=self.parents[level - 1], content_layer=self.content_layer
|
|
984
|
+
)
|
|
985
|
+
elem_ref.append(inline_equation.get_ref())
|
|
986
|
+
text_tmp = text
|
|
987
|
+
for eq in equations:
|
|
988
|
+
if len(text_tmp) == 0:
|
|
989
|
+
break
|
|
990
|
+
|
|
991
|
+
split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1)
|
|
992
|
+
|
|
993
|
+
pre_eq_text = split_text_tmp[0]
|
|
994
|
+
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
|
|
995
|
+
|
|
996
|
+
if len(pre_eq_text) > 0:
|
|
997
|
+
e1 = doc.add_text(
|
|
998
|
+
label=DocItemLabel.TEXT,
|
|
999
|
+
parent=inline_equation,
|
|
1000
|
+
text=pre_eq_text,
|
|
1001
|
+
content_layer=self.content_layer,
|
|
1002
|
+
)
|
|
1003
|
+
elem_ref.append(e1.get_ref())
|
|
1004
|
+
e2 = doc.add_text(
|
|
1005
|
+
label=DocItemLabel.FORMULA,
|
|
1006
|
+
parent=inline_equation,
|
|
1007
|
+
text=eq.replace("<eq>", "").replace("</eq>", ""),
|
|
1008
|
+
content_layer=self.content_layer,
|
|
1009
|
+
)
|
|
1010
|
+
elem_ref.append(e2.get_ref())
|
|
1011
|
+
|
|
1012
|
+
if len(text_tmp) > 0:
|
|
1013
|
+
e3 = doc.add_text(
|
|
1014
|
+
label=DocItemLabel.TEXT,
|
|
1015
|
+
parent=inline_equation,
|
|
1016
|
+
text=text_tmp.strip(),
|
|
1017
|
+
content_layer=self.content_layer,
|
|
1018
|
+
)
|
|
1019
|
+
elem_ref.append(e3.get_ref())
|
|
1020
|
+
|
|
1021
|
+
elif p_style_id in [
|
|
1022
|
+
"Paragraph",
|
|
1023
|
+
"Normal",
|
|
1024
|
+
"Subtitle",
|
|
1025
|
+
"Author",
|
|
1026
|
+
"DefaultText",
|
|
1027
|
+
"ListParagraph",
|
|
1028
|
+
"ListBullet",
|
|
1029
|
+
"Quote",
|
|
1030
|
+
]:
|
|
1031
|
+
level = self._get_level()
|
|
1032
|
+
parent = self._create_or_reuse_parent(
|
|
1033
|
+
doc=doc,
|
|
1034
|
+
prev_parent=self.parents.get(level - 1),
|
|
1035
|
+
paragraph_elements=paragraph_elements,
|
|
1036
|
+
)
|
|
1037
|
+
for text, format, hyperlink in paragraph_elements:
|
|
1038
|
+
t2 = doc.add_text(
|
|
1039
|
+
label=DocItemLabel.TEXT,
|
|
1040
|
+
parent=parent,
|
|
1041
|
+
text=text,
|
|
1042
|
+
formatting=format,
|
|
1043
|
+
hyperlink=hyperlink,
|
|
1044
|
+
content_layer=self.content_layer,
|
|
1045
|
+
)
|
|
1046
|
+
elem_ref.append(t2.get_ref())
|
|
1047
|
+
|
|
1048
|
+
else:
|
|
1049
|
+
# Text style names can, and will have, not only default values but user values too
|
|
1050
|
+
# hence we treat all other labels as pure text
|
|
1051
|
+
level = self._get_level()
|
|
1052
|
+
parent = self._create_or_reuse_parent(
|
|
1053
|
+
doc=doc,
|
|
1054
|
+
prev_parent=self.parents.get(level - 1),
|
|
1055
|
+
paragraph_elements=paragraph_elements,
|
|
1056
|
+
)
|
|
1057
|
+
for text, format, hyperlink in paragraph_elements:
|
|
1058
|
+
t3 = doc.add_text(
|
|
1059
|
+
label=DocItemLabel.TEXT,
|
|
1060
|
+
parent=parent,
|
|
1061
|
+
text=text,
|
|
1062
|
+
formatting=format,
|
|
1063
|
+
hyperlink=hyperlink,
|
|
1064
|
+
content_layer=self.content_layer,
|
|
1065
|
+
)
|
|
1066
|
+
elem_ref.append(t3.get_ref())
|
|
1067
|
+
|
|
1068
|
+
self._update_history(p_style_id, p_level, numid, ilevel)
|
|
1069
|
+
return elem_ref
|
|
1070
|
+
|
|
1071
|
+
def _add_heading(
|
|
1072
|
+
self,
|
|
1073
|
+
doc: DoclingDocument,
|
|
1074
|
+
curr_level: Optional[int],
|
|
1075
|
+
text: str,
|
|
1076
|
+
is_numbered_style: bool = False,
|
|
1077
|
+
) -> list[RefItem]:
|
|
1078
|
+
elem_ref: list[RefItem] = []
|
|
1079
|
+
level = self._get_level()
|
|
1080
|
+
if isinstance(curr_level, int):
|
|
1081
|
+
if curr_level > level:
|
|
1082
|
+
# add invisible group
|
|
1083
|
+
for i in range(level, curr_level):
|
|
1084
|
+
gr1 = doc.add_group(
|
|
1085
|
+
parent=self.parents[i - 1],
|
|
1086
|
+
label=GroupLabel.SECTION,
|
|
1087
|
+
name=f"header-{i}",
|
|
1088
|
+
)
|
|
1089
|
+
elem_ref.append(gr1.get_ref())
|
|
1090
|
+
self.parents[i] = gr1
|
|
1091
|
+
|
|
1092
|
+
elif curr_level < level:
|
|
1093
|
+
# remove the tail
|
|
1094
|
+
for key in range(len(self.parents)):
|
|
1095
|
+
if key >= curr_level:
|
|
1096
|
+
self.parents[key] = None
|
|
1097
|
+
|
|
1098
|
+
current_level = curr_level
|
|
1099
|
+
parent_level = curr_level - 1
|
|
1100
|
+
add_level = curr_level
|
|
1101
|
+
else:
|
|
1102
|
+
current_level = self.level
|
|
1103
|
+
parent_level = self.level - 1
|
|
1104
|
+
add_level = 1
|
|
1105
|
+
|
|
1106
|
+
if is_numbered_style:
|
|
1107
|
+
if add_level in self.numbered_headers:
|
|
1108
|
+
self.numbered_headers[add_level] += 1
|
|
1109
|
+
else:
|
|
1110
|
+
self.numbered_headers[add_level] = 1
|
|
1111
|
+
text = f"{self.numbered_headers[add_level]} {text}"
|
|
1112
|
+
|
|
1113
|
+
# Reset deeper levels
|
|
1114
|
+
next_level = add_level + 1
|
|
1115
|
+
while next_level in self.numbered_headers:
|
|
1116
|
+
self.numbered_headers[next_level] = 0
|
|
1117
|
+
next_level += 1
|
|
1118
|
+
|
|
1119
|
+
# Scan upper levels
|
|
1120
|
+
previous_level = add_level - 1
|
|
1121
|
+
while previous_level in self.numbered_headers:
|
|
1122
|
+
# MSWord convention: no empty sublevels
|
|
1123
|
+
# I.e., sub-sub section (2.0.1) without a sub-section (2.1)
|
|
1124
|
+
# is processed as 2.1.1
|
|
1125
|
+
if self.numbered_headers[previous_level] == 0:
|
|
1126
|
+
self.numbered_headers[previous_level] += 1
|
|
1127
|
+
|
|
1128
|
+
text = f"{self.numbered_headers[previous_level]}.{text}"
|
|
1129
|
+
previous_level -= 1
|
|
1130
|
+
|
|
1131
|
+
hd = doc.add_heading(
|
|
1132
|
+
parent=self.parents[parent_level],
|
|
1133
|
+
text=text,
|
|
1134
|
+
level=add_level,
|
|
1135
|
+
)
|
|
1136
|
+
self.parents[current_level] = hd
|
|
1137
|
+
elem_ref.append(hd.get_ref())
|
|
1138
|
+
return elem_ref
|
|
1139
|
+
|
|
1140
|
+
def _add_formatted_list_item(
|
|
1141
|
+
self,
|
|
1142
|
+
doc: DoclingDocument,
|
|
1143
|
+
elements: list,
|
|
1144
|
+
marker: str,
|
|
1145
|
+
enumerated: bool,
|
|
1146
|
+
level: int,
|
|
1147
|
+
) -> list[RefItem]:
|
|
1148
|
+
elem_ref: list[RefItem] = []
|
|
1149
|
+
# This should not happen by construction
|
|
1150
|
+
if not isinstance(self.parents[level], ListGroup):
|
|
1151
|
+
_log.warning(
|
|
1152
|
+
"Parent element of the list item is not a ListGroup. The list item will be ignored."
|
|
1153
|
+
)
|
|
1154
|
+
return elem_ref
|
|
1155
|
+
if not elements:
|
|
1156
|
+
return elem_ref
|
|
1157
|
+
|
|
1158
|
+
if len(elements) == 1:
|
|
1159
|
+
text, format, hyperlink = elements[0]
|
|
1160
|
+
if text:
|
|
1161
|
+
doc.add_list_item(
|
|
1162
|
+
marker=marker,
|
|
1163
|
+
enumerated=enumerated,
|
|
1164
|
+
parent=self.parents[level],
|
|
1165
|
+
text=text,
|
|
1166
|
+
formatting=format,
|
|
1167
|
+
hyperlink=hyperlink,
|
|
1168
|
+
)
|
|
1169
|
+
else:
|
|
1170
|
+
new_item = doc.add_list_item(
|
|
1171
|
+
marker=marker,
|
|
1172
|
+
enumerated=enumerated,
|
|
1173
|
+
parent=self.parents[level],
|
|
1174
|
+
text="",
|
|
1175
|
+
)
|
|
1176
|
+
new_parent = doc.add_inline_group(parent=new_item)
|
|
1177
|
+
for text, format, hyperlink in elements:
|
|
1178
|
+
if text:
|
|
1179
|
+
doc.add_text(
|
|
1180
|
+
label=DocItemLabel.TEXT,
|
|
1181
|
+
parent=new_parent,
|
|
1182
|
+
text=text,
|
|
1183
|
+
formatting=format,
|
|
1184
|
+
hyperlink=hyperlink,
|
|
1185
|
+
content_layer=self.content_layer,
|
|
1186
|
+
)
|
|
1187
|
+
return elem_ref
|
|
1188
|
+
|
|
+    def _add_list_item(
+        self,
+        *,
+        doc: DoclingDocument,
+        numid: int,
+        ilevel: int,
+        elements: list,
+        is_numbered: bool = False,
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
+        # this method is always called with is_numbered. Numbered lists should be properly addressed.
+        if not elements:
+            return elem_ref
+        enum_marker = ""
+
+        level = self._get_level()
+        prev_indent = self._prev_indent()
+        if self._prev_numid() is None or (
+            self._prev_numid() == numid and self.level_at_new_list is None
+        ):  # Open new list
+            self.level_at_new_list = level
+
+            # Reset counters for the new numbering sequence
+            self._reset_list_counters_for_new_sequence(numid)
+
+            list_gr = doc.add_list_group(
+                name="list",
+                parent=self.parents[level - 1],
+                content_layer=self.content_layer,
+            )
+            self.parents[level] = list_gr
+            elem_ref.append(list_gr.get_ref())
+
+            # Set marker and enumerated arguments if this is an enumeration element.
+            if is_numbered:
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level
+            )
+        elif (
+            self._prev_numid() == numid
+            and self.level_at_new_list is not None
+            and prev_indent is not None
+            and prev_indent < ilevel
+        ):  # Open indented list
+            for i in range(
+                self.level_at_new_list + prev_indent + 1,
+                self.level_at_new_list + ilevel + 1,
+            ):
+                list_gr1 = doc.add_list_group(
+                    name="list",
+                    parent=self.parents[i - 1],
+                    content_layer=self.content_layer,
+                )
+                self.parents[i] = list_gr1
+                elem_ref.append(list_gr1.get_ref())
+
+            # TODO: Set marker and enumerated arguments if this is an enumeration element.
+            if is_numbered:
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
+            )
+        elif (
+            self._prev_numid() == numid
+            and self.level_at_new_list is not None
+            and prev_indent is not None
+            and ilevel < prev_indent
+        ):  # Close list
+            for k in self.parents:
+                if k > self.level_at_new_list + ilevel:
+                    self.parents[k] = None
+
+            # TODO: Set marker and enumerated arguments if this is an enumeration element.
+            if is_numbered:
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
+            )
+
+        elif self._prev_numid() == numid or prev_indent == ilevel:
+            # Set marker and enumerated arguments if this is an enumeration element.
+            if is_numbered:
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level - 1
+            )
+        else:
+            _log.warning("List item not matching any insert condition.")
+        return elem_ref
+
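
Note (not part of the diff): the numbered-list markers above come from `_get_list_counter` and `_reset_list_counters_for_new_sequence`, whose bodies are outside this hunk. The following is a standalone sketch of the kind of bookkeeping they imply; the per-(numid, level) dictionary below is an assumption for illustration, not the backend's actual implementation.

from collections import defaultdict

# Hypothetical illustration: one counter per (numbering id, indentation level),
# restarted whenever a new numbering sequence begins. Mirrors the
# `enum_marker = str(counter) + "."` pattern used in _add_list_item.
class ListCounters:
    def __init__(self) -> None:
        self._counters: dict[tuple[int, int], int] = defaultdict(int)

    def reset_sequence(self, numid: int) -> None:
        # Drop every counter that belongs to this numbering id.
        for key in [k for k in self._counters if k[0] == numid]:
            del self._counters[key]

    def next_marker(self, numid: int, ilevel: int) -> str:
        self._counters[(numid, ilevel)] += 1
        return f"{self._counters[(numid, ilevel)]}."

counters = ListCounters()
assert counters.next_marker(1, 0) == "1."
assert counters.next_marker(1, 0) == "2."
counters.reset_sequence(1)
assert counters.next_marker(1, 0) == "1."
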
+    @staticmethod
+    def _group_cell_elements(
+        group_name: str,
+        doc: DoclingDocument,
+        provs_in_cell: list[RefItem],
+        docling_table: TableItem,
+        content_layer: ContentLayer = ContentLayer.BODY,
+    ) -> RefItem:
+        group_element = doc.add_group(
+            label=GroupLabel.UNSPECIFIED,
+            name=group_name,
+            parent=docling_table,
+            content_layer=content_layer,
+        )
+        for prov in provs_in_cell:
+            group_element.children.append(prov)
+            pr_item = prov.resolve(doc)
+            item_parent = pr_item.parent.resolve(doc)
+            if pr_item.get_ref() in item_parent.children:
+                item_parent.children.remove(pr_item.get_ref())
+            pr_item.parent = group_element.get_ref()
+        ref_for_rich_cell = group_element.get_ref()
+        return ref_for_rich_cell
+
+    def _handle_tables(
+        self,
+        element: BaseOxmlElement,
+        doc: DoclingDocument,
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
+        table: Table = Table(element, self.docx_obj)
+        num_rows = len(table.rows)
+        num_cols = len(table.columns)
+        _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
+
+        if num_rows == 1 and num_cols == 1:
+            cell_element = table.rows[0].cells[0]
+            # In case we have a table of only 1 cell, we consider it furniture
+            # And proceed processing the content of the cell as though it's in the document body
+            self._walk_linear(cell_element._element, doc)
+            return elem_ref
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols)
+        level = self._get_level()
+        docling_table = doc.add_table(
+            data=data, parent=self.parents[level - 1], content_layer=self.content_layer
+        )
+        elem_ref.append(docling_table.get_ref())
+
+        cell_set: set[CT_Tc] = set()
+        for row_idx, row in enumerate(table.rows):
+            _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
+            col_idx = 0
+            while col_idx < num_cols:
+                # Handle merged cells: row may have fewer cells than num_cols
+                if col_idx >= len(row.cells):
+                    break
+                cell: _Cell = row.cells[col_idx]
+                _log.debug(
+                    f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
+                )
+                if cell is None or cell._tc in cell_set:
+                    _log.debug(" skipped since repeated content")
+                    col_idx += cell.grid_span
+                    continue
+                else:
+                    cell_set.add(cell._tc)
+
+                spanned_idx = row_idx
+                spanned_tc: Optional[CT_Tc] = cell._tc
+                while spanned_tc == cell._tc:
+                    spanned_idx += 1
+                    spanned_tc = (
+                        table.rows[spanned_idx].cells[col_idx]._tc
+                        if spanned_idx < num_rows
+                        else None
+                    )
+                _log.debug(f" spanned before row {spanned_idx}")
+
+                # Detect equations in cell text
+                text, equations = self._handle_equations_in_text(
+                    element=cell._element, text=cell.text
+                )
+                if len(equations) == 0:
+                    text = cell.text
+                else:
+                    text = text.replace("<eq>", "$").replace("</eq>", "$")
+
+                provs_in_cell: list[RefItem] = []
+                rich_table_cell: bool = self._is_rich_table_cell(cell)
+
+                if rich_table_cell:
+                    _, provs_in_cell = self._walk_linear(cell._element, doc)
+                _log.debug(f"Table cell {row_idx},{col_idx} rich? {rich_table_cell}")
+
+                if len(provs_in_cell) > 0:
+                    # Cell has multiple elements, we need to group them
+                    rich_table_cell = True
+                    group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
+                    ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements(
+                        group_name,
+                        doc,
+                        provs_in_cell,
+                        docling_table,
+                        content_layer=self.content_layer,
+                    )
+
+                if rich_table_cell:
+                    rich_cell = RichTableCell(
+                        text=text,
+                        row_span=spanned_idx - row_idx,
+                        col_span=cell.grid_span,
+                        start_row_offset_idx=row.grid_cols_before + row_idx,
+                        end_row_offset_idx=row.grid_cols_before + spanned_idx,
+                        start_col_offset_idx=col_idx,
+                        end_col_offset_idx=col_idx + cell.grid_span,
+                        column_header=row.grid_cols_before + row_idx == 0,
+                        row_header=False,
+                        ref=ref_for_rich_cell,  # points to an artificial group around children
+                    )
+                    doc.add_table_cell(table_item=docling_table, cell=rich_cell)
+                    col_idx += cell.grid_span
+                else:
+                    simple_cell = TableCell(
+                        text=text,
+                        row_span=spanned_idx - row_idx,
+                        col_span=cell.grid_span,
+                        start_row_offset_idx=row.grid_cols_before + row_idx,
+                        end_row_offset_idx=row.grid_cols_before + spanned_idx,
+                        start_col_offset_idx=col_idx,
+                        end_col_offset_idx=col_idx + cell.grid_span,
+                        column_header=row.grid_cols_before + row_idx == 0,
+                        row_header=False,
+                    )
+                    doc.add_table_cell(table_item=docling_table, cell=simple_cell)
+                    col_idx += cell.grid_span
+        return elem_ref
+
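
Note (not part of the diff): the merged-cell handling above relies on the fact that, in python-docx, all cells of a merged region share the same underlying `w:tc` element, so repeated `cell._tc` objects mark duplicates. A hedged standalone sketch of that idea; `sample.docx` is a hypothetical input with at least one table.

from docx import Document  # python-docx

doc = Document("sample.docx")  # hypothetical input file
table = doc.tables[0]

seen = set()
for row_idx, row in enumerate(table.rows):
    for col_idx, cell in enumerate(row.cells):
        if cell._tc in seen:
            # Same underlying <w:tc>: part of a merged region already emitted.
            continue
        seen.add(cell._tc)
        print(row_idx, col_idx, repr(cell.text))
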
+    def _has_blip(self, element: BaseOxmlElement) -> bool:
+        """Check if a docx element holds any BLIP as a child.
+
+        Args:
+            element: a docx element
+
+        Returns:
+            Whether the element contains a BLIP as a direct child.
+        """
+
+        for item in element:
+            if self.blip_xpath_expr(item):
+                return True
+            if item.findall(
+                ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
+            ):
+                return True
+
+        return False
+
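
Note (not part of the diff): a small standalone illustration of the kind of check `_has_blip` performs, i.e. looking for a DrawingML `a:blip` (or a `w:drawing` wrapper) under an element. It uses plain lxml and a hand-written, simplified XML fragment; the namespace map mirrors the standard WordprocessingML/DrawingML URIs, and the backend's `blip_xpath_expr` / `_BLIP_NAMESPACES` are assumed to be built on the same idea.

from lxml import etree

NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
}

# Simplified paragraph carrying an image reference (real DOCX nests more levels).
fragment = (
    '<w:p xmlns:w="{w}" xmlns:a="{a}">'
    "<w:r><w:drawing><a:blip/></w:drawing></w:r>"
    "</w:p>"
).format(**NS)

element = etree.fromstring(fragment)
has_blip = bool(element.findall(".//a:blip", namespaces=NS)) or bool(
    element.findall(".//w:drawing", namespaces=NS)
)
print(has_blip)  # True
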
+    def _is_rich_table_cell(self, cell: _Cell) -> bool:
+        """Determine whether a docx cell should be parsed as a Docling RichTableCell.
+
+        A docx cell can hold rich content and be parsed with a Docling RichTableCell.
+        However, this requires walking through the lxml elements and creating
+        node items. If the cell holds only plain text, the parsing is simpler
+        and using a TableCell is preferred.
+
+        Plain text means:
+        - The cell has only one paragraph
+        - The paragraph consists solely of runs with no run properties
+          (no need of Docling formatting).
+        - No other block-level elements are present inside the cell element.
+
+        Args:
+            cell: A docx cell
+
+        Returns:
+            Whether the docx cell should be parsed as RichTableCell
+        """
+        tc = cell._tc
+
+        # must contain only one paragraph
+        paragraphs = list(
+            tc.iterchildren(
+                "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p"
+            )
+        )
+        if len(paragraphs) > 1:
+            return True
+
+        # no other content
+        allowed_tags = {"p", "tcPr"}  # paragraph or table-cell properties
+        for child in tc:
+            tag = child.tag.split("}")[-1]
+            if tag not in allowed_tags:
+                return True
+        if self._has_blip(tc):
+            return True
+
+        # paragraph must contain runs with no run-properties
+        for para in paragraphs:
+            runs = list(
+                para.iterchildren(
+                    "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r"
+                )
+            )
+            for rn in runs:
+                item: Run = Run(rn, self.docx_obj)
+                if item is not None:
+                    fm = MsWordDocumentBackend._get_format_from_run(item)
+                    if fm != Formatting():
+                        return True
+
+        # All checks passed: plain text only
+        return False
+
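
Note (not part of the diff): the same plain-text heuristic can be exercised outside the backend on a raw `<w:tc>` fragment: one paragraph, only `w:p`/`w:tcPr` children, and no `w:rPr` on any run. A hedged, lxml-only sketch of that reduced check; the real method additionally consults python-docx `Run` formatting and `_has_blip`.

from lxml import etree

W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
NS = {"w": W}

# A cell with a single unformatted run: should count as plain text.
tc_xml = f'<w:tc xmlns:w="{W}"><w:tcPr/><w:p><w:r><w:t>plain</w:t></w:r></w:p></w:tc>'
tc = etree.fromstring(tc_xml)

paragraphs = tc.findall("w:p", namespaces=NS)
only_allowed = all(etree.QName(child).localname in {"p", "tcPr"} for child in tc)
no_run_props = not tc.findall(".//w:r/w:rPr", namespaces=NS)

is_rich = not (len(paragraphs) == 1 and only_allowed and no_run_props)
print(is_rich)  # False: plain text only
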
+    def _handle_pictures(
+        self, drawing_blip: Any, doc: DoclingDocument
+    ) -> list[RefItem]:
+        def get_docx_image(image: Any) -> Optional[bytes]:
+            image_data: Optional[bytes] = None
+            rId = image.get(
+                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
+            )
+            if rId in self.docx_obj.part.rels:
+                # Access the image part using the relationship ID
+                image_part = self.docx_obj.part.rels[rId].target_part
+                image_data = image_part.blob  # Get the binary image data
+            return image_data
+
+        elem_ref: list[RefItem] = []
+        if drawing_blip:
+            level = self._get_level()
+            # Open the BytesIO object with PIL to create an Image
+            parent: Optional[NodeItem] = (
+                self.parents[level - 1]
+                if len(drawing_blip) == 1
+                else doc.add_group(
+                    label=GroupLabel.PICTURE_AREA,
+                    parent=self.parents[level - 1],
+                    content_layer=self.content_layer,
+                )
+            )
+            for image in drawing_blip:
+                image_data: Optional[bytes] = get_docx_image(image)
+                if image_data is None:
+                    _log.warning("Warning: image cannot be found")
+                    p1 = doc.add_picture(
+                        parent=parent,
+                        caption=None,
+                        content_layer=self.content_layer,
+                    )
+                    elem_ref.append(p1.get_ref())
+                else:
+                    try:
+                        image_bytes = BytesIO(image_data)
+                        pil_image = Image.open(image_bytes)
+                        p2 = doc.add_picture(
+                            parent=parent,
+                            image=ImageRef.from_pil(image=pil_image, dpi=72),
+                            caption=None,
+                            content_layer=self.content_layer,
+                        )
+                        elem_ref.append(p2.get_ref())
+                    except (UnidentifiedImageError, OSError):
+                        _log.warning("Warning: image cannot be loaded by Pillow")
+                        p3 = doc.add_picture(
+                            parent=parent,
+                            caption=None,
+                            content_layer=self.content_layer,
+                        )
+                        elem_ref.append(p3.get_ref())
+        return elem_ref
+
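
Note (not part of the diff): the inner `get_docx_image` helper resolves the `r:embed` relationship id against the document part. A hedged standalone sketch that extracts every embedded image from a DOCX the same way, using python-docx relationship objects; `sample.docx` is a hypothetical input file.

from io import BytesIO

from docx import Document  # python-docx
from PIL import Image

doc = Document("sample.docx")  # hypothetical input file
for rel_id, rel in doc.part.rels.items():
    if rel.is_external or "image" not in rel.reltype:
        continue
    blob = rel.target_part.blob  # raw image bytes
    try:
        img = Image.open(BytesIO(blob))
        print(rel_id, img.format, img.size)
    except OSError:
        # Covers PIL.UnidentifiedImageError, which subclasses OSError.
        print(rel_id, "not loadable by Pillow")
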
+    def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
+        # 1) Make an empty copy of the original document
+        dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
+        body = dml_doc._element.body
+        for child in list(body):
+            body.remove(child)
+
+        # 2) Add DrawingML to empty document
+        new_para = dml_doc.add_paragraph()
+        new_r = new_para.add_run()
+        for dml in drawingml_els:
+            new_r._r.append(deepcopy(dml))
+
+        # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
+        level = self._get_level()
+        try:
+            pil_image = get_pil_from_dml_docx(
+                dml_doc, converter=self.docx_to_pdf_converter
+            )
+            if pil_image is None:
+                raise UnidentifiedImageError
+
+            doc.add_picture(
+                parent=self.parents[level - 1],
+                image=ImageRef.from_pil(image=pil_image, dpi=72),
+                caption=None,
+                content_layer=self.content_layer,
+            )
+        except (UnidentifiedImageError, OSError):
+            _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
+            doc.add_picture(
+                parent=self.parents[level - 1],
+                caption=None,
+                content_layer=self.content_layer,
+            )
+
+        return
+
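
Note (not part of the diff): step 3 above delegates to docling's `get_pil_from_dml_docx` helper and a configured `docx_to_pdf_converter`, neither of which is shown in this hunk. As a rough, hedged illustration of the same idea (render an isolated DOCX to PDF with an external converter, then rasterize the first page), one could do something like the following; the LibreOffice `soffice` CLI and pypdfium2 are assumptions about one possible converter setup, not what docling ships.

import subprocess
import tempfile
from pathlib import Path

import pypdfium2 as pdfium  # rasterizes PDF pages to PIL images


def docx_to_first_page_image(docx_path: str):
    """Convert a DOCX to PDF via LibreOffice, then render page 1 with pypdfium2."""
    with tempfile.TemporaryDirectory() as tmp:
        subprocess.run(
            ["soffice", "--headless", "--convert-to", "pdf", "--outdir", tmp, docx_path],
            check=True,
        )
        pdf_path = Path(tmp) / (Path(docx_path).stem + ".pdf")
        pdf = pdfium.PdfDocument(str(pdf_path))
        try:
            return pdf[0].render(scale=2.0).to_pil()
        finally:
            pdf.close()
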
+    def _add_header_footer(self, docx_obj: DocxDocument, doc: DoclingDocument) -> None:
+        """Add section headers and footers.
+
+        Headers and footers are added in the furniture content and only the text paragraphs
+        are parsed. The paragraphs are attached to a single group item for the header or the
+        footer. If the document has a section with new header and footer, they will be parsed
+        in new group items.
+
+        Args:
+            docx_obj: A docx Document object to be parsed.
+            doc: A DoclingDocument object to add the header and footer from docx_obj.
+        """
+        current_layer = self.content_layer
+        base_parent = self.parents[0]
+        self.content_layer = ContentLayer.FURNITURE
+        for sec_idx, section in enumerate(docx_obj.sections):
+            if sec_idx > 0 and not section.different_first_page_header_footer:
+                continue
+
+            hdr = (
+                section.first_page_header
+                if section.different_first_page_header_footer
+                else section.header
+            )
+            par = [txt for txt in (par.text.strip() for par in hdr.paragraphs) if txt]
+            tables = hdr.tables
+            has_blip = self._has_blip(hdr._element)
+            if par or tables or has_blip:
+                self.parents[0] = doc.add_group(
+                    label=GroupLabel.SECTION,
+                    name="page header",
+                    content_layer=self.content_layer,
+                )
+                self._walk_linear(hdr._element, doc)
+
+            ftr = (
+                section.first_page_footer
+                if section.different_first_page_header_footer
+                else section.footer
+            )
+            par = [txt for txt in (par.text.strip() for par in ftr.paragraphs) if txt]
+            tables = ftr.tables
+            has_blip = self._has_blip(ftr._element)
+            if par or tables or has_blip:
+                self.parents[0] = doc.add_group(
+                    label=GroupLabel.SECTION,
+                    name="page footer",
+                    content_layer=self.content_layer,
+                )
+                self._walk_linear(ftr._element, doc)
+
+        self.content_layer = current_layer
+        self.parents[0] = base_parent
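
Note (not part of the diff): the section, header, and footer objects used above come straight from python-docx. A short hedged sketch that walks the same attributes and prints the header and footer text per section; `sample.docx` is a hypothetical input and only text paragraphs are inspected.

from docx import Document  # python-docx

doc = Document("sample.docx")  # hypothetical input file
for idx, section in enumerate(doc.sections):
    hdr = (
        section.first_page_header
        if section.different_first_page_header_footer
        else section.header
    )
    ftr = (
        section.first_page_footer
        if section.different_first_page_header_footer
        else section.footer
    )
    header_text = [p.text.strip() for p in hdr.paragraphs if p.text.strip()]
    footer_text = [p.text.strip() for p in ftr.paragraphs if p.text.strip()]
    print(f"section {idx}: header={header_text} footer={footer_text}")
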