docling-2.69.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138)
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1663 @@
1
+ import logging
2
+ import re
3
+ from copy import deepcopy
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from typing import Any, Callable, Final, Optional, Union
7
+
8
+ from docling_core.types.doc import (
9
+ ContentLayer,
10
+ DocItemLabel,
11
+ DoclingDocument,
12
+ DocumentOrigin,
13
+ GroupLabel,
14
+ ImageRef,
15
+ ListGroup,
16
+ NodeItem,
17
+ RefItem,
18
+ RichTableCell,
19
+ TableCell,
20
+ TableData,
21
+ TableItem,
22
+ )
23
+ from docling_core.types.doc.document import Formatting, Script
24
+ from docx import Document
25
+ from docx.document import Document as DocxDocument
26
+ from docx.oxml.table import CT_Tc
27
+ from docx.oxml.xmlchemy import BaseOxmlElement
28
+ from docx.styles.style import ParagraphStyle
29
+ from docx.table import Table, _Cell
30
+ from docx.text.hyperlink import Hyperlink
31
+ from docx.text.paragraph import Paragraph
32
+ from docx.text.run import Run
33
+ from lxml import etree
34
+ from PIL import Image, UnidentifiedImageError
35
+ from pydantic import AnyUrl
36
+ from typing_extensions import override
37
+
38
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
39
+ from docling.backend.docx.drawingml.utils import (
40
+ get_docx_to_pdf_converter,
41
+ get_pil_from_dml_docx,
42
+ )
43
+ from docling.backend.docx.latex.omml import oMath2Latex
44
+ from docling.datamodel.base_models import InputFormat
45
+ from docling.datamodel.document import InputDocument
46
+
47
+ _log = logging.getLogger(__name__)
48
+
49
+
50
+ class MsWordDocumentBackend(DeclarativeDocumentBackend):
51
+ _BLIP_NAMESPACES: Final = {
52
+ "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
53
+ "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
54
+ "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
55
+ "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
56
+ "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
57
+ "v": "urn:schemas-microsoft-com:vml",
58
+ "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
59
+ "w10": "urn:schemas-microsoft-com:office:word",
60
+ "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
61
+ }
62
+
63
@override
def __init__(
    self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) -> None:
    """Set up the parser state and load the DOCX file.

    Args:
        in_doc: Input document descriptor, forwarded to the base backend.
        path_or_stream: Filesystem path or in-memory stream of the DOCX file.
    """
    super().__init__(in_doc, path_or_stream)

    # XML lookup helpers.
    self.XML_KEY = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
    self.xml_namespaces = {
        "w": "http://schemas.microsoft.com/office/word/2003/wordml"
    }
    self.blip_xpath_expr = etree.XPath(
        ".//a:blip", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
    )

    # Word file:
    self.path_or_stream: Union[BytesIO, Path] = path_or_stream
    self.valid: bool = False

    # Hierarchy bookkeeping: one parent slot per level, starting at -1.
    self.max_levels: int = 10
    self.level_at_new_list: Optional[int] = None
    self.parents: dict[int, Optional[NodeItem]] = {
        depth: None for depth in range(-1, self.max_levels)
    }
    self.numbered_headers: dict[int, int] = {}
    self.equation_bookends: str = "<eq>{EQ}</eq>"

    # Ids of textbox elements already handled, to avoid duplication.
    self.processed_textbox_elements: list[int] = []

    # DOCX-to-PDF converter state, resolved while walking the body.
    self.docx_to_pdf_converter: Optional[Callable] = None
    self.docx_to_pdf_converter_init = False
    self.display_drawingml_warning = True

    self.level = 0
    self.listIter = 0
    # List counters keyed by (numId, ilvl).
    self.list_counters: dict[tuple[int, int], int] = {}
    # Starting content layer.
    self.content_layer = ContentLayer.BODY

    # Every history column starts with a single ``None`` entry.
    self.history: dict[str, Any] = {
        column: [None] for column in ("names", "levels", "numids", "indents")
    }

    self.docx_obj = self.load_msword_file(
        path_or_stream=self.path_or_stream, document_hash=self.document_hash
    )
    if self.docx_obj:
        self.valid = True
115
+
116
+ @override
117
+ def is_valid(self) -> bool:
118
+ return self.valid
119
+
120
+ @classmethod
121
+ @override
122
+ def supports_pagination(cls) -> bool:
123
+ return False
124
+
125
+ @override
126
+ def unload(self):
127
+ if isinstance(self.path_or_stream, BytesIO):
128
+ self.path_or_stream.close()
129
+
130
+ self.path_or_stream = None
131
+
132
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
    """Return the input formats handled by this backend (DOCX only)."""
    formats = {InputFormat.DOCX}
    return formats
136
+
137
@override
def convert(self) -> DoclingDocument:
    """Parses the DOCX into a structured document model.

    Returns:
        The parsed document.

    Raises:
        RuntimeError: If the backend is not valid.
    """
    origin = DocumentOrigin(
        filename=self.file.name or "file",
        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        binary_hash=self.document_hash,
    )
    doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

    if not self.is_valid():
        raise RuntimeError(
            f"Cannot convert doc with {self.document_hash} because the backend failed to init."
        )

    assert self.docx_obj is not None
    # Body content first, then headers and footers.
    doc, _ = self._walk_linear(self.docx_obj.element.body, doc)
    self._add_header_footer(self.docx_obj, doc)
    return doc
162
+
163
@staticmethod
def load_msword_file(
    path_or_stream: Union[BytesIO, Path], document_hash: str
) -> Optional[DocxDocument]:
    """Open a DOCX file from a path or an in-memory stream.

    Args:
        path_or_stream: ``BytesIO`` stream or ``Path`` of the file to open.
        document_hash: Hash of the document, used only in the error message.

    Returns:
        The loaded document, or ``None`` when ``path_or_stream`` is neither
        a ``BytesIO`` nor a ``Path``.

    Raises:
        RuntimeError: If loading fails; the original exception is chained.
    """
    try:
        if isinstance(path_or_stream, BytesIO):
            return Document(path_or_stream)
        if isinstance(path_or_stream, Path):
            return Document(str(path_or_stream))
    except Exception as e:
        raise RuntimeError(
            f"MsWordDocumentBackend could not load document with hash {document_hash}"
        ) from e
    # Unsupported input type: the caller treats a falsy result as "not valid".
    return None
178
+
179
+ def _update_history(
180
+ self,
181
+ name: str,
182
+ level: Optional[int],
183
+ numid: Optional[int],
184
+ ilevel: Optional[int],
185
+ ):
186
+ self.history["names"].append(name)
187
+ self.history["levels"].append(level)
188
+
189
+ self.history["numids"].append(numid)
190
+ self.history["indents"].append(ilevel)
191
+
192
+ def _prev_name(self) -> Optional[str]:
193
+ return self.history["names"][-1]
194
+
195
+ def _prev_level(self) -> Optional[int]:
196
+ return self.history["levels"][-1]
197
+
198
+ def _prev_numid(self) -> Optional[int]:
199
+ return self.history["numids"][-1]
200
+
201
+ def _prev_indent(self) -> Optional[int]:
202
+ return self.history["indents"][-1]
203
+
204
+ def _get_level(self) -> int:
205
+ """Return the first None index."""
206
+ for k, v in self.parents.items():
207
+ if k >= 0 and v is None:
208
+ return k
209
+ return 0
210
+
211
def _walk_linear(
    self,
    body: BaseOxmlElement,
    doc: DoclingDocument,
    # parent:
) -> tuple[DoclingDocument, list[RefItem]]:
    """Walk the direct children of ``body`` in order and add them to ``doc``.

    Each child is first scanned for textbox or shape text (unless its id was
    already recorded as processed), then dispatched by kind: table, picture
    (plus the paragraph text when present), DrawingML, structured document
    tag (``sdt``) or plain paragraph. Other tags are ignored with a debug
    log.

    Args:
        body: XML element whose direct children are processed.
        doc: Document that receives the converted content.

    Returns:
        ``doc`` and the list of references collected while walking.
    """
    namespaces = MsWordDocumentBackend._BLIP_NAMESPACES

    # Compile the XPath expressions once per call instead of once per child.
    # Modern Word textboxes
    txbx_xpath = etree.XPath(
        ".//w:txbxContent|.//v:textbox//w:p",
        namespaces=namespaces,
    )
    # Textboxes in alternate DrawingML and VML formats
    alt_txbx_xpath = etree.XPath(
        ".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
        namespaces=namespaces,
    )
    # Shape text that is not in a standard textbox
    shape_text_xpath = etree.XPath(
        ".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
        namespaces=namespaces,
    )

    added_elements: list[RefItem] = []
    for element in body:
        tag_name = etree.QName(element).localname
        # Check for Inline Images (blip elements)
        drawing_blip = self.blip_xpath_expr(element)
        drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)

        # Check for textbox content - check multiple textbox formats
        # Only process if the element hasn't been processed before
        element_id = id(element)
        if element_id not in self.processed_textbox_elements:
            textbox_elements = txbx_xpath(element)

            # No modern textboxes found, check for alternate/legacy textbox formats
            if not textbox_elements and tag_name in ["drawing", "pict"]:
                textbox_elements = alt_txbx_xpath(element)

                if not textbox_elements:
                    shape_text_elements = shape_text_xpath(element)
                    if shape_text_elements:
                        # Create custom text elements from shape text
                        text_content = " ".join(
                            [t.text for t in shape_text_elements if t.text]
                        )
                        if text_content.strip():
                            _log.debug(f"Found shape text: {text_content[:50]}...")
                            level = self._get_level()
                            shape_group = doc.add_group(
                                label=GroupLabel.SECTION,
                                parent=self.parents[level - 1],
                                name="shape-text",
                                content_layer=self.content_layer,
                            )
                            added_elements.append(shape_group.get_ref())
                            doc.add_text(
                                label=DocItemLabel.TEXT,
                                parent=shape_group,
                                text=text_content,
                                content_layer=self.content_layer,
                            )

            if textbox_elements:
                # Mark the parent element as processed
                self.processed_textbox_elements.append(element_id)
                # Also mark all found textbox elements as processed
                for tb_element in textbox_elements:
                    self.processed_textbox_elements.append(id(tb_element))

                _log.debug(
                    f"Found textbox content with {len(textbox_elements)} elements"
                )
                tbc = self._handle_textbox_content(textbox_elements, doc)
                added_elements.extend(tbc)

        # Check for Tables
        if tag_name == "tbl":
            try:
                t = self._handle_tables(element, doc)
                added_elements.extend(t)
            except Exception:
                # Best effort: a broken table must not abort the conversion,
                # but keep the traceback available for debugging.
                _log.debug(
                    "could not parse a table, broken docx table", exc_info=True
                )
        # Check for Image
        elif drawing_blip:
            pics = self._handle_pictures(drawing_blip, doc)
            added_elements.extend(pics)
            # Check for Text after the Image
            if (
                tag_name == "p"
                and element.find(".//w:t", namespaces=namespaces) is not None
            ):
                te1 = self._handle_text_elements(element, doc)
                added_elements.extend(te1)
        # Check for DrawingML elements
        elif drawingml_els:
            # Resolve the converter only once per backend instance.
            if (
                self.docx_to_pdf_converter is None
                and self.docx_to_pdf_converter_init is False
            ):
                self.docx_to_pdf_converter = get_docx_to_pdf_converter()
                self.docx_to_pdf_converter_init = True

            if self.docx_to_pdf_converter is None:
                # Warn only the first time a DrawingML element is skipped.
                if self.display_drawingml_warning:
                    _log.warning(
                        "Found DrawingML elements in document, but no DOCX to PDF converters. "
                        "If you want these exported, make sure you have "
                        "LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
                    )
                    self.display_drawingml_warning = False
            else:
                self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
        # Check for the sdt containers, like table of contents
        elif tag_name == "sdt":
            sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
            if sdt_content is not None:
                # Iterate paragraphs, runs, or text inside <w:sdtContent>.
                paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
                for p in paragraphs:
                    te = self._handle_text_elements(p, doc)
                    added_elements.extend(te)
        # Check for Text
        elif tag_name == "p":
            # "tcPr", "sectPr"
            te = self._handle_text_elements(element, doc)
            added_elements.extend(te)
        else:
            _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")

    return doc, added_elements
352
+
353
+ def _str_to_int(
354
+ self, s: Optional[str], default: Optional[int] = 0
355
+ ) -> Optional[int]:
356
+ if s is None:
357
+ return None
358
+ try:
359
+ return int(s)
360
+ except ValueError:
361
+ return default
362
+
363
+ def _split_text_and_number(self, input_string: str) -> list[str]:
364
+ match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
365
+ if match:
366
+ parts = list(filter(None, match.groups()))
367
+ return parts
368
+ else:
369
+ return [input_string]
370
+
371
def _get_numId_and_ilvl(
    self, paragraph: Paragraph
) -> tuple[Optional[int], Optional[int]]:
    """Read the list numbering id and indent level of a paragraph.

    Returns:
        ``(numId, ilvl)`` as integers where present and parseable;
        ``(None, None)`` when the paragraph has no ``w:numPr`` element.
    """
    p_xml = paragraph._element
    numPr = p_xml.find(".//w:numPr", namespaces=p_xml.nsmap)
    if numPr is None:
        # The paragraph is not part of a list.
        return None, None

    raw_values = []
    for child_tag in ("w:numId", "w:ilvl"):
        child = numPr.find(child_tag, namespaces=p_xml.nsmap)
        raw_values.append(child.get(self.XML_KEY) if child is not None else None)

    numId, ilvl = raw_values
    return self._str_to_int(numId, None), self._str_to_int(ilvl, None)
389
+
390
+ def _get_list_counter(self, numid: int, ilvl: int) -> int:
391
+ """Get and increment the counter for a specific numId and ilvl combination."""
392
+ key = (numid, ilvl)
393
+ if key not in self.list_counters:
394
+ self.list_counters[key] = 0
395
+ self.list_counters[key] += 1
396
+ return self.list_counters[key]
397
+
398
+ def _reset_list_counters_for_new_sequence(self, numid: int):
399
+ """Reset counters when starting a new numbering sequence."""
400
+ # Reset all counters for this numid
401
+ keys_to_reset = [key for key in self.list_counters.keys() if key[0] == numid]
402
+ for key in keys_to_reset:
403
+ self.list_counters[key] = 0
404
+
405
+ def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
406
+ """Check if a list is numbered based on its numFmt value."""
407
+ try:
408
+ # Access the numbering part of the document
409
+ if not hasattr(self.docx_obj, "part") or not hasattr(
410
+ self.docx_obj.part, "package"
411
+ ):
412
+ return False
413
+
414
+ numbering_part = None
415
+ # Find the numbering part
416
+ for part in self.docx_obj.part.package.parts:
417
+ if "numbering" in part.partname:
418
+ numbering_part = part
419
+ break
420
+
421
+ if numbering_part is None:
422
+ return False
423
+
424
+ # Parse the numbering XML
425
+ numbering_root = numbering_part.element
426
+ namespaces = {
427
+ "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
428
+ }
429
+
430
+ # Find the numbering definition with the given numId
431
+ num_xpath = f".//w:num[@w:numId='{numId}']"
432
+ num_element = numbering_root.find(num_xpath, namespaces=namespaces)
433
+
434
+ if num_element is None:
435
+ return False
436
+
437
+ # Get the abstractNumId from the num element
438
+ abstract_num_id_elem = num_element.find(
439
+ ".//w:abstractNumId", namespaces=namespaces
440
+ )
441
+ if abstract_num_id_elem is None:
442
+ return False
443
+
444
+ abstract_num_id = abstract_num_id_elem.get(
445
+ "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
446
+ )
447
+ if abstract_num_id is None:
448
+ return False
449
+
450
+ # Find the abstract numbering definition
451
+ abstract_num_xpath = (
452
+ f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']"
453
+ )
454
+ abstract_num_element = numbering_root.find(
455
+ abstract_num_xpath, namespaces=namespaces
456
+ )
457
+
458
+ if abstract_num_element is None:
459
+ return False
460
+
461
+ # Find the level definition for the given ilvl
462
+ lvl_xpath = f".//w:lvl[@w:ilvl='{ilvl}']"
463
+ lvl_element = abstract_num_element.find(lvl_xpath, namespaces=namespaces)
464
+
465
+ if lvl_element is None:
466
+ return False
467
+
468
+ # Get the numFmt element
469
+ num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
470
+ if num_fmt_element is None:
471
+ return False
472
+
473
+ num_fmt = num_fmt_element.get(
474
+ "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
475
+ )
476
+
477
+ # Numbered formats include: decimal, lowerRoman, upperRoman, lowerLetter, upperLetter
478
+ # Bullet formats include: bullet
479
+ numbered_formats = {
480
+ "decimal",
481
+ "lowerRoman",
482
+ "upperRoman",
483
+ "lowerLetter",
484
+ "upperLetter",
485
+ "decimalZero",
486
+ }
487
+
488
+ return num_fmt in numbered_formats
489
+
490
+ except Exception as e:
491
+ _log.debug(f"Error determining if list is numbered: {e}")
492
+ return False
493
+
494
+ def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
495
+ parts = self._split_text_and_number(style_label)
496
+
497
+ if len(parts) == 2:
498
+ parts.sort()
499
+ label_str: str = ""
500
+ label_level: Optional[int] = 0
501
+ if parts[0].strip().lower() == "heading":
502
+ label_str = "Heading"
503
+ label_level = self._str_to_int(parts[1], None)
504
+ if parts[1].strip().lower() == "heading":
505
+ label_str = "Heading"
506
+ label_level = self._str_to_int(parts[0], None)
507
+ return label_str, label_level
508
+
509
+ return style_label, None
510
+
511
def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
    """Determine the style label and optional level of a paragraph.

    Returns:
        ``("Normal", None)`` when the paragraph has no style or an empty
        style id; ``(prefix, number)`` for a ``"prefix:number"`` style id;
        the result of ``_get_heading_and_level`` when the style id, style
        name, base style id or base style name (checked in that order)
        contains "heading"; otherwise ``(style_id, None)``.
    """
    style = paragraph.style
    if style is None:
        return "Normal", None

    label: str = style.style_id
    name: str = style.name or ""

    base_style_label: Optional[str] = None
    base_style_name: Optional[str] = None
    base_style = getattr(style, "base_style", None)
    if isinstance(base_style, ParagraphStyle):
        base_style_label, base_style_name = base_style.style_id, base_style.name

    if not label:
        return "Normal", None

    if ":" in label:
        pieces = label.split(":")
        if len(pieces) == 2:
            return pieces[0], self._str_to_int(pieces[1], None)

    # First candidate that mentions "heading" wins.
    for candidate in (label, name, base_style_label, base_style_name):
        if candidate and "heading" in candidate.lower():
            return self._get_heading_and_level(candidate)

    return label, None
543
+
544
@classmethod
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
    """Build a ``Formatting`` object from the properties of a run.

    Bold, italic, strikethrough and underline are coerced to booleans;
    subscript takes precedence over superscript for the script value.
    """
    font = run.font
    if font.subscript:
        script = Script.SUB
    elif font.superscript:
        script = Script.SUPER
    else:
        script = Script.BASELINE

    return Formatting(
        bold=run.bold or False,
        italic=run.italic or False,
        # .underline may be None, a bool or an enum value such as
        # WD_UNDERLINE.THICK, so collapse it to a plain boolean.
        underline=bool(run.underline),
        strikethrough=font.strike or False,
        script=script,
    )
564
+
565
def _get_paragraph_elements(self, paragraph: Paragraph):
    """Extract paragraph elements along with their formatting and hyperlink.

    Consecutive runs are merged into one group until a run with non-blank
    text and a different formatting, or a hyperlink, is met. Hyperlinks are
    always emitted as their own element.

    Args:
        paragraph: Paragraph whose inner content is inspected.

    Returns:
        A list of ``(text, formatting, hyperlink)`` tuples. A paragraph
        with only whitespace yields ``[("", None, None)]``.
    """
    # for now retain empty paragraphs for backwards compatibility:
    if paragraph.text.strip() == "":
        return [("", None, None)]

    paragraph_elements: list[
        tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]]
    ] = []
    group_text = ""
    previous_format = None

    # Iterate over the runs of the paragraph and group them by format
    for c in paragraph.iter_inner_content():
        if isinstance(c, Hyperlink):
            text = c.text
            hyperlink = Path(c.address)
            # A hyperlink takes the formatting of its first run, if any.
            run_format = self._get_format_from_run(c.runs[0]) if c.runs else None
        elif isinstance(c, Run):
            text = c.text
            hyperlink = None
            run_format = self._get_format_from_run(c)
        else:
            continue

        if (text.strip() and run_format != previous_format) or hyperlink is not None:
            # If the style changes for a non empty text, add the previous group
            if group_text.strip():
                paragraph_elements.append(
                    (group_text.strip(), previous_format, None)
                )
            group_text = ""

            # If there is a hyperlink, add it immediately
            if hyperlink is not None:
                paragraph_elements.append((text.strip(), run_format, hyperlink))
                text = ""
            else:
                previous_format = run_format

        group_text += text

    # Flush the last group with the formatting of the group itself. The
    # formatting of the last run seen may differ (e.g. a trailing
    # whitespace-only run), so it must not be used here.
    if group_text.strip():
        paragraph_elements.append((group_text.strip(), previous_format, None))

    return paragraph_elements
621
+
622
def _get_paragraph_position(self, paragraph_element):
    """Extract vertical position information from paragraph element.

    Tries, in order: the index of the element among its ``p`` siblings,
    position-like attributes on the element and its ancestors, a ``top``
    value in the ``style`` attribute when a VML namespace is in scope, and
    finally the XML source line number.

    Returns:
        An index, a float position, a source line number, or ``None``.
    """
    # 1. Index among sibling paragraphs, for consistent ordering.
    parent = (
        paragraph_element.getparent()
        if hasattr(paragraph_element, "getparent")
        else None
    )
    if parent is not None:
        sibling_paragraphs = [
            child
            for child in parent.getchildren()
            if etree.QName(child).localname == "p"
        ]
        if paragraph_element in sibling_paragraphs:
            return sibling_paragraphs.index(paragraph_element)

    # 2. Position hints on the element itself and then on each ancestor.
    for node in [paragraph_element, *paragraph_element.iterancestors()]:
        # Direct position attributes
        for attr_name in ["y", "top", "positionY", "y-position", "position"]:
            raw = node.get(attr_name)
            if not raw:
                continue
            try:
                # Remove any non-numeric characters (like 'pt', 'px', etc.)
                digits = re.sub(r"[^0-9.]", "", raw)
                if digits:
                    return float(digits)
            except (ValueError, TypeError):
                pass

        # Translation component of a transform attribute
        transform = node.get("transform")
        if transform:
            translate = re.search(r"translate\([^,]+,\s*([0-9.]+)", transform)
            if translate:
                try:
                    return float(translate.group(1))
                except ValueError:
                    pass

        # Anchors or relative position indicators: fall back to the XML
        # source line number of the node carrying them.
        for attr_name in ["distT", "distB", "anchor", "relativeFrom"]:
            if node.get(attr_name) is not None:
                return node.sourceline

    # 3. VML shapes: read "top" from the style attribute.
    if any("vml" in ns_uri for ns_uri in paragraph_element.nsmap.values()):
        style = paragraph_element.get("style")
        if style:
            top = re.search(r"top:([0-9.]+)pt", style)
            if top:
                try:
                    return float(top.group(1))
                except ValueError:
                    pass

    # 4. XML source line number as a proxy for order.
    if hasattr(paragraph_element, "sourceline"):
        return paragraph_element.sourceline
    return None
691
+
692
+ def _collect_textbox_paragraphs(self, textbox_elements):
693
+ """Collect and organize paragraphs from textbox elements."""
694
+ processed_paragraphs = []
695
+ container_paragraphs = {}
696
+
697
+ for element in textbox_elements:
698
+ element_id = id(element)
699
+ # Skip if we've already processed this exact element
700
+ if element_id in processed_paragraphs:
701
+ continue
702
+
703
+ tag_name = etree.QName(element).localname
704
+ processed_paragraphs.append(element_id)
705
+
706
+ # Handle paragraphs directly found (VML textboxes)
707
+ if tag_name == "p":
708
+ # Find the containing textbox or shape element
709
+ container_id = None
710
+ for ancestor in element.iterancestors():
711
+ if any(ns in ancestor.tag for ns in ["textbox", "shape", "txbx"]):
712
+ container_id = id(ancestor)
713
+ break
714
+
715
+ if container_id not in container_paragraphs:
716
+ container_paragraphs[container_id] = []
717
+ container_paragraphs[container_id].append(
718
+ (element, self._get_paragraph_position(element))
719
+ )
720
+
721
+ # Handle txbxContent elements (Word DrawingML textboxes)
722
+ elif tag_name == "txbxContent":
723
+ paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
724
+ container_id = id(element)
725
+ if container_id not in container_paragraphs:
726
+ container_paragraphs[container_id] = []
727
+
728
+ for p in paragraphs:
729
+ p_id = id(p)
730
+ if p_id not in processed_paragraphs:
731
+ processed_paragraphs.append(p_id)
732
+ container_paragraphs[container_id].append(
733
+ (p, self._get_paragraph_position(p))
734
+ )
735
+ else:
736
+ # Try to extract any paragraphs from unknown elements
737
+ paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
738
+ container_id = id(element)
739
+ if container_id not in container_paragraphs:
740
+ container_paragraphs[container_id] = []
741
+
742
+ for p in paragraphs:
743
+ p_id = id(p)
744
+ if p_id not in processed_paragraphs:
745
+ processed_paragraphs.append(p_id)
746
+ container_paragraphs[container_id].append(
747
+ (p, self._get_paragraph_position(p))
748
+ )
749
+
750
+ return container_paragraphs
751
+
752
+ def _handle_textbox_content(
753
+ self,
754
+ textbox_elements: list,
755
+ doc: DoclingDocument,
756
+ ) -> list[RefItem]:
757
+ elem_ref: list[RefItem] = []
758
+ """Process textbox content and add it to the document structure."""
759
+ level = self._get_level()
760
+ # Create a textbox group to contain all text from the textbox
761
+ textbox_group = doc.add_group(
762
+ label=GroupLabel.SECTION,
763
+ parent=self.parents[level - 1],
764
+ name="textbox",
765
+ content_layer=self.content_layer,
766
+ )
767
+ elem_ref.append(textbox_group.get_ref())
768
+ # Set this as the current parent to ensure textbox content
769
+ # is properly nested in document structure
770
+ original_parent = self.parents[level]
771
+ self.parents[level] = textbox_group
772
+
773
+ # Collect and organize paragraphs
774
+ container_paragraphs = self._collect_textbox_paragraphs(textbox_elements)
775
+
776
+ # Process all paragraphs
777
+ all_paragraphs = []
778
+
779
+ # Sort paragraphs within each container, then process containers
780
+ for paragraphs in container_paragraphs.values():
781
+ # Sort by vertical position within each container
782
+ sorted_container_paragraphs = sorted(
783
+ paragraphs,
784
+ key=lambda x: (
785
+ x[1] is None,
786
+ x[1] if x[1] is not None else float("inf"),
787
+ ),
788
+ )
789
+
790
+ # Add the sorted paragraphs to our processing list
791
+ all_paragraphs.extend(sorted_container_paragraphs)
792
+
793
+ # Track processed paragraphs to avoid duplicates (same content and position)
794
+ processed_paragraphs = set()
795
+
796
+ # Process all the paragraphs
797
+ for p, position in all_paragraphs:
798
+ # Create paragraph object to get text content
799
+ paragraph = Paragraph(p, self.docx_obj)
800
+ text_content = paragraph.text
801
+
802
+ # Create a unique identifier based on content and position
803
+ paragraph_id = (text_content, position)
804
+
805
+ # Skip if this paragraph (same content and position) was already processed
806
+ if paragraph_id in processed_paragraphs:
807
+ _log.debug(
808
+ f"Skipping duplicate paragraph: content='{text_content[:50]}...', position={position}"
809
+ )
810
+ continue
811
+
812
+ # Mark this paragraph as processed
813
+ processed_paragraphs.add(paragraph_id)
814
+
815
+ elem_ref.extend(self._handle_text_elements(p, doc))
816
+
817
+ # Restore original parent
818
+ self.parents[level] = original_parent
819
+ return elem_ref
820
+
821
+ def _handle_equations_in_text(self, element, text):
822
+ only_texts = []
823
+ only_equations = []
824
+ texts_and_equations = []
825
+ for subt in element.iter():
826
+ tag_name = etree.QName(subt).localname
827
+ if tag_name == "t" and "math" not in subt.tag:
828
+ if isinstance(subt.text, str):
829
+ only_texts.append(subt.text)
830
+ texts_and_equations.append(subt.text)
831
+ elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
832
+ latex_equation = str(oMath2Latex(subt)).strip()
833
+ if len(latex_equation) > 0:
834
+ only_equations.append(
835
+ self.equation_bookends.format(EQ=latex_equation)
836
+ )
837
+ texts_and_equations.append(
838
+ self.equation_bookends.format(EQ=latex_equation)
839
+ )
840
+
841
+ if len(only_equations) < 1:
842
+ return text, []
843
+
844
+ if (
845
+ re.sub(r"\s+", "", "".join(only_texts)).strip()
846
+ != re.sub(r"\s+", "", text).strip()
847
+ ):
848
+ # If we are not able to reconstruct the initial raw text
849
+ # do not try to parse equations and return the original
850
+ return text, []
851
+
852
+ # Insert equations into original text
853
+ # This is done to preserve white space structure
854
+ output_text = text[:]
855
+ init_i = 0
856
+ for i_substr, substr in enumerate(texts_and_equations):
857
+ if len(substr) == 0:
858
+ continue
859
+
860
+ if substr in output_text[init_i:]:
861
+ init_i += output_text[init_i:].find(substr) + len(substr)
862
+ else:
863
+ if i_substr > 0:
864
+ output_text = output_text[:init_i] + substr + output_text[init_i:]
865
+ init_i += len(substr)
866
+ else:
867
+ output_text = substr + output_text
868
+
869
+ return output_text, only_equations
870
+
871
+ def _create_or_reuse_parent(
872
+ self,
873
+ *,
874
+ doc: DoclingDocument,
875
+ prev_parent: Optional[NodeItem],
876
+ paragraph_elements: list,
877
+ ) -> Optional[NodeItem]:
878
+ return (
879
+ doc.add_inline_group(parent=prev_parent, content_layer=self.content_layer)
880
+ if len(paragraph_elements) > 1
881
+ else prev_parent
882
+ )
883
+
884
def _handle_text_elements(
    self,
    element: BaseOxmlElement,
    doc: DoclingDocument,
) -> list[RefItem]:
    """Convert one paragraph element into document items.

    The paragraph is dispatched, in this order, as: a list item (it has a
    numbering id and indent level and is not a title/heading), a title, a
    heading, a paragraph containing equations, or plain text.

    Args:
        element: The XML element of the paragraph.
        doc: The document being built.

    Returns:
        References to the items created for this paragraph.
    """
    elem_ref: list[RefItem] = []
    paragraph = Paragraph(element, self.docx_obj)
    paragraph_elements = self._get_paragraph_elements(paragraph)
    text, equations = self._handle_equations_in_text(
        element=element, text=paragraph.text
    )

    if text is None:
        return elem_ref
    text = text.strip()

    # Common styles for bullet and numbered lists:
    # "List Bullet", "List Number", "List Paragraph"
    p_style_id, p_level = self._get_label_and_level(paragraph)
    numid, ilevel = self._get_numId_and_ilvl(paragraph)

    if numid == 0:
        numid = None

    is_title_or_heading = p_style_id in ["Title", "Heading"]

    # Handle lists
    if numid is not None and ilevel is not None and not is_title_or_heading:
        # Check if this is actually a numbered list by examining the numFmt
        is_numbered = self._is_numbered_list(numid, ilevel)

        li = self._add_list_item(
            doc=doc,
            numid=numid,
            ilevel=ilevel,
            elements=paragraph_elements,
            is_numbered=is_numbered,
        )
        elem_ref.extend(li)  # MUST BE REF!!!
        self._update_history(p_style_id, p_level, numid, ilevel)
        return elem_ref

    # Close list: the previous paragraph was a list item, this one is not
    if numid is None and self._prev_numid() is not None and not is_title_or_heading:
        if self.level_at_new_list:
            for key in range(len(self.parents)):
                if key >= self.level_at_new_list:
                    self.parents[key] = None
            self.level = self.level_at_new_list - 1
            self.level_at_new_list = None
        else:
            for key in range(len(self.parents)):
                self.parents[key] = None
            self.level = 0

    if p_style_id in ["Title"]:
        # A title resets the whole parent stack and becomes the new root
        for key in range(len(self.parents)):
            self.parents[key] = None
        te = doc.add_text(
            parent=None,
            label=DocItemLabel.TITLE,
            text=text,
            content_layer=self.content_layer,
        )
        self.parents[0] = te
        elem_ref.append(te.get_ref())

    elif "Heading" in p_style_id:
        style_element = getattr(paragraph.style, "element", None)
        # The heading is numbered when either its style or the paragraph
        # itself carries numbering properties.
        is_numbered_style = style_element is not None and (
            "<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
        )
        h1 = self._add_heading(doc, p_level, text, is_numbered_style)
        elem_ref.extend(h1)

    elif len(equations) > 0:
        level = self._get_level()
        raw_is_blank = paragraph.text is None or len(paragraph.text.strip()) == 0
        if raw_is_blank and len(text) > 0:
            # Standalone equation
            t1 = doc.add_text(
                label=DocItemLabel.FORMULA,
                parent=self.parents[level - 1],
                text=text.replace("<eq>", "").replace("</eq>", ""),
                content_layer=self.content_layer,
            )
            elem_ref.append(t1.get_ref())
        else:
            # Inline equation: alternate text and formula items in a group
            inline_equation = doc.add_inline_group(
                parent=self.parents[level - 1], content_layer=self.content_layer
            )
            elem_ref.append(inline_equation.get_ref())
            remaining = text
            for eq in equations:
                if len(remaining) == 0:
                    break

                # Everything before the equation is plain text; the rest is
                # kept for the next equation (empty when eq is not found).
                pre_eq_text, _, remaining = remaining.partition(eq.strip())

                if len(pre_eq_text) > 0:
                    e1 = doc.add_text(
                        label=DocItemLabel.TEXT,
                        parent=inline_equation,
                        text=pre_eq_text,
                        content_layer=self.content_layer,
                    )
                    elem_ref.append(e1.get_ref())
                e2 = doc.add_text(
                    label=DocItemLabel.FORMULA,
                    parent=inline_equation,
                    text=eq.replace("<eq>", "").replace("</eq>", ""),
                    content_layer=self.content_layer,
                )
                elem_ref.append(e2.get_ref())

            if len(remaining) > 0:
                e3 = doc.add_text(
                    label=DocItemLabel.TEXT,
                    parent=inline_equation,
                    text=remaining.strip(),
                    content_layer=self.content_layer,
                )
                elem_ref.append(e3.get_ref())

    else:
        # Style names can be built-in ("Paragraph", "Normal", "Subtitle",
        # "Author", "DefaultText", "ListParagraph", "ListBullet", "Quote")
        # or user-defined; all of them are treated as pure text.
        level = self._get_level()
        parent = self._create_or_reuse_parent(
            doc=doc,
            prev_parent=self.parents.get(level - 1),
            paragraph_elements=paragraph_elements,
        )
        for piece_text, piece_format, piece_link in paragraph_elements:
            t2 = doc.add_text(
                label=DocItemLabel.TEXT,
                parent=parent,
                text=piece_text,
                formatting=piece_format,
                hyperlink=piece_link,
                content_layer=self.content_layer,
            )
            elem_ref.append(t2.get_ref())

    self._update_history(p_style_id, p_level, numid, ilevel)
    return elem_ref
+
1071
def _add_heading(
    self,
    doc: DoclingDocument,
    curr_level: Optional[int],
    text: str,
    is_numbered_style: bool = False,
) -> list[RefItem]:
    """Add a heading and update the parent stack for its level.

    Args:
        doc: The document being built.
        curr_level: The heading level. When it is not an int, the heading
            is added with level 1 at ``self.level``.
        text: The heading text.
        is_numbered_style: Whether to prefix the text with a section number
            derived from ``self.numbered_headers``.

    Returns:
        References to any filler groups created for skipped levels,
        followed by the reference to the heading itself.
    """
    elem_ref: list[RefItem] = []
    level = self._get_level()
    if isinstance(curr_level, int):
        if curr_level > level:
            # Levels were skipped: add invisible groups so that every level
            # up to the heading has a parent.
            for i in range(level, curr_level):
                gr1 = doc.add_group(
                    parent=self.parents[i - 1],
                    label=GroupLabel.SECTION,
                    name=f"header-{i}",
                    # Keep filler groups in the layer being parsed
                    content_layer=self.content_layer,
                )
                elem_ref.append(gr1.get_ref())
                self.parents[i] = gr1

        elif curr_level < level:
            # remove the tail
            for key in range(len(self.parents)):
                if key >= curr_level:
                    self.parents[key] = None

        current_level = curr_level
        parent_level = curr_level - 1
        add_level = curr_level
    else:
        current_level = self.level
        parent_level = self.level - 1
        add_level = 1

    if is_numbered_style:
        if add_level in self.numbered_headers:
            self.numbered_headers[add_level] += 1
        else:
            self.numbered_headers[add_level] = 1
        text = f"{self.numbered_headers[add_level]} {text}"

        # Reset deeper levels
        next_level = add_level + 1
        while next_level in self.numbered_headers:
            self.numbered_headers[next_level] = 0
            next_level += 1

        # Scan upper levels and prepend their counters ("2.1.<text>")
        previous_level = add_level - 1
        while previous_level in self.numbered_headers:
            # MSWord convention: no empty sublevels
            # I.e., sub-sub section (2.0.1) without a sub-section (2.1)
            # is processed as 2.1.1
            if self.numbered_headers[previous_level] == 0:
                self.numbered_headers[previous_level] += 1

            text = f"{self.numbered_headers[previous_level]}.{text}"
            previous_level -= 1

    hd = doc.add_heading(
        parent=self.parents[parent_level],
        text=text,
        level=add_level,
        # Same layer as every other item added by this backend, so headings
        # found while parsing headers/footers are not placed in the body.
        content_layer=self.content_layer,
    )
    self.parents[current_level] = hd
    elem_ref.append(hd.get_ref())
    return elem_ref
+
1140
def _add_formatted_list_item(
    self,
    doc: DoclingDocument,
    elements: list,
    marker: str,
    enumerated: bool,
    level: int,
) -> list[RefItem]:
    """Add one list item under the list group stored at ``self.parents[level]``.

    A single element becomes a list item carrying its text, formatting and
    hyperlink. Several elements become an empty list item holding an inline
    group with one text item per non-empty element.

    Args:
        doc: The document being built.
        elements: ``(text, formatting, hyperlink)`` tuples of the paragraph.
        marker: The marker of the list item (e.g. ``"1."`` or ``""``).
        enumerated: Whether the list item belongs to a numbered list.
        level: Key in ``self.parents`` of the list group to attach to.

    Returns:
        An empty list: no reference is recorded for the created items.
    """
    elem_ref: list[RefItem] = []
    # This should not happen by construction
    if not isinstance(self.parents[level], ListGroup):
        _log.warning(
            "Parent element of the list item is not a ListGroup. The list item will be ignored."
        )
        return elem_ref
    if not elements:
        return elem_ref

    if len(elements) == 1:
        text, formatting, hyperlink = elements[0]
        if text:
            doc.add_list_item(
                marker=marker,
                enumerated=enumerated,
                parent=self.parents[level],
                text=text,
                formatting=formatting,
                hyperlink=hyperlink,
                # Same layer as the enclosing list group
                content_layer=self.content_layer,
            )
    else:
        new_item = doc.add_list_item(
            marker=marker,
            enumerated=enumerated,
            parent=self.parents[level],
            text="",
            content_layer=self.content_layer,
        )
        new_parent = doc.add_inline_group(
            parent=new_item, content_layer=self.content_layer
        )
        for text, formatting, hyperlink in elements:
            if text:
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    parent=new_parent,
                    text=text,
                    formatting=formatting,
                    hyperlink=hyperlink,
                    content_layer=self.content_layer,
                )
    return elem_ref
+
1189
+ def _add_list_item(
1190
+ self,
1191
+ *,
1192
+ doc: DoclingDocument,
1193
+ numid: int,
1194
+ ilevel: int,
1195
+ elements: list,
1196
+ is_numbered: bool = False,
1197
+ ) -> list[RefItem]:
1198
+ elem_ref: list[RefItem] = []
1199
+ # this method is always called with is_numbered. Numbered lists should be properly addressed.
1200
+ if not elements:
1201
+ return elem_ref
1202
+ enum_marker = ""
1203
+
1204
+ level = self._get_level()
1205
+ prev_indent = self._prev_indent()
1206
+ if self._prev_numid() is None or (
1207
+ self._prev_numid() == numid and self.level_at_new_list is None
1208
+ ): # Open new list
1209
+ self.level_at_new_list = level
1210
+
1211
+ # Reset counters for the new numbering sequence
1212
+ self._reset_list_counters_for_new_sequence(numid)
1213
+
1214
+ list_gr = doc.add_list_group(
1215
+ name="list",
1216
+ parent=self.parents[level - 1],
1217
+ content_layer=self.content_layer,
1218
+ )
1219
+ self.parents[level] = list_gr
1220
+ elem_ref.append(list_gr.get_ref())
1221
+
1222
+ # Set marker and enumerated arguments if this is an enumeration element.
1223
+ if is_numbered:
1224
+ counter = self._get_list_counter(numid, ilevel)
1225
+ enum_marker = str(counter) + "."
1226
+ else:
1227
+ enum_marker = ""
1228
+ self._add_formatted_list_item(
1229
+ doc, elements, enum_marker, is_numbered, level
1230
+ )
1231
+ elif (
1232
+ self._prev_numid() == numid
1233
+ and self.level_at_new_list is not None
1234
+ and prev_indent is not None
1235
+ and prev_indent < ilevel
1236
+ ): # Open indented list
1237
+ for i in range(
1238
+ self.level_at_new_list + prev_indent + 1,
1239
+ self.level_at_new_list + ilevel + 1,
1240
+ ):
1241
+ list_gr1 = doc.add_list_group(
1242
+ name="list",
1243
+ parent=self.parents[i - 1],
1244
+ content_layer=self.content_layer,
1245
+ )
1246
+ self.parents[i] = list_gr1
1247
+ elem_ref.append(list_gr1.get_ref())
1248
+
1249
+ # TODO: Set marker and enumerated arguments if this is an enumeration element.
1250
+ if is_numbered:
1251
+ counter = self._get_list_counter(numid, ilevel)
1252
+ enum_marker = str(counter) + "."
1253
+ else:
1254
+ enum_marker = ""
1255
+ self._add_formatted_list_item(
1256
+ doc,
1257
+ elements,
1258
+ enum_marker,
1259
+ is_numbered,
1260
+ self.level_at_new_list + ilevel,
1261
+ )
1262
+ elif (
1263
+ self._prev_numid() == numid
1264
+ and self.level_at_new_list is not None
1265
+ and prev_indent is not None
1266
+ and ilevel < prev_indent
1267
+ ): # Close list
1268
+ for k in self.parents:
1269
+ if k > self.level_at_new_list + ilevel:
1270
+ self.parents[k] = None
1271
+
1272
+ # TODO: Set marker and enumerated arguments if this is an enumeration element.
1273
+ if is_numbered:
1274
+ counter = self._get_list_counter(numid, ilevel)
1275
+ enum_marker = str(counter) + "."
1276
+ else:
1277
+ enum_marker = ""
1278
+ self._add_formatted_list_item(
1279
+ doc,
1280
+ elements,
1281
+ enum_marker,
1282
+ is_numbered,
1283
+ self.level_at_new_list + ilevel,
1284
+ )
1285
+
1286
+ elif self._prev_numid() == numid or prev_indent == ilevel:
1287
+ # Set marker and enumerated arguments if this is an enumeration element.
1288
+ if is_numbered:
1289
+ counter = self._get_list_counter(numid, ilevel)
1290
+ enum_marker = str(counter) + "."
1291
+ else:
1292
+ enum_marker = ""
1293
+ self._add_formatted_list_item(
1294
+ doc, elements, enum_marker, is_numbered, level - 1
1295
+ )
1296
+ else:
1297
+ _log.warning("List item not matching any insert condition.")
1298
+ return elem_ref
1299
+
1300
@staticmethod
def _group_cell_elements(
    group_name: str,
    doc: DoclingDocument,
    provs_in_cell: list[RefItem],
    docling_table: TableItem,
    content_layer: ContentLayer = ContentLayer.BODY,
) -> RefItem:
    """Move the items of a table cell under a new group owned by the table.

    Args:
        group_name: Name given to the new group.
        doc: The document holding the items.
        provs_in_cell: References to the items created for the cell.
        docling_table: The table item that becomes the parent of the group.
        content_layer: Content layer of the new group.

    Returns:
        The reference to the new group.
    """
    cell_group = doc.add_group(
        label=GroupLabel.UNSPECIFIED,
        name=group_name,
        parent=docling_table,
        content_layer=content_layer,
    )
    for item_ref in provs_in_cell:
        cell_group.children.append(item_ref)
        item = item_ref.resolve(doc)
        # Detach the item from its former parent before re-parenting it
        former_parent = item.parent.resolve(doc)
        if item.get_ref() in former_parent.children:
            former_parent.children.remove(item.get_ref())
        item.parent = cell_group.get_ref()
    return cell_group.get_ref()
+
1324
def _handle_tables(
    self,
    element: BaseOxmlElement,
    doc: DoclingDocument,
) -> list[RefItem]:
    """Convert a table element into a table item with its cells.

    A table with a single cell is not added as a table: its content is
    walked as regular content and an empty list is returned. Otherwise one
    cell is added per distinct cell element; cells that repeat an element
    already seen (merged cells) are skipped.

    Args:
        element: The XML element of the table.
        doc: The document being built.

    Returns:
        A list with the reference to the table item, or an empty list for a
        single-cell table.
    """
    elem_ref: list[RefItem] = []
    table: Table = Table(element, self.docx_obj)
    rows = list(table.rows)
    num_rows = len(rows)
    num_cols = len(table.columns)
    _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")

    if num_rows == 1 and num_cols == 1:
        cell_element = rows[0].cells[0]
        # In case we have a table of only 1 cell, we consider it furniture
        # And proceed processing the content of the cell as though it's in the document body
        self._walk_linear(cell_element._element, doc)
        return elem_ref

    data = TableData(num_rows=num_rows, num_cols=num_cols)
    level = self._get_level()
    docling_table = doc.add_table(
        data=data, parent=self.parents[level - 1], content_layer=self.content_layer
    )
    elem_ref.append(docling_table.get_ref())

    # Read the cells of every row once instead of on every access
    cells_by_row = [row.cells for row in rows]

    cell_set: set[CT_Tc] = set()
    for row_idx, row in enumerate(rows):
        row_cells = cells_by_row[row_idx]
        _log.debug(f"Row index {row_idx} with {len(row_cells)} populated cells")
        col_idx = 0
        while col_idx < num_cols:
            # Handle merged cells: row may have fewer cells than num_cols
            if col_idx >= len(row_cells):
                break
            cell: _Cell = row_cells[col_idx]
            _log.debug(
                f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
            )
            if cell._tc in cell_set:
                _log.debug(" skipped since repeated content")
                col_idx += cell.grid_span
                continue
            cell_set.add(cell._tc)

            # Count how many following rows repeat this cell element in the
            # same column (vertical merge).
            spanned_idx = row_idx
            spanned_tc: Optional[CT_Tc] = cell._tc
            while spanned_tc == cell._tc:
                spanned_idx += 1
                if spanned_idx < num_rows and col_idx < len(
                    cells_by_row[spanned_idx]
                ):
                    spanned_tc = cells_by_row[spanned_idx][col_idx]._tc
                else:
                    # Past the last row, or the next row is too short
                    spanned_tc = None
            _log.debug(f" spanned before row {spanned_idx}")

            # Detect equations in cell text
            text, equations = self._handle_equations_in_text(
                element=cell._element, text=cell.text
            )
            if len(equations) == 0:
                text = cell.text
            else:
                text = text.replace("<eq>", "$").replace("</eq>", "$")

            provs_in_cell: list[RefItem] = []
            rich_table_cell: bool = self._is_rich_table_cell(cell)

            if rich_table_cell:
                _, provs_in_cell = self._walk_linear(cell._element, doc)
            _log.debug(f"Table cell {row_idx},{col_idx} rich? {rich_table_cell}")

            # NOTE(review): grid_cols_before is added to the row offsets;
            # it looks like a column quantity -- confirm this is intended.
            row_offset = row.grid_cols_before + row_idx
            cell_kwargs = dict(
                text=text,
                row_span=spanned_idx - row_idx,
                col_span=cell.grid_span,
                start_row_offset_idx=row_offset,
                end_row_offset_idx=row.grid_cols_before + spanned_idx,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + cell.grid_span,
                column_header=row_offset == 0,
                row_header=False,
            )

            if len(provs_in_cell) > 0:
                # Cell has multiple elements, we need to group them
                group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{row.grid_cols_before + row_idx}"
                ref_for_rich_cell = MsWordDocumentBackend._group_cell_elements(
                    group_name,
                    doc,
                    provs_in_cell,
                    docling_table,
                    content_layer=self.content_layer,
                )
                table_cell = RichTableCell(
                    ref=ref_for_rich_cell,  # points to an artificial group around children
                    **cell_kwargs,
                )
            else:
                # Plain cell, or a rich-looking cell whose walk produced no
                # items: there is no group to reference, so use a plain cell
                # rather than an unbound or stale group reference.
                table_cell = TableCell(**cell_kwargs)

            doc.add_table_cell(table_item=docling_table, cell=table_cell)
            col_idx += cell.grid_span
    return elem_ref
+
1438
+ def _has_blip(self, element: BaseOxmlElement) -> bool:
1439
+ """Check if a docx element holds any BLIP as a child.
1440
+
1441
+ Args:
1442
+ element: a docx element
1443
+
1444
+ Returns:
1445
+ Whether the element contains a BLIP as a direct child.
1446
+ """
1447
+
1448
+ for item in element:
1449
+ if self.blip_xpath_expr(item):
1450
+ return True
1451
+ if item.findall(
1452
+ ".//w:drawing", namespaces=MsWordDocumentBackend._BLIP_NAMESPACES
1453
+ ):
1454
+ return True
1455
+
1456
+ return False
1457
+
1458
def _is_rich_table_cell(self, cell: _Cell) -> bool:
    """Determine whether a docx cell should be parsed as a Docling RichTableCell.

    A docx cell can hold rich content and be parsed with a Docling RichTableCell.
    However, this requires walking through the lxml elements and creating
    node items. If the cell holds only plain text, the parsing is simpler
    and using a TableCell is preferred.

    Plain text means:
    - The cell has at most one paragraph
    - The paragraph's runs carry no formatting (their format equals a
      default ``Formatting()``).
    - The cell element has no children other than paragraphs and table-cell
      properties, and ``self._has_blip`` finds no picture in it.

    Args:
        cell: A docx cell

    Returns:
        Whether the docx cell should be parsed as RichTableCell
    """
    tc = cell._tc
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

    # must contain only one paragraph
    paragraphs = list(tc.iterchildren(f"{w_ns}p"))
    if len(paragraphs) > 1:
        return True

    # no other content
    allowed_tags = {"p", "tcPr"}  # paragraph or table-cell properties
    if any(child.tag.split("}")[-1] not in allowed_tags for child in tc):
        return True

    # The picture check does not depend on the child being inspected, so
    # it runs once for the whole cell.
    if self._has_blip(tc):
        return True

    # paragraph must contain runs with no run-properties
    plain_format = Formatting()
    for para in paragraphs:
        for rn in para.iterchildren(f"{w_ns}r"):
            run: Run = Run(rn, self.docx_obj)
            if MsWordDocumentBackend._get_format_from_run(run) != plain_format:
                return True

    # All checks passed: plain text only
    return False
+
1515
+ def _handle_pictures(
1516
+ self, drawing_blip: Any, doc: DoclingDocument
1517
+ ) -> list[RefItem]:
1518
+ def get_docx_image(image: Any) -> Optional[bytes]:
1519
+ image_data: Optional[bytes] = None
1520
+ rId = image.get(
1521
+ "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
1522
+ )
1523
+ if rId in self.docx_obj.part.rels:
1524
+ # Access the image part using the relationship ID
1525
+ image_part = self.docx_obj.part.rels[rId].target_part
1526
+ image_data = image_part.blob # Get the binary image data
1527
+ return image_data
1528
+
1529
+ elem_ref: list[RefItem] = []
1530
+ if drawing_blip:
1531
+ level = self._get_level()
1532
+ # Open the BytesIO object with PIL to create an Image
1533
+ parent: Optional[NodeItem] = (
1534
+ self.parents[level - 1]
1535
+ if len(drawing_blip) == 1
1536
+ else doc.add_group(
1537
+ label=GroupLabel.PICTURE_AREA,
1538
+ parent=self.parents[level - 1],
1539
+ content_layer=self.content_layer,
1540
+ )
1541
+ )
1542
+ for image in drawing_blip:
1543
+ image_data: Optional[bytes] = get_docx_image(image)
1544
+ if image_data is None:
1545
+ _log.warning("Warning: image cannot be found")
1546
+ p1 = doc.add_picture(
1547
+ parent=parent,
1548
+ caption=None,
1549
+ content_layer=self.content_layer,
1550
+ )
1551
+ elem_ref.append(p1.get_ref())
1552
+ else:
1553
+ try:
1554
+ image_bytes = BytesIO(image_data)
1555
+ pil_image = Image.open(image_bytes)
1556
+ p2 = doc.add_picture(
1557
+ parent=parent,
1558
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
1559
+ caption=None,
1560
+ content_layer=self.content_layer,
1561
+ )
1562
+ elem_ref.append(p2.get_ref())
1563
+ except (UnidentifiedImageError, OSError):
1564
+ _log.warning("Warning: image cannot be loaded by Pillow")
1565
+ p3 = doc.add_picture(
1566
+ parent=parent,
1567
+ caption=None,
1568
+ content_layer=self.content_layer,
1569
+ )
1570
+ elem_ref.append(p3.get_ref())
1571
+ return elem_ref
1572
+
1573
def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
    """Render DrawingML elements to an image and add it as a picture.

    The source document is loaded again, its body is emptied, and copies of
    the DrawingML elements are placed in a single run. That document is
    passed to ``get_pil_from_dml_docx`` to obtain an image. When no image
    is produced, a picture item without image data is added instead.

    Args:
        doc: The document being built.
        drawingml_els: The DrawingML elements to render.
    """
    # 1) Make an empty copy of the original document
    dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
    body = dml_doc._element.body
    for stale_child in list(body):
        body.remove(stale_child)

    # 2) Add DrawingML to empty document
    target_run = dml_doc.add_paragraph().add_run()
    for dml in drawingml_els:
        # Copy, so the elements stay attached to the original document
        target_run._r.append(deepcopy(dml))

    # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
    level = self._get_level()
    try:
        pil_image = get_pil_from_dml_docx(
            dml_doc, converter=self.docx_to_pdf_converter
        )
        if pil_image is None:
            # Route a missing image through the same fallback as a failed load
            raise UnidentifiedImageError

        doc.add_picture(
            parent=self.parents[level - 1],
            image=ImageRef.from_pil(image=pil_image, dpi=72),
            caption=None,
            content_layer=self.content_layer,
        )
    except (UnidentifiedImageError, OSError):
        _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
        doc.add_picture(
            parent=self.parents[level - 1],
            caption=None,
            content_layer=self.content_layer,
        )

    return
+
1611
def _add_header_footer(self, docx_obj: DocxDocument, doc: DoclingDocument) -> None:
    """Add section headers and footers.

    Headers and footers are added in the furniture content layer. Each
    header or footer that has text, tables or pictures gets its own group
    item, and its content is walked under that group. Sections after the
    first are only parsed when they define a different first-page
    header/footer.

    Args:
        docx_obj: A docx Document object to be parsed.
        doc: A DoclingDocument object to add the header and footer from docx_obj.
    """
    current_layer = self.content_layer
    base_parent = self.parents[0]
    self.content_layer = ContentLayer.FURNITURE

    # (group name, attribute for the first-page variant, regular attribute)
    parts = (
        ("page header", "first_page_header", "header"),
        ("page footer", "first_page_footer", "footer"),
    )
    try:
        for sec_idx, section in enumerate(docx_obj.sections):
            if sec_idx > 0 and not section.different_first_page_header_footer:
                continue

            for group_name, first_page_attr, regular_attr in parts:
                part = getattr(
                    section,
                    first_page_attr
                    if section.different_first_page_header_footer
                    else regular_attr,
                )
                has_text = any(par.text.strip() for par in part.paragraphs)
                if has_text or part.tables or self._has_blip(part._element):
                    self.parents[0] = doc.add_group(
                        label=GroupLabel.SECTION,
                        name=group_name,
                        content_layer=self.content_layer,
                    )
                    self._walk_linear(part._element, doc)
    finally:
        # Restore the parsing state even if walking a header/footer fails,
        # so later content is not added to the furniture layer.
        self.content_layer = current_layer
        self.parents[0] = base_parent