docling 2.28.4__tar.gz → 2.29.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. {docling-2.28.4 → docling-2.29.0}/PKG-INFO +1 -1
  2. {docling-2.28.4 → docling-2.29.0}/docling/backend/docx/latex/latex_dict.py +3 -0
  3. {docling-2.28.4 → docling-2.29.0}/docling/backend/docx/latex/omml.py +14 -14
  4. {docling-2.28.4 → docling-2.29.0}/docling/backend/html_backend.py +2 -1
  5. {docling-2.28.4 → docling-2.29.0}/docling/backend/mspowerpoint_backend.py +4 -3
  6. {docling-2.28.4 → docling-2.29.0}/docling/backend/msword_backend.py +300 -106
  7. {docling-2.28.4 → docling-2.29.0}/docling/cli/main.py +50 -0
  8. {docling-2.28.4 → docling-2.29.0}/docling/models/tesseract_ocr_cli_model.py +1 -1
  9. {docling-2.28.4 → docling-2.29.0}/pyproject.toml +1 -1
  10. {docling-2.28.4 → docling-2.29.0}/LICENSE +0 -0
  11. {docling-2.28.4 → docling-2.29.0}/README.md +0 -0
  12. {docling-2.28.4 → docling-2.29.0}/docling/__init__.py +0 -0
  13. {docling-2.28.4 → docling-2.29.0}/docling/backend/__init__.py +0 -0
  14. {docling-2.28.4 → docling-2.29.0}/docling/backend/abstract_backend.py +0 -0
  15. {docling-2.28.4 → docling-2.29.0}/docling/backend/asciidoc_backend.py +0 -0
  16. {docling-2.28.4 → docling-2.29.0}/docling/backend/csv_backend.py +0 -0
  17. {docling-2.28.4 → docling-2.29.0}/docling/backend/docling_parse_backend.py +0 -0
  18. {docling-2.28.4 → docling-2.29.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  19. {docling-2.28.4 → docling-2.29.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  20. {docling-2.28.4 → docling-2.29.0}/docling/backend/docx/__init__.py +0 -0
  21. {docling-2.28.4 → docling-2.29.0}/docling/backend/docx/latex/__init__.py +0 -0
  22. {docling-2.28.4 → docling-2.29.0}/docling/backend/json/__init__.py +0 -0
  23. {docling-2.28.4 → docling-2.29.0}/docling/backend/json/docling_json_backend.py +0 -0
  24. {docling-2.28.4 → docling-2.29.0}/docling/backend/md_backend.py +0 -0
  25. {docling-2.28.4 → docling-2.29.0}/docling/backend/msexcel_backend.py +0 -0
  26. {docling-2.28.4 → docling-2.29.0}/docling/backend/pdf_backend.py +0 -0
  27. {docling-2.28.4 → docling-2.29.0}/docling/backend/pypdfium2_backend.py +0 -0
  28. {docling-2.28.4 → docling-2.29.0}/docling/backend/xml/__init__.py +0 -0
  29. {docling-2.28.4 → docling-2.29.0}/docling/backend/xml/jats_backend.py +0 -0
  30. {docling-2.28.4 → docling-2.29.0}/docling/backend/xml/uspto_backend.py +0 -0
  31. {docling-2.28.4 → docling-2.29.0}/docling/chunking/__init__.py +0 -0
  32. {docling-2.28.4 → docling-2.29.0}/docling/cli/__init__.py +0 -0
  33. {docling-2.28.4 → docling-2.29.0}/docling/cli/models.py +0 -0
  34. {docling-2.28.4 → docling-2.29.0}/docling/cli/tools.py +0 -0
  35. {docling-2.28.4 → docling-2.29.0}/docling/datamodel/__init__.py +0 -0
  36. {docling-2.28.4 → docling-2.29.0}/docling/datamodel/base_models.py +0 -0
  37. {docling-2.28.4 → docling-2.29.0}/docling/datamodel/document.py +0 -0
  38. {docling-2.28.4 → docling-2.29.0}/docling/datamodel/pipeline_options.py +0 -0
  39. {docling-2.28.4 → docling-2.29.0}/docling/datamodel/settings.py +0 -0
  40. {docling-2.28.4 → docling-2.29.0}/docling/document_converter.py +0 -0
  41. {docling-2.28.4 → docling-2.29.0}/docling/exceptions.py +0 -0
  42. {docling-2.28.4 → docling-2.29.0}/docling/models/__init__.py +0 -0
  43. {docling-2.28.4 → docling-2.29.0}/docling/models/base_model.py +0 -0
  44. {docling-2.28.4 → docling-2.29.0}/docling/models/base_ocr_model.py +0 -0
  45. {docling-2.28.4 → docling-2.29.0}/docling/models/code_formula_model.py +0 -0
  46. {docling-2.28.4 → docling-2.29.0}/docling/models/document_picture_classifier.py +0 -0
  47. {docling-2.28.4 → docling-2.29.0}/docling/models/easyocr_model.py +0 -0
  48. {docling-2.28.4 → docling-2.29.0}/docling/models/factories/__init__.py +0 -0
  49. {docling-2.28.4 → docling-2.29.0}/docling/models/factories/base_factory.py +0 -0
  50. {docling-2.28.4 → docling-2.29.0}/docling/models/factories/ocr_factory.py +0 -0
  51. {docling-2.28.4 → docling-2.29.0}/docling/models/factories/picture_description_factory.py +0 -0
  52. {docling-2.28.4 → docling-2.29.0}/docling/models/hf_mlx_model.py +0 -0
  53. {docling-2.28.4 → docling-2.29.0}/docling/models/hf_vlm_model.py +0 -0
  54. {docling-2.28.4 → docling-2.29.0}/docling/models/layout_model.py +0 -0
  55. {docling-2.28.4 → docling-2.29.0}/docling/models/ocr_mac_model.py +0 -0
  56. {docling-2.28.4 → docling-2.29.0}/docling/models/page_assemble_model.py +0 -0
  57. {docling-2.28.4 → docling-2.29.0}/docling/models/page_preprocessing_model.py +0 -0
  58. {docling-2.28.4 → docling-2.29.0}/docling/models/picture_description_api_model.py +0 -0
  59. {docling-2.28.4 → docling-2.29.0}/docling/models/picture_description_base_model.py +0 -0
  60. {docling-2.28.4 → docling-2.29.0}/docling/models/picture_description_vlm_model.py +0 -0
  61. {docling-2.28.4 → docling-2.29.0}/docling/models/plugins/__init__.py +0 -0
  62. {docling-2.28.4 → docling-2.29.0}/docling/models/plugins/defaults.py +0 -0
  63. {docling-2.28.4 → docling-2.29.0}/docling/models/rapid_ocr_model.py +0 -0
  64. {docling-2.28.4 → docling-2.29.0}/docling/models/readingorder_model.py +0 -0
  65. {docling-2.28.4 → docling-2.29.0}/docling/models/table_structure_model.py +0 -0
  66. {docling-2.28.4 → docling-2.29.0}/docling/models/tesseract_ocr_model.py +0 -0
  67. {docling-2.28.4 → docling-2.29.0}/docling/pipeline/__init__.py +0 -0
  68. {docling-2.28.4 → docling-2.29.0}/docling/pipeline/base_pipeline.py +0 -0
  69. {docling-2.28.4 → docling-2.29.0}/docling/pipeline/simple_pipeline.py +0 -0
  70. {docling-2.28.4 → docling-2.29.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  71. {docling-2.28.4 → docling-2.29.0}/docling/pipeline/vlm_pipeline.py +0 -0
  72. {docling-2.28.4 → docling-2.29.0}/docling/py.typed +0 -0
  73. {docling-2.28.4 → docling-2.29.0}/docling/utils/__init__.py +0 -0
  74. {docling-2.28.4 → docling-2.29.0}/docling/utils/accelerator_utils.py +0 -0
  75. {docling-2.28.4 → docling-2.29.0}/docling/utils/export.py +0 -0
  76. {docling-2.28.4 → docling-2.29.0}/docling/utils/glm_utils.py +0 -0
  77. {docling-2.28.4 → docling-2.29.0}/docling/utils/layout_postprocessor.py +0 -0
  78. {docling-2.28.4 → docling-2.29.0}/docling/utils/locks.py +0 -0
  79. {docling-2.28.4 → docling-2.29.0}/docling/utils/model_downloader.py +0 -0
  80. {docling-2.28.4 → docling-2.29.0}/docling/utils/ocr_utils.py +0 -0
  81. {docling-2.28.4 → docling-2.29.0}/docling/utils/profiling.py +0 -0
  82. {docling-2.28.4 → docling-2.29.0}/docling/utils/utils.py +0 -0
  83. {docling-2.28.4 → docling-2.29.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.28.4
3
+ Version: 2.29.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -215,6 +215,9 @@ FUNC = {
215
215
  "coth": "\\coth({fe})",
216
216
  "sec": "\\sec({fe})",
217
217
  "csc": "\\csc({fe})",
218
+ "mod": "\\mod {fe}",
219
+ "max": "\\max({fe})",
220
+ "min": "\\min({fe})",
218
221
  }
219
222
 
220
223
  FUNC_PLACE = "{fe}"
@@ -5,6 +5,8 @@ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
5
5
  On 23/01/2025
6
6
  """
7
7
 
8
+ import logging
9
+
8
10
  import lxml.etree as ET
9
11
  from pylatexenc.latexencode import UnicodeToLatexEncoder
10
12
 
@@ -39,6 +41,8 @@ from docling.backend.docx.latex.latex_dict import (
39
41
 
40
42
  OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
41
43
 
44
+ _log = logging.getLogger(__name__)
45
+
42
46
 
43
47
  def load(stream):
44
48
  tree = ET.parse(stream)
@@ -281,8 +285,10 @@ class oMath2Latex(Tag2Method):
281
285
  if FUNC.get(t):
282
286
  latex_chars.append(FUNC[t])
283
287
  else:
284
- raise NotSupport("Not support func %s" % t)
285
- else:
288
+ _log.warning("Function not supported, will default to text: %s", t)
289
+ if isinstance(t, str):
290
+ latex_chars.append(t)
291
+ elif isinstance(t, str):
286
292
  latex_chars.append(t)
287
293
  t = BLANK.join(latex_chars)
288
294
  return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
@@ -382,8 +388,6 @@ class oMath2Latex(Tag2Method):
382
388
 
383
389
  out_latex_str = self.u.unicode_to_latex(s)
384
390
 
385
- # print(s, out_latex_str)
386
-
387
391
  if (
388
392
  s.startswith("{") is False
389
393
  and out_latex_str.startswith("{")
@@ -392,19 +396,13 @@ class oMath2Latex(Tag2Method):
392
396
  ):
393
397
  out_latex_str = f" {out_latex_str[1:-1]} "
394
398
 
395
- # print(s, out_latex_str)
396
-
397
399
  if "ensuremath" in out_latex_str:
398
400
  out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
399
401
  out_latex_str = out_latex_str.replace("}", " ")
400
402
 
401
- # print(s, out_latex_str)
402
-
403
403
  if out_latex_str.strip().startswith("\\text"):
404
404
  out_latex_str = f" \\text{{{out_latex_str}}} "
405
405
 
406
- # print(s, out_latex_str)
407
-
408
406
  return out_latex_str
409
407
 
410
408
  def do_r(self, elm):
@@ -415,10 +413,12 @@ class oMath2Latex(Tag2Method):
415
413
  """
416
414
  _str = []
417
415
  _base_str = []
418
- for s in elm.findtext("./{0}t".format(OMML_NS)):
419
- out_latex_str = self.process_unicode(s)
420
- _str.append(out_latex_str)
421
- _base_str.append(s)
416
+ found_text = elm.findtext("./{0}t".format(OMML_NS))
417
+ if found_text:
418
+ for s in found_text:
419
+ out_latex_str = self.process_unicode(s)
420
+ _str.append(out_latex_str)
421
+ _base_str.append(s)
422
422
 
423
423
  proc_str = escape_latex(BLANK.join(_str))
424
424
  base_proc_str = BLANK.join(_base_str)
@@ -34,6 +34,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
34
34
  "h6",
35
35
  "p",
36
36
  "pre",
37
+ "code",
37
38
  "ul",
38
39
  "ol",
39
40
  "li",
@@ -165,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
165
166
  self.handle_header(tag, doc)
166
167
  elif tag.name in ["p"]:
167
168
  self.handle_paragraph(tag, doc)
168
- elif tag.name in ["pre"]:
169
+ elif tag.name in ["pre", "code"]:
169
170
  self.handle_code(tag, doc)
170
171
  elif tag.name in ["ul", "ol"]:
171
172
  self.handle_list(tag, doc)
@@ -392,9 +392,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
392
392
  self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
393
393
  if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
394
394
  # Handle Pictures
395
- self.handle_pictures(
396
- shape, parent_slide, slide_ind, doc, slide_size
397
- )
395
+ if hasattr(shape, "image"):
396
+ self.handle_pictures(
397
+ shape, parent_slide, slide_ind, doc, slide_size
398
+ )
398
399
  # If shape doesn't have any text, move on to the next shape
399
400
  if not hasattr(shape, "text"):
400
401
  return
@@ -14,15 +14,19 @@ from docling_core.types.doc import (
14
14
  TableCell,
15
15
  TableData,
16
16
  )
17
+ from docling_core.types.doc.document import Formatting
17
18
  from docx import Document
18
19
  from docx.document import Document as DocxDocument
19
20
  from docx.oxml.table import CT_Tc
20
21
  from docx.oxml.xmlchemy import BaseOxmlElement
21
22
  from docx.table import Table, _Cell
23
+ from docx.text.hyperlink import Hyperlink
22
24
  from docx.text.paragraph import Paragraph
25
+ from docx.text.run import Run
23
26
  from lxml import etree
24
27
  from lxml.etree import XPath
25
28
  from PIL import Image, UnidentifiedImageError
29
+ from pydantic import AnyUrl
26
30
  from typing_extensions import override
27
31
 
28
32
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -54,6 +58,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
54
58
  self.level_at_new_list: Optional[int] = None
55
59
  self.parents: dict[int, Optional[NodeItem]] = {}
56
60
  self.numbered_headers: dict[int, int] = {}
61
+ self.equation_bookends: str = "<eq>{EQ}</eq>"
57
62
  for i in range(-1, self.max_levels):
58
63
  self.parents[i] = None
59
64
 
@@ -118,14 +123,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
118
123
  doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
119
124
  if self.is_valid():
120
125
  assert self.docx_obj is not None
121
- doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
126
+ doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
122
127
  return doc
123
128
  else:
124
129
  raise RuntimeError(
125
130
  f"Cannot convert doc with {self.document_hash} because the backend failed to init."
126
131
  )
127
132
 
128
- def update_history(
133
+ def _update_history(
129
134
  self,
130
135
  name: str,
131
136
  level: Optional[int],
@@ -138,26 +143,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
138
143
  self.history["numids"].append(numid)
139
144
  self.history["indents"].append(ilevel)
140
145
 
141
- def prev_name(self) -> Optional[str]:
146
+ def _prev_name(self) -> Optional[str]:
142
147
  return self.history["names"][-1]
143
148
 
144
- def prev_level(self) -> Optional[int]:
149
+ def _prev_level(self) -> Optional[int]:
145
150
  return self.history["levels"][-1]
146
151
 
147
- def prev_numid(self) -> Optional[int]:
152
+ def _prev_numid(self) -> Optional[int]:
148
153
  return self.history["numids"][-1]
149
154
 
150
- def prev_indent(self) -> Optional[int]:
155
+ def _prev_indent(self) -> Optional[int]:
151
156
  return self.history["indents"][-1]
152
157
 
153
- def get_level(self) -> int:
158
+ def _get_level(self) -> int:
154
159
  """Return the first None index."""
155
160
  for k, v in self.parents.items():
156
161
  if k >= 0 and v == None:
157
162
  return k
158
163
  return 0
159
164
 
160
- def walk_linear(
165
+ def _walk_linear(
161
166
  self,
162
167
  body: BaseOxmlElement,
163
168
  docx_obj: DocxDocument,
@@ -177,12 +182,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
177
182
  # Check for Tables
178
183
  if element.tag.endswith("tbl"):
179
184
  try:
180
- self.handle_tables(element, docx_obj, doc)
185
+ self._handle_tables(element, docx_obj, doc)
181
186
  except Exception:
182
187
  _log.debug("could not parse a table, broken docx table")
183
188
 
184
189
  elif drawing_blip:
185
- self.handle_pictures(docx_obj, drawing_blip, doc)
190
+ self._handle_pictures(docx_obj, drawing_blip, doc)
186
191
  # Check for the sdt containers, like table of contents
187
192
  elif tag_name in ["sdt"]:
188
193
  sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -190,16 +195,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
190
195
  # Iterate paragraphs, runs, or text inside <w:sdtContent>.
191
196
  paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
192
197
  for p in paragraphs:
193
- self.handle_text_elements(p, docx_obj, doc)
198
+ self._handle_text_elements(p, docx_obj, doc)
194
199
  # Check for Text
195
200
  elif tag_name in ["p"]:
196
201
  # "tcPr", "sectPr"
197
- self.handle_text_elements(element, docx_obj, doc)
202
+ self._handle_text_elements(element, docx_obj, doc)
198
203
  else:
199
204
  _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
200
205
  return doc
201
206
 
202
- def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
207
+ def _str_to_int(
208
+ self, s: Optional[str], default: Optional[int] = 0
209
+ ) -> Optional[int]:
203
210
  if s is None:
204
211
  return None
205
212
  try:
@@ -207,7 +214,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
207
214
  except ValueError:
208
215
  return default
209
216
 
210
- def split_text_and_number(self, input_string: str) -> list[str]:
217
+ def _split_text_and_number(self, input_string: str) -> list[str]:
211
218
  match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
212
219
  if match:
213
220
  parts = list(filter(None, match.groups()))
@@ -215,7 +222,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
215
222
  else:
216
223
  return [input_string]
217
224
 
218
- def get_numId_and_ilvl(
225
+ def _get_numId_and_ilvl(
219
226
  self, paragraph: Paragraph
220
227
  ) -> tuple[Optional[int], Optional[int]]:
221
228
  # Access the XML element of the paragraph
@@ -230,60 +237,188 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
230
237
  numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
231
238
  ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
232
239
 
233
- return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
240
+ return self._str_to_int(numId, None), self._str_to_int(ilvl, None)
234
241
 
235
242
  return None, None # If the paragraph is not part of a list
236
243
 
237
- def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
244
+ def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
245
+ parts = self._split_text_and_number(style_label)
246
+
247
+ if len(parts) == 2:
248
+ parts.sort()
249
+ label_str: str = ""
250
+ label_level: Optional[int] = 0
251
+ if parts[0].strip().lower() == "heading":
252
+ label_str = "Heading"
253
+ label_level = self._str_to_int(parts[1], None)
254
+ if parts[1].strip().lower() == "heading":
255
+ label_str = "Heading"
256
+ label_level = self._str_to_int(parts[0], None)
257
+ return label_str, label_level
258
+
259
+ return style_label, None
260
+
261
+ def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
238
262
  if paragraph.style is None:
239
263
  return "Normal", None
264
+
240
265
  label = paragraph.style.style_id
266
+ name = paragraph.style.name
267
+ base_style_label = None
268
+ base_style_name = None
269
+ if base_style := getattr(paragraph.style, "base_style", None):
270
+ base_style_label = base_style.style_id
271
+ base_style_name = base_style.name
272
+
241
273
  if label is None:
242
274
  return "Normal", None
275
+
243
276
  if ":" in label:
244
277
  parts = label.split(":")
245
-
246
278
  if len(parts) == 2:
247
- return parts[0], self.str_to_int(parts[1], None)
279
+ return parts[0], self._str_to_int(parts[1], None)
248
280
 
249
- parts = self.split_text_and_number(label)
281
+ if "heading" in label.lower():
282
+ return self._get_heading_and_level(label)
283
+ if "heading" in name.lower():
284
+ return self._get_heading_and_level(name)
285
+ if base_style_label and "heading" in base_style_label.lower():
286
+ return self._get_heading_and_level(base_style_label)
287
+ if base_style_name and "heading" in base_style_name.lower():
288
+ return self._get_heading_and_level(base_style_name)
250
289
 
251
- if "Heading" in label and len(parts) == 2:
252
- parts.sort()
253
- label_str: str = ""
254
- label_level: Optional[int] = 0
255
- if parts[0] == "Heading":
256
- label_str = parts[0]
257
- label_level = self.str_to_int(parts[1], None)
258
- if parts[1] == "Heading":
259
- label_str = parts[1]
260
- label_level = self.str_to_int(parts[0], None)
261
- return label_str, label_level
262
- else:
263
- return label, None
290
+ return label, None
264
291
 
265
- def handle_equations_in_text(self, element, text):
292
+ @classmethod
293
+ def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
294
+ has_any_formatting = run.bold or run.italic or run.underline
295
+ return (
296
+ Formatting(
297
+ bold=run.bold or False,
298
+ italic=run.italic or False,
299
+ underline=run.underline or False,
300
+ )
301
+ if has_any_formatting
302
+ else None
303
+ )
304
+
305
+ def _get_paragraph_elements(self, paragraph: Paragraph):
306
+ """
307
+ Extract paragraph elements along with their formatting and hyperlink
308
+ """
309
+
310
+ # for now retain empty paragraphs for backwards compatibility:
311
+ if paragraph.text.strip() == "":
312
+ return [("", None, None)]
313
+
314
+ paragraph_elements: list[
315
+ tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]]
316
+ ] = []
317
+ group_text = ""
318
+ previous_format = None
319
+
320
+ # Iterate over the runs of the paragraph and group them by format
321
+ for c in paragraph.iter_inner_content():
322
+ if isinstance(c, Hyperlink):
323
+ text = c.text
324
+ hyperlink = Path(c.address)
325
+ format = self._get_format_from_run(c.runs[0])
326
+ elif isinstance(c, Run):
327
+ text = c.text
328
+ hyperlink = None
329
+ format = self._get_format_from_run(c)
330
+ else:
331
+ continue
332
+
333
+ if (len(text.strip()) and format != previous_format) or (
334
+ hyperlink is not None
335
+ ):
336
+ # If the style changes for a non empty text, add the previous group
337
+ if len(group_text.strip()) > 0:
338
+ paragraph_elements.append(
339
+ (group_text.strip(), previous_format, None)
340
+ )
341
+ group_text = ""
342
+
343
+ # If there is a hyperlink, add it immediately
344
+ if hyperlink is not None:
345
+ paragraph_elements.append((text.strip(), format, hyperlink))
346
+ text = ""
347
+ else:
348
+ previous_format = format
349
+
350
+ group_text += text
351
+
352
+ # Format the last group
353
+ if len(group_text.strip()) > 0:
354
+ paragraph_elements.append((group_text.strip(), format, None))
355
+
356
+ return paragraph_elements
357
+
358
+ def _handle_equations_in_text(self, element, text):
266
359
  only_texts = []
267
360
  only_equations = []
268
361
  texts_and_equations = []
269
362
  for subt in element.iter():
270
363
  tag_name = etree.QName(subt).localname
271
364
  if tag_name == "t" and "math" not in subt.tag:
272
- only_texts.append(subt.text)
273
- texts_and_equations.append(subt.text)
365
+ if isinstance(subt.text, str):
366
+ only_texts.append(subt.text)
367
+ texts_and_equations.append(subt.text)
274
368
  elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
275
- latex_equation = str(oMath2Latex(subt))
276
- only_equations.append(latex_equation)
277
- texts_and_equations.append(latex_equation)
369
+ latex_equation = str(oMath2Latex(subt)).strip()
370
+ if len(latex_equation) > 0:
371
+ only_equations.append(
372
+ self.equation_bookends.format(EQ=latex_equation)
373
+ )
374
+ texts_and_equations.append(
375
+ self.equation_bookends.format(EQ=latex_equation)
376
+ )
278
377
 
279
- if "".join(only_texts).strip() != text.strip():
378
+ if len(only_equations) < 1:
379
+ return text, []
380
+
381
+ if (
382
+ re.sub(r"\s+", "", "".join(only_texts)).strip()
383
+ != re.sub(r"\s+", "", text).strip()
384
+ ):
280
385
  # If we are not able to reconstruct the initial raw text
281
386
  # do not try to parse equations and return the original
282
387
  return text, []
283
388
 
284
- return "".join(texts_and_equations), only_equations
389
+ # Insert equations into original text
390
+ # This is done to preserve white space structure
391
+ output_text = text[:]
392
+ init_i = 0
393
+ for i_substr, substr in enumerate(texts_and_equations):
394
+ if len(substr) == 0:
395
+ continue
285
396
 
286
- def handle_text_elements(
397
+ if substr in output_text[init_i:]:
398
+ init_i += output_text[init_i:].find(substr) + len(substr)
399
+ else:
400
+ if i_substr > 0:
401
+ output_text = output_text[:init_i] + substr + output_text[init_i:]
402
+ init_i += len(substr)
403
+ else:
404
+ output_text = substr + output_text
405
+
406
+ return output_text, only_equations
407
+
408
+ def _create_or_reuse_parent(
409
+ self,
410
+ *,
411
+ doc: DoclingDocument,
412
+ prev_parent: Optional[NodeItem],
413
+ paragraph_elements: list,
414
+ ) -> Optional[NodeItem]:
415
+ return (
416
+ doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
417
+ if len(paragraph_elements) > 1
418
+ else prev_parent
419
+ )
420
+
421
+ def _handle_text_elements(
287
422
  self,
288
423
  element: BaseOxmlElement,
289
424
  docx_obj: DocxDocument,
@@ -292,10 +427,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
292
427
  paragraph = Paragraph(element, docx_obj)
293
428
 
294
429
  raw_text = paragraph.text
295
- text, equations = self.handle_equations_in_text(element=element, text=raw_text)
430
+ text, equations = self._handle_equations_in_text(element=element, text=raw_text)
296
431
 
297
432
  if text is None:
298
433
  return
434
+ paragraph_elements = self._get_paragraph_elements(paragraph)
299
435
  text = text.strip()
300
436
 
301
437
  # Common styles for bullet and numbered lists.
@@ -303,8 +439,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
303
439
  # Identify wether list is a numbered list or not
304
440
  # is_numbered = "List Bullet" not in paragraph.style.name
305
441
  is_numbered = False
306
- p_style_id, p_level = self.get_label_and_level(paragraph)
307
- numid, ilevel = self.get_numId_and_ilvl(paragraph)
442
+ p_style_id, p_level = self._get_label_and_level(paragraph)
443
+ numid, ilevel = self._get_numId_and_ilvl(paragraph)
308
444
 
309
445
  if numid == 0:
310
446
  numid = None
@@ -315,18 +451,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
315
451
  and ilevel is not None
316
452
  and p_style_id not in ["Title", "Heading"]
317
453
  ):
318
- self.add_listitem(
319
- doc,
320
- numid,
321
- ilevel,
322
- text,
323
- is_numbered,
454
+ self._add_list_item(
455
+ doc=doc,
456
+ numid=numid,
457
+ ilevel=ilevel,
458
+ elements=paragraph_elements,
459
+ is_numbered=is_numbered,
324
460
  )
325
- self.update_history(p_style_id, p_level, numid, ilevel)
461
+ self._update_history(p_style_id, p_level, numid, ilevel)
326
462
  return
327
463
  elif (
328
464
  numid is None
329
- and self.prev_numid() is not None
465
+ and self._prev_numid() is not None
330
466
  and p_style_id not in ["Title", "Heading"]
331
467
  ): # Close list
332
468
  if self.level_at_new_list:
@@ -348,26 +484,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
348
484
  )
349
485
  elif "Heading" in p_style_id:
350
486
  style_element = getattr(paragraph.style, "element", None)
351
- if style_element:
487
+ if style_element is not None:
352
488
  is_numbered_style = (
353
489
  "<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
354
490
  )
355
491
  else:
356
492
  is_numbered_style = False
357
- self.add_header(doc, p_level, text, is_numbered_style)
493
+ self._add_header(doc, p_level, text, is_numbered_style)
358
494
 
359
495
  elif len(equations) > 0:
360
- if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
496
+ if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0:
361
497
  # Standalone equation
362
- level = self.get_level()
498
+ level = self._get_level()
363
499
  doc.add_text(
364
500
  label=DocItemLabel.FORMULA,
365
501
  parent=self.parents[level - 1],
366
- text=text,
502
+ text=text.replace("<eq>", "").replace("</eq>", ""),
367
503
  )
368
504
  else:
369
505
  # Inline equation
370
- level = self.get_level()
506
+ level = self._get_level()
371
507
  inline_equation = doc.add_group(
372
508
  label=GroupLabel.INLINE, parent=self.parents[level - 1]
373
509
  )
@@ -376,8 +512,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
376
512
  if len(text_tmp) == 0:
377
513
  break
378
514
 
379
- pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
380
- text_tmp = text_tmp.split(eq, maxsplit=1)[1]
515
+ split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1)
516
+
517
+ pre_eq_text = split_text_tmp[0]
518
+ text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
519
+
381
520
  if len(pre_eq_text) > 0:
382
521
  doc.add_text(
383
522
  label=DocItemLabel.PARAGRAPH,
@@ -387,13 +526,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
387
526
  doc.add_text(
388
527
  label=DocItemLabel.FORMULA,
389
528
  parent=inline_equation,
390
- text=eq,
529
+ text=eq.replace("<eq>", "").replace("</eq>", ""),
391
530
  )
531
+
392
532
  if len(text_tmp) > 0:
393
533
  doc.add_text(
394
534
  label=DocItemLabel.PARAGRAPH,
395
535
  parent=inline_equation,
396
- text=text_tmp,
536
+ text=text_tmp.strip(),
397
537
  )
398
538
 
399
539
  elif p_style_id in [
@@ -406,30 +546,50 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
406
546
  "ListBullet",
407
547
  "Quote",
408
548
  ]:
409
- level = self.get_level()
410
- doc.add_text(
411
- label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
549
+ level = self._get_level()
550
+ parent = self._create_or_reuse_parent(
551
+ doc=doc,
552
+ prev_parent=self.parents.get(level - 1),
553
+ paragraph_elements=paragraph_elements,
412
554
  )
555
+ for text, format, hyperlink in paragraph_elements:
556
+ doc.add_text(
557
+ label=DocItemLabel.PARAGRAPH,
558
+ parent=parent,
559
+ text=text,
560
+ formatting=format,
561
+ hyperlink=hyperlink,
562
+ )
413
563
 
414
564
  else:
415
565
  # Text style names can, and will have, not only default values but user values too
416
566
  # hence we treat all other labels as pure text
417
- level = self.get_level()
418
- doc.add_text(
419
- label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
567
+ level = self._get_level()
568
+ parent = self._create_or_reuse_parent(
569
+ doc=doc,
570
+ prev_parent=self.parents.get(level - 1),
571
+ paragraph_elements=paragraph_elements,
420
572
  )
573
+ for text, format, hyperlink in paragraph_elements:
574
+ doc.add_text(
575
+ label=DocItemLabel.PARAGRAPH,
576
+ parent=parent,
577
+ text=text,
578
+ formatting=format,
579
+ hyperlink=hyperlink,
580
+ )
421
581
 
422
- self.update_history(p_style_id, p_level, numid, ilevel)
582
+ self._update_history(p_style_id, p_level, numid, ilevel)
423
583
  return
424
584
 
425
- def add_header(
585
+ def _add_header(
426
586
  self,
427
587
  doc: DoclingDocument,
428
588
  curr_level: Optional[int],
429
589
  text: str,
430
590
  is_numbered_style: bool = False,
431
591
  ) -> None:
432
- level = self.get_level()
592
+ level = self._get_level()
433
593
  if isinstance(curr_level, int):
434
594
  if curr_level > level:
435
595
  # add invisible group
@@ -485,19 +645,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
485
645
  )
486
646
  return
487
647
 
488
- def add_listitem(
648
+ def _add_list_item(
489
649
  self,
650
+ *,
490
651
  doc: DoclingDocument,
491
652
  numid: int,
492
653
  ilevel: int,
493
- text: str,
654
+ elements: list,
494
655
  is_numbered: bool = False,
495
656
  ) -> None:
496
657
  enum_marker = ""
497
658
 
498
- level = self.get_level()
499
- prev_indent = self.prev_indent()
500
- if self.prev_numid() is None: # Open new list
659
+ level = self._get_level()
660
+ prev_indent = self._prev_indent()
661
+ if self._prev_numid() is None: # Open new list
501
662
  self.level_at_new_list = level
502
663
 
503
664
  self.parents[level] = doc.add_group(
@@ -509,15 +670,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
509
670
  if is_numbered:
510
671
  enum_marker = str(self.listIter) + "."
511
672
  is_numbered = True
512
- doc.add_list_item(
513
- marker=enum_marker,
514
- enumerated=is_numbered,
515
- parent=self.parents[level],
516
- text=text,
673
+ new_parent = self._create_or_reuse_parent(
674
+ doc=doc,
675
+ prev_parent=self.parents[level],
676
+ paragraph_elements=elements,
517
677
  )
678
+ for text, format, hyperlink in elements:
679
+ doc.add_list_item(
680
+ marker=enum_marker,
681
+ enumerated=is_numbered,
682
+ parent=new_parent,
683
+ text=text,
684
+ formatting=format,
685
+ hyperlink=hyperlink,
686
+ )
518
687
 
519
688
  elif (
520
- self.prev_numid() == numid
689
+ self._prev_numid() == numid
521
690
  and self.level_at_new_list is not None
522
691
  and prev_indent is not None
523
692
  and prev_indent < ilevel
@@ -545,15 +714,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
545
714
  if is_numbered:
546
715
  enum_marker = str(self.listIter) + "."
547
716
  is_numbered = True
548
- doc.add_list_item(
549
- marker=enum_marker,
550
- enumerated=is_numbered,
551
- parent=self.parents[self.level_at_new_list + ilevel],
552
- text=text,
553
- )
554
717
 
718
+ new_parent = self._create_or_reuse_parent(
719
+ doc=doc,
720
+ prev_parent=self.parents[self.level_at_new_list + ilevel],
721
+ paragraph_elements=elements,
722
+ )
723
+ for text, format, hyperlink in elements:
724
+ doc.add_list_item(
725
+ marker=enum_marker,
726
+ enumerated=is_numbered,
727
+ parent=new_parent,
728
+ text=text,
729
+ formatting=format,
730
+ hyperlink=hyperlink,
731
+ )
555
732
  elif (
556
- self.prev_numid() == numid
733
+ self._prev_numid() == numid
557
734
  and self.level_at_new_list is not None
558
735
  and prev_indent is not None
559
736
  and ilevel < prev_indent
@@ -567,29 +744,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
567
744
  if is_numbered:
568
745
  enum_marker = str(self.listIter) + "."
569
746
  is_numbered = True
570
- doc.add_list_item(
571
- marker=enum_marker,
572
- enumerated=is_numbered,
573
- parent=self.parents[self.level_at_new_list + ilevel],
574
- text=text,
747
+ new_parent = self._create_or_reuse_parent(
748
+ doc=doc,
749
+ prev_parent=self.parents[self.level_at_new_list + ilevel],
750
+ paragraph_elements=elements,
575
751
  )
752
+ for text, format, hyperlink in elements:
753
+ doc.add_list_item(
754
+ marker=enum_marker,
755
+ enumerated=is_numbered,
756
+ parent=new_parent,
757
+ text=text,
758
+ formatting=format,
759
+ hyperlink=hyperlink,
760
+ )
576
761
  self.listIter = 0
577
762
 
578
- elif self.prev_numid() == numid or prev_indent == ilevel:
763
+ elif self._prev_numid() == numid or prev_indent == ilevel:
579
764
  # TODO: Set marker and enumerated arguments if this is an enumeration element.
580
765
  self.listIter += 1
581
766
  if is_numbered:
582
767
  enum_marker = str(self.listIter) + "."
583
768
  is_numbered = True
584
- doc.add_list_item(
585
- marker=enum_marker,
586
- enumerated=is_numbered,
587
- parent=self.parents[level - 1],
588
- text=text,
769
+ new_parent = self._create_or_reuse_parent(
770
+ doc=doc,
771
+ prev_parent=self.parents[level - 1],
772
+ paragraph_elements=elements,
589
773
  )
774
+ for text, format, hyperlink in elements:
775
+ # Add the list item to the parent group
776
+ doc.add_list_item(
777
+ marker=enum_marker,
778
+ enumerated=is_numbered,
779
+ parent=new_parent,
780
+ text=text,
781
+ formatting=format,
782
+ hyperlink=hyperlink,
783
+ )
590
784
  return
591
785
 
592
- def handle_tables(
786
+ def _handle_tables(
593
787
  self,
594
788
  element: BaseOxmlElement,
595
789
  docx_obj: DocxDocument,
@@ -604,7 +798,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
604
798
  cell_element = table.rows[0].cells[0]
605
799
  # In case we have a table of only 1 cell, we consider it furniture
606
800
  # And proceed processing the content of the cell as though it's in the document body
607
- self.walk_linear(cell_element._element, docx_obj, doc)
801
+ self._walk_linear(cell_element._element, docx_obj, doc)
608
802
  return
609
803
 
610
804
  data = TableData(num_rows=num_rows, num_cols=num_cols)
@@ -649,11 +843,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
649
843
  data.table_cells.append(table_cell)
650
844
  col_idx += cell.grid_span
651
845
 
652
- level = self.get_level()
846
+ level = self._get_level()
653
847
  doc.add_table(data=data, parent=self.parents[level - 1])
654
848
  return
655
849
 
656
- def handle_pictures(
850
+ def _handle_pictures(
657
851
  self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
658
852
  ) -> None:
659
853
  def get_docx_image(drawing_blip):
@@ -666,7 +860,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
666
860
  image_data = image_part.blob # Get the binary image data
667
861
  return image_data
668
862
 
669
- level = self.get_level()
863
+ level = self._get_level()
670
864
  # Open the BytesIO object with PIL to create an Image
671
865
  try:
672
866
  image_data = get_docx_image(drawing_blip)
@@ -60,6 +60,44 @@ err_console = Console(stderr=True)
60
60
  ocr_factory_internal = get_ocr_factory(allow_external_plugins=False)
61
61
  ocr_engines_enum_internal = ocr_factory_internal.get_enum()
62
62
 
63
+ DOCLING_ASCII_ART = r"""
64
+ ████ ██████
65
+ ███░░██░░░░░██████
66
+ ████████░░░░░░░░████████████
67
+ ████████░░░░░░░░░░░░░░░░░░████████
68
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░██████
69
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░█████
70
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░█████
71
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
72
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
73
+ ██████░░░░░░░ ░░░░░░░░░░░░░░░░░░░░░░ ░░░░░░░██████
74
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
75
+ ██████░░░░░░ ░░░░░░░░░░░░░░░ ░░░░░░██████
76
+ ███▒██░░░░░ ████ ░░░░░░░░░░░░ ████ ░░░░░██▒███
77
+ ███▒██░░░░░░ ████ ░░░░░░░░░░░░ ████ ░░░░░██▒████
78
+ ███▒██░░░░░░ ██ ██ ░░░░░░░░░░░░ ██ ██ ░░░░░██▒▒███
79
+ ███▒███░░░░░ ██ ░░░░████░░░░ ██ ░░░░░██▒▒███
80
+ ████▒▒██░░░░░░ ░░░███▒▒▒▒███░░░ ░░░░░░░██▒▒████
81
+ ████▒▒██░░░░░░░░░░░░░░░░░█▒▒▒▒▒▒▒▒▒▒█░░░░░░░░░░░░░░░░███▒▒████
82
+ ████▒▒▒██░░░░░░░░░░░░█████ ▒▒▒▒▒▒ ██████░░░░░░░░░░░██▒▒▒████
83
+ ███▒▒▒▒██░░░░░░░░███▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒███░░░░░░░░██▒▒▒▒███
84
+ ███▒▒▒▒▒███░░░░░░██▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒██░░░░░░███▒▒▒▒▒███
85
+ ████▒▒▒▒▒████░░░░░░██████████████████████░░░░░░████▒▒▒▒▒████
86
+ ███▒▒▒▒▒▒▒▒████░░░░░░░░░░░░░░░░░░░░░░░░░░░████▒▒▒▒▒▒▒▒▒███
87
+ ████▒▒▒▒▒▒▒▒███░░░░░████████████████████████▒▒▒▒▒▒▒▒▒████
88
+ ████▒▒▒▒▒▒██░░░░░░█ █░░░░░██▒▒▒▒▒▒████
89
+ ████▒▒▒▒█░░░░░░░█ D O C L I N G █░░░░░░░░██▒▒▒████
90
+ ████▒▒██░░░░░░█ █░░░░░░░░░░█▒▒████
91
+ ██████░░░░░░█ D O C L I N G █░░░░░░░░░░░██████
92
+ ████░░░░░█ █░░░░░░░░░░░░████
93
+ █████░░█ D O C L I N G █░░░░░░░░░░░█████
94
+ █████ █░░░░░░░░████████
95
+ ██ D O C L I N G █░░░░░░░░█████
96
+ █ █░░░████████
97
+ █████████████████████████████
98
+ """
99
+
100
+
63
101
  app = typer.Typer(
64
102
  name="Docling",
65
103
  no_args_is_help=True,
@@ -68,6 +106,12 @@ app = typer.Typer(
68
106
  )
69
107
 
70
108
 
109
+ def logo_callback(value: bool):
110
+ if value:
111
+ print(DOCLING_ASCII_ART)
112
+ raise typer.Exit()
113
+
114
+
71
115
  def version_callback(value: bool):
72
116
  if value:
73
117
  docling_version = importlib.metadata.version("docling")
@@ -356,6 +400,12 @@ def convert(
356
400
  device: Annotated[
357
401
  AcceleratorDevice, typer.Option(..., help="Accelerator device")
358
402
  ] = AcceleratorDevice.AUTO,
403
+ docling_logo: Annotated[
404
+ Optional[bool],
405
+ typer.Option(
406
+ "--logo", callback=logo_callback, is_eager=True, help="Docling logo"
407
+ ),
408
+ ] = None,
359
409
  ):
360
410
  if verbose == 0:
361
411
  logging.basicConfig(level=logging.WARNING)
@@ -247,7 +247,7 @@ class TesseractOcrCliModel(BaseOcrModel):
247
247
 
248
248
  cell = TextCell(
249
249
  index=ix,
250
- text=text,
250
+ text=str(text),
251
251
  orig=text,
252
252
  from_ocr=True,
253
253
  confidence=conf / 100.0,
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.28.4" # DO NOT EDIT, updated automatically
3
+ version = "2.29.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = [
6
6
  "Christoph Auer <cau@zurich.ibm.com>",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes