docling 2.37.0__tar.gz → 2.38.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. {docling-2.37.0 → docling-2.38.0}/PKG-INFO +7 -4
  2. {docling-2.37.0 → docling-2.38.0}/README.md +4 -3
  3. {docling-2.37.0 → docling-2.38.0}/docling/backend/md_backend.py +101 -81
  4. {docling-2.37.0 → docling-2.38.0}/docling/backend/msword_backend.py +71 -62
  5. docling-2.38.0/docling/backend/noop_backend.py +51 -0
  6. {docling-2.37.0 → docling-2.38.0}/docling/cli/main.py +82 -14
  7. docling-2.38.0/docling/datamodel/asr_model_specs.py +92 -0
  8. {docling-2.37.0 → docling-2.38.0}/docling/datamodel/base_models.py +11 -1
  9. {docling-2.37.0 → docling-2.38.0}/docling/datamodel/document.py +3 -1
  10. {docling-2.37.0 → docling-2.38.0}/docling/datamodel/pipeline_options.py +12 -1
  11. docling-2.38.0/docling/datamodel/pipeline_options_asr_model.py +57 -0
  12. {docling-2.37.0 → docling-2.38.0}/docling/datamodel/pipeline_options_vlm_model.py +2 -3
  13. {docling-2.37.0 → docling-2.38.0}/docling/document_converter.py +8 -0
  14. {docling-2.37.0 → docling-2.38.0}/docling/models/api_vlm_model.py +3 -1
  15. {docling-2.37.0 → docling-2.38.0}/docling/models/base_model.py +1 -1
  16. {docling-2.37.0 → docling-2.38.0}/docling/models/readingorder_model.py +1 -1
  17. {docling-2.37.0 → docling-2.38.0}/docling/models/vlm_models_inline/hf_transformers_model.py +3 -1
  18. {docling-2.37.0 → docling-2.38.0}/docling/models/vlm_models_inline/mlx_model.py +3 -1
  19. docling-2.38.0/docling/pipeline/asr_pipeline.py +253 -0
  20. {docling-2.37.0 → docling-2.38.0}/docling/pipeline/base_pipeline.py +11 -0
  21. {docling-2.37.0 → docling-2.38.0}/docling.egg-info/PKG-INFO +7 -4
  22. {docling-2.37.0 → docling-2.38.0}/docling.egg-info/SOURCES.txt +5 -0
  23. {docling-2.37.0 → docling-2.38.0}/docling.egg-info/requires.txt +3 -0
  24. {docling-2.37.0 → docling-2.38.0}/pyproject.toml +7 -1
  25. docling-2.38.0/tests/test_asr_pipeline.py +59 -0
  26. docling-2.38.0/tests/test_backend_markdown.py +52 -0
  27. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_msword.py +44 -4
  28. {docling-2.37.0 → docling-2.38.0}/tests/test_code_formula.py +22 -0
  29. docling-2.37.0/tests/test_backend_markdown.py +0 -41
  30. {docling-2.37.0 → docling-2.38.0}/LICENSE +0 -0
  31. {docling-2.37.0 → docling-2.38.0}/docling/__init__.py +0 -0
  32. {docling-2.37.0 → docling-2.38.0}/docling/backend/__init__.py +0 -0
  33. {docling-2.37.0 → docling-2.38.0}/docling/backend/abstract_backend.py +0 -0
  34. {docling-2.37.0 → docling-2.38.0}/docling/backend/asciidoc_backend.py +0 -0
  35. {docling-2.37.0 → docling-2.38.0}/docling/backend/csv_backend.py +0 -0
  36. {docling-2.37.0 → docling-2.38.0}/docling/backend/docling_parse_backend.py +0 -0
  37. {docling-2.37.0 → docling-2.38.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  38. {docling-2.37.0 → docling-2.38.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  39. {docling-2.37.0 → docling-2.38.0}/docling/backend/docx/__init__.py +0 -0
  40. {docling-2.37.0 → docling-2.38.0}/docling/backend/docx/latex/__init__.py +0 -0
  41. {docling-2.37.0 → docling-2.38.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  42. {docling-2.37.0 → docling-2.38.0}/docling/backend/docx/latex/omml.py +0 -0
  43. {docling-2.37.0 → docling-2.38.0}/docling/backend/html_backend.py +0 -0
  44. {docling-2.37.0 → docling-2.38.0}/docling/backend/json/__init__.py +0 -0
  45. {docling-2.37.0 → docling-2.38.0}/docling/backend/json/docling_json_backend.py +0 -0
  46. {docling-2.37.0 → docling-2.38.0}/docling/backend/msexcel_backend.py +0 -0
  47. {docling-2.37.0 → docling-2.38.0}/docling/backend/mspowerpoint_backend.py +0 -0
  48. {docling-2.37.0 → docling-2.38.0}/docling/backend/pdf_backend.py +0 -0
  49. {docling-2.37.0 → docling-2.38.0}/docling/backend/pypdfium2_backend.py +0 -0
  50. {docling-2.37.0 → docling-2.38.0}/docling/backend/xml/__init__.py +0 -0
  51. {docling-2.37.0 → docling-2.38.0}/docling/backend/xml/jats_backend.py +0 -0
  52. {docling-2.37.0 → docling-2.38.0}/docling/backend/xml/uspto_backend.py +0 -0
  53. {docling-2.37.0 → docling-2.38.0}/docling/chunking/__init__.py +0 -0
  54. {docling-2.37.0 → docling-2.38.0}/docling/cli/__init__.py +0 -0
  55. {docling-2.37.0 → docling-2.38.0}/docling/cli/models.py +0 -0
  56. {docling-2.37.0 → docling-2.38.0}/docling/cli/tools.py +0 -0
  57. {docling-2.37.0 → docling-2.38.0}/docling/datamodel/__init__.py +0 -0
  58. {docling-2.37.0 → docling-2.38.0}/docling/datamodel/accelerator_options.py +0 -0
  59. {docling-2.37.0 → docling-2.38.0}/docling/datamodel/settings.py +0 -0
  60. {docling-2.37.0 → docling-2.38.0}/docling/datamodel/vlm_model_specs.py +0 -0
  61. {docling-2.37.0 → docling-2.38.0}/docling/exceptions.py +0 -0
  62. {docling-2.37.0 → docling-2.38.0}/docling/models/__init__.py +0 -0
  63. {docling-2.37.0 → docling-2.38.0}/docling/models/base_ocr_model.py +0 -0
  64. {docling-2.37.0 → docling-2.38.0}/docling/models/code_formula_model.py +0 -0
  65. {docling-2.37.0 → docling-2.38.0}/docling/models/document_picture_classifier.py +0 -0
  66. {docling-2.37.0 → docling-2.38.0}/docling/models/easyocr_model.py +0 -0
  67. {docling-2.37.0 → docling-2.38.0}/docling/models/factories/__init__.py +0 -0
  68. {docling-2.37.0 → docling-2.38.0}/docling/models/factories/base_factory.py +0 -0
  69. {docling-2.37.0 → docling-2.38.0}/docling/models/factories/ocr_factory.py +0 -0
  70. {docling-2.37.0 → docling-2.38.0}/docling/models/factories/picture_description_factory.py +0 -0
  71. {docling-2.37.0 → docling-2.38.0}/docling/models/layout_model.py +0 -0
  72. {docling-2.37.0 → docling-2.38.0}/docling/models/ocr_mac_model.py +0 -0
  73. {docling-2.37.0 → docling-2.38.0}/docling/models/page_assemble_model.py +0 -0
  74. {docling-2.37.0 → docling-2.38.0}/docling/models/page_preprocessing_model.py +0 -0
  75. {docling-2.37.0 → docling-2.38.0}/docling/models/picture_description_api_model.py +0 -0
  76. {docling-2.37.0 → docling-2.38.0}/docling/models/picture_description_base_model.py +0 -0
  77. {docling-2.37.0 → docling-2.38.0}/docling/models/picture_description_vlm_model.py +0 -0
  78. {docling-2.37.0 → docling-2.38.0}/docling/models/plugins/__init__.py +0 -0
  79. {docling-2.37.0 → docling-2.38.0}/docling/models/plugins/defaults.py +0 -0
  80. {docling-2.37.0 → docling-2.38.0}/docling/models/rapid_ocr_model.py +0 -0
  81. {docling-2.37.0 → docling-2.38.0}/docling/models/table_structure_model.py +0 -0
  82. {docling-2.37.0 → docling-2.38.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  83. {docling-2.37.0 → docling-2.38.0}/docling/models/tesseract_ocr_model.py +0 -0
  84. {docling-2.37.0 → docling-2.38.0}/docling/models/utils/__init__.py +0 -0
  85. {docling-2.37.0 → docling-2.38.0}/docling/models/utils/hf_model_download.py +0 -0
  86. {docling-2.37.0 → docling-2.38.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  87. {docling-2.37.0 → docling-2.38.0}/docling/pipeline/__init__.py +0 -0
  88. {docling-2.37.0 → docling-2.38.0}/docling/pipeline/simple_pipeline.py +0 -0
  89. {docling-2.37.0 → docling-2.38.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  90. {docling-2.37.0 → docling-2.38.0}/docling/pipeline/vlm_pipeline.py +0 -0
  91. {docling-2.37.0 → docling-2.38.0}/docling/py.typed +0 -0
  92. {docling-2.37.0 → docling-2.38.0}/docling/utils/__init__.py +0 -0
  93. {docling-2.37.0 → docling-2.38.0}/docling/utils/accelerator_utils.py +0 -0
  94. {docling-2.37.0 → docling-2.38.0}/docling/utils/api_image_request.py +0 -0
  95. {docling-2.37.0 → docling-2.38.0}/docling/utils/export.py +0 -0
  96. {docling-2.37.0 → docling-2.38.0}/docling/utils/glm_utils.py +0 -0
  97. {docling-2.37.0 → docling-2.38.0}/docling/utils/layout_postprocessor.py +0 -0
  98. {docling-2.37.0 → docling-2.38.0}/docling/utils/locks.py +0 -0
  99. {docling-2.37.0 → docling-2.38.0}/docling/utils/model_downloader.py +0 -0
  100. {docling-2.37.0 → docling-2.38.0}/docling/utils/ocr_utils.py +0 -0
  101. {docling-2.37.0 → docling-2.38.0}/docling/utils/orientation.py +0 -0
  102. {docling-2.37.0 → docling-2.38.0}/docling/utils/profiling.py +0 -0
  103. {docling-2.37.0 → docling-2.38.0}/docling/utils/utils.py +0 -0
  104. {docling-2.37.0 → docling-2.38.0}/docling/utils/visualization.py +0 -0
  105. {docling-2.37.0 → docling-2.38.0}/docling.egg-info/dependency_links.txt +0 -0
  106. {docling-2.37.0 → docling-2.38.0}/docling.egg-info/entry_points.txt +0 -0
  107. {docling-2.37.0 → docling-2.38.0}/docling.egg-info/top_level.txt +0 -0
  108. {docling-2.37.0 → docling-2.38.0}/setup.cfg +0 -0
  109. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_asciidoc.py +0 -0
  110. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_csv.py +0 -0
  111. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_docling_json.py +0 -0
  112. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_docling_parse.py +0 -0
  113. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_docling_parse_v2.py +0 -0
  114. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_docling_parse_v4.py +0 -0
  115. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_html.py +0 -0
  116. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_jats.py +0 -0
  117. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_msexcel.py +0 -0
  118. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_patent_uspto.py +0 -0
  119. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_pdfium.py +0 -0
  120. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_pptx.py +0 -0
  121. {docling-2.37.0 → docling-2.38.0}/tests/test_backend_webp.py +0 -0
  122. {docling-2.37.0 → docling-2.38.0}/tests/test_cli.py +0 -0
  123. {docling-2.37.0 → docling-2.38.0}/tests/test_data_gen_flag.py +0 -0
  124. {docling-2.37.0 → docling-2.38.0}/tests/test_document_picture_classifier.py +0 -0
  125. {docling-2.37.0 → docling-2.38.0}/tests/test_e2e_conversion.py +0 -0
  126. {docling-2.37.0 → docling-2.38.0}/tests/test_e2e_ocr_conversion.py +0 -0
  127. {docling-2.37.0 → docling-2.38.0}/tests/test_input_doc.py +0 -0
  128. {docling-2.37.0 → docling-2.38.0}/tests/test_interfaces.py +0 -0
  129. {docling-2.37.0 → docling-2.38.0}/tests/test_invalid_input.py +0 -0
  130. {docling-2.37.0 → docling-2.38.0}/tests/test_legacy_format_transform.py +0 -0
  131. {docling-2.37.0 → docling-2.38.0}/tests/test_options.py +0 -0
  132. {docling-2.37.0 → docling-2.38.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.37.0
3
+ Version: 2.38.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -61,6 +61,8 @@ Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "d
61
61
  Provides-Extra: rapidocr
62
62
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
63
63
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
64
+ Provides-Extra: asr
65
+ Requires-Dist: openai-whisper>=20240930; extra == "asr"
64
66
  Dynamic: license-file
65
67
 
66
68
  <p align="center">
@@ -93,14 +95,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
93
95
 
94
96
  ## Features
95
97
 
96
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
98
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
97
99
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
98
100
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
99
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
101
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
100
102
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
101
103
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
102
104
  * 🔍 Extensive OCR support for scanned PDFs and images
103
- * 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
105
+ * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
106
+ * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
104
107
  * 💻 Simple and convenient CLI
105
108
 
106
109
  ### Coming soon
@@ -28,14 +28,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
28
28
 
29
29
  ## Features
30
30
 
31
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
31
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
32
32
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
33
33
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
34
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
34
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
35
35
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
36
36
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
37
37
  * 🔍 Extensive OCR support for scanned PDFs and images
38
- * 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
38
+ * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
39
+ * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
39
40
  * 💻 Simple and convenient CLI
40
41
 
41
42
  ### Coming soon
@@ -1,17 +1,15 @@
1
1
  import logging
2
2
  import re
3
3
  import warnings
4
+ from copy import deepcopy
4
5
  from io import BytesIO
5
6
  from pathlib import Path
6
7
  from typing import List, Optional, Set, Union
7
8
 
8
9
  import marko
9
10
  import marko.element
10
- import marko.ext
11
- import marko.ext.gfm
12
11
  import marko.inline
13
12
  from docling_core.types.doc import (
14
- DocItem,
15
13
  DocItemLabel,
16
14
  DoclingDocument,
17
15
  DocumentOrigin,
@@ -21,7 +19,9 @@ from docling_core.types.doc import (
21
19
  TableData,
22
20
  TextItem,
23
21
  )
22
+ from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
24
23
  from marko import Markdown
24
+ from pydantic import AnyUrl, TypeAdapter
25
25
 
26
26
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
27
  from docling.backend.html_backend import HTMLDocumentBackend
@@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
71
71
 
72
72
  self.in_table = False
73
73
  self.md_table_buffer: list[str] = []
74
- self.inline_texts: list[str] = []
75
74
  self._html_blocks: int = 0
76
75
 
77
76
  try:
@@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
156
155
  doc.add_table(data=table_data)
157
156
  return
158
157
 
159
- def _process_inline_text(
160
- self, parent_item: Optional[NodeItem], doc: DoclingDocument
161
- ):
162
- txt = " ".join(self.inline_texts)
163
- if len(txt) > 0:
164
- doc.add_text(
165
- label=DocItemLabel.PARAGRAPH,
166
- parent=parent_item,
167
- text=txt,
168
- )
169
- self.inline_texts = []
170
-
171
158
  def _iterate_elements( # noqa: C901
172
159
  self,
160
+ *,
173
161
  element: marko.element.Element,
174
162
  depth: int,
175
163
  doc: DoclingDocument,
176
164
  visited: Set[marko.element.Element],
177
165
  parent_item: Optional[NodeItem] = None,
166
+ formatting: Optional[Formatting] = None,
167
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
178
168
  ):
179
169
  if element in visited:
180
170
  return
@@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
183
173
  # Check for different element types and process relevant details
184
174
  if isinstance(element, marko.block.Heading) and len(element.children) > 0:
185
175
  self._close_table(doc)
186
- self._process_inline_text(parent_item, doc)
187
176
  _log.debug(
188
177
  f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
189
178
  )
179
+
180
+ if len(element.children) == 1:
181
+ child = element.children[0]
182
+ snippet_text = str(child.children) # type: ignore
183
+ visited.add(child)
184
+ else:
185
+ snippet_text = "" # inline group will be created
186
+
190
187
  if element.level == 1:
191
- doc_label = DocItemLabel.TITLE
188
+ parent_item = doc.add_title(
189
+ text=snippet_text,
190
+ parent=parent_item,
191
+ formatting=formatting,
192
+ hyperlink=hyperlink,
193
+ )
192
194
  else:
193
- doc_label = DocItemLabel.SECTION_HEADER
194
-
195
- # Header could have arbitrary inclusion of bold, italic or emphasis,
196
- # hence we need to traverse the tree to get full text of a header
197
- strings: List[str] = []
198
-
199
- # Define a recursive function to traverse the tree
200
- def traverse(node: marko.block.BlockElement):
201
- # Check if the node has a "children" attribute
202
- if hasattr(node, "children"):
203
- # If "children" is a list, continue traversal
204
- if isinstance(node.children, list):
205
- for child in node.children:
206
- traverse(child)
207
- # If "children" is text, add it to header text
208
- elif isinstance(node.children, str):
209
- strings.append(node.children)
210
-
211
- traverse(element)
212
- snippet_text = "".join(strings)
213
- if len(snippet_text) > 0:
214
- if doc_label == DocItemLabel.SECTION_HEADER:
215
- parent_item = doc.add_heading(
216
- text=snippet_text,
217
- level=element.level - 1,
218
- parent=parent_item,
219
- )
220
- else:
221
- parent_item = doc.add_text(
222
- label=doc_label, parent=parent_item, text=snippet_text
223
- )
195
+ parent_item = doc.add_heading(
196
+ text=snippet_text,
197
+ level=element.level - 1,
198
+ parent=parent_item,
199
+ formatting=formatting,
200
+ hyperlink=hyperlink,
201
+ )
224
202
 
225
203
  elif isinstance(element, marko.block.List):
226
204
  has_non_empty_list_items = False
@@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
230
208
  break
231
209
 
232
210
  self._close_table(doc)
233
- self._process_inline_text(parent_item, doc)
234
211
  _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
235
212
  if has_non_empty_list_items:
236
213
  label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
240
217
 
241
218
  elif (
242
219
  isinstance(element, marko.block.ListItem)
243
- and len(element.children) > 0
244
- and isinstance((first_child := element.children[0]), marko.block.Paragraph)
220
+ and len(element.children) == 1
221
+ and isinstance((child := element.children[0]), marko.block.Paragraph)
222
+ and len(child.children) > 0
245
223
  ):
246
224
  self._close_table(doc)
247
- self._process_inline_text(parent_item, doc)
248
225
  _log.debug(" - List item")
249
226
 
250
- snippet_text = str(first_child.children[0].children) # type: ignore
251
- is_numbered = False
252
- if (
253
- parent_item is not None
254
- and isinstance(parent_item, DocItem)
255
- and parent_item.label == GroupLabel.ORDERED_LIST
256
- ):
257
- is_numbered = True
258
- doc.add_list_item(
259
- enumerated=is_numbered, parent=parent_item, text=snippet_text
227
+ if len(child.children) == 1:
228
+ snippet_text = str(child.children[0].children) # type: ignore
229
+ visited.add(child)
230
+ else:
231
+ snippet_text = "" # inline group will be created
232
+ is_numbered = isinstance(parent_item, OrderedList)
233
+ if not isinstance(parent_item, (OrderedList, UnorderedList)):
234
+ _log.warning("ListItem would have not had a list parent, adding one.")
235
+ parent_item = doc.add_unordered_list(parent=parent_item)
236
+ parent_item = doc.add_list_item(
237
+ enumerated=is_numbered,
238
+ parent=parent_item,
239
+ text=snippet_text,
240
+ formatting=formatting,
241
+ hyperlink=hyperlink,
260
242
  )
261
- visited.add(first_child)
262
243
 
263
244
  elif isinstance(element, marko.inline.Image):
264
245
  self._close_table(doc)
265
- self._process_inline_text(parent_item, doc)
266
246
  _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
267
247
 
268
248
  fig_caption: Optional[TextItem] = None
269
249
  if element.title is not None and element.title != "":
270
250
  fig_caption = doc.add_text(
271
- label=DocItemLabel.CAPTION, text=element.title
251
+ label=DocItemLabel.CAPTION,
252
+ text=element.title,
253
+ formatting=formatting,
254
+ hyperlink=hyperlink,
272
255
  )
273
256
 
274
257
  doc.add_picture(parent=parent_item, caption=fig_caption)
275
258
 
276
- elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
277
- self._process_inline_text(parent_item, doc)
259
+ elif isinstance(element, marko.inline.Emphasis):
260
+ _log.debug(f" - Emphasis: {element.children}")
261
+ formatting = deepcopy(formatting) if formatting else Formatting()
262
+ formatting.italic = True
263
+
264
+ elif isinstance(element, marko.inline.StrongEmphasis):
265
+ _log.debug(f" - StrongEmphasis: {element.children}")
266
+ formatting = deepcopy(formatting) if formatting else Formatting()
267
+ formatting.bold = True
268
+
269
+ elif isinstance(element, marko.inline.Link):
270
+ _log.debug(f" - Link: {element.children}")
271
+ hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
272
+ element.dest
273
+ )
278
274
 
279
275
  elif isinstance(element, marko.inline.RawText):
280
276
  _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
287
283
  self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
288
284
  else:
289
285
  self.md_table_buffer.append(snippet_text)
290
- else:
286
+ elif snippet_text:
291
287
  self._close_table(doc)
292
- # most likely just inline text
293
- self.inline_texts.append(str(element.children))
288
+ doc.add_text(
289
+ label=DocItemLabel.TEXT,
290
+ parent=parent_item,
291
+ text=snippet_text,
292
+ formatting=formatting,
293
+ hyperlink=hyperlink,
294
+ )
294
295
 
295
296
  elif isinstance(element, marko.inline.CodeSpan):
296
297
  self._close_table(doc)
297
- self._process_inline_text(parent_item, doc)
298
298
  _log.debug(f" - Code Span: {element.children}")
299
299
  snippet_text = str(element.children).strip()
300
- doc.add_code(parent=parent_item, text=snippet_text)
300
+ doc.add_code(
301
+ parent=parent_item,
302
+ text=snippet_text,
303
+ formatting=formatting,
304
+ hyperlink=hyperlink,
305
+ )
301
306
 
302
307
  elif (
303
308
  isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
304
309
  and len(element.children) > 0
305
- and isinstance((first_child := element.children[0]), marko.inline.RawText)
306
- and len(snippet_text := (first_child.children.strip())) > 0
310
+ and isinstance((child := element.children[0]), marko.inline.RawText)
311
+ and len(snippet_text := (child.children.strip())) > 0
307
312
  ):
308
313
  self._close_table(doc)
309
- self._process_inline_text(parent_item, doc)
310
314
  _log.debug(f" - Code Block: {element.children}")
311
- doc.add_code(parent=parent_item, text=snippet_text)
315
+ doc.add_code(
316
+ parent=parent_item,
317
+ text=snippet_text,
318
+ formatting=formatting,
319
+ hyperlink=hyperlink,
320
+ )
312
321
 
313
322
  elif isinstance(element, marko.inline.LineBreak):
314
323
  if self.in_table:
@@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
317
326
 
318
327
  elif isinstance(element, marko.block.HTMLBlock):
319
328
  self._html_blocks += 1
320
- self._process_inline_text(parent_item, doc)
321
329
  self._close_table(doc)
322
330
  _log.debug(f"HTML Block: {element}")
323
331
  if (
@@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
327
335
 
328
336
  # wrap in markers to enable post-processing in convert()
329
337
  text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
330
- doc.add_code(parent=parent_item, text=text_to_add)
338
+ doc.add_code(
339
+ parent=parent_item,
340
+ text=text_to_add,
341
+ formatting=formatting,
342
+ hyperlink=hyperlink,
343
+ )
331
344
  else:
332
345
  if not isinstance(element, str):
333
346
  self._close_table(doc)
334
347
  _log.debug(f"Some other element: {element}")
335
348
 
349
+ if (
350
+ isinstance(element, (marko.block.Paragraph, marko.block.Heading))
351
+ and len(element.children) > 1
352
+ ):
353
+ parent_item = doc.add_inline_group(parent=parent_item)
354
+
336
355
  processed_block_types = (
337
- marko.block.Heading,
356
+ # marko.block.Heading,
338
357
  marko.block.CodeBlock,
339
358
  marko.block.FencedCode,
340
359
  marko.inline.RawText,
@@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
351
370
  doc=doc,
352
371
  visited=visited,
353
372
  parent_item=parent_item,
373
+ formatting=formatting,
374
+ hyperlink=hyperlink,
354
375
  )
355
376
 
356
377
  def is_valid(self) -> bool:
@@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
392
413
  parent_item=None,
393
414
  visited=set(),
394
415
  )
395
- self._process_inline_text(None, doc) # handle last hanging inline text
396
416
  self._close_table(doc=doc) # handle any last hanging table
397
417
 
398
418
  # if HTML blocks were detected, export to HTML and delegate to HTML backend
@@ -14,7 +14,7 @@ from docling_core.types.doc import (
14
14
  TableCell,
15
15
  TableData,
16
16
  )
17
- from docling_core.types.doc.document import Formatting
17
+ from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
18
18
  from docx import Document
19
19
  from docx.document import Document as DocxDocument
20
20
  from docx.oxml.table import CT_Tc
@@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
84
84
  self.valid = True
85
85
  except Exception as e:
86
86
  raise RuntimeError(
87
- f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
87
+ f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
88
88
  ) from e
89
89
 
90
90
  @override
@@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
251
251
  self._handle_tables(element, docx_obj, doc)
252
252
  except Exception:
253
253
  _log.debug("could not parse a table, broken docx table")
254
-
254
+ # Check for Image
255
255
  elif drawing_blip:
256
256
  self._handle_pictures(docx_obj, drawing_blip, doc)
257
+ # Check for Text after the Image
258
+ if (
259
+ tag_name in ["p"]
260
+ and element.find(".//w:t", namespaces=namespaces) is not None
261
+ ):
262
+ self._handle_text_elements(element, docx_obj, doc)
257
263
  # Check for the sdt containers, like table of contents
258
264
  elif tag_name in ["sdt"]:
259
265
  sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
268
274
  self._handle_text_elements(element, docx_obj, doc)
269
275
  else:
270
276
  _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
277
+
271
278
  return doc
272
279
 
273
280
  def _str_to_int(
@@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
578
585
  all_paragraphs = []
579
586
 
580
587
  # Sort paragraphs within each container, then process containers
581
- for container_id, paragraphs in container_paragraphs.items():
588
+ for paragraphs in container_paragraphs.values():
582
589
  # Sort by vertical position within each container
583
590
  sorted_container_paragraphs = sorted(
584
591
  paragraphs,
@@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
689
696
  doc: DoclingDocument,
690
697
  ) -> None:
691
698
  paragraph = Paragraph(element, docx_obj)
692
-
699
+ paragraph_elements = self._get_paragraph_elements(paragraph)
693
700
  text, equations = self._handle_equations_in_text(
694
701
  element=element, text=paragraph.text
695
702
  )
696
703
 
697
704
  if text is None:
698
705
  return
699
- paragraph_elements = self._get_paragraph_elements(paragraph)
700
706
  text = text.strip()
701
707
 
702
708
  # Common styles for bullet and numbered lists.
@@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
912
918
  )
913
919
  return
914
920
 
921
+ def _add_formatted_list_item(
922
+ self,
923
+ doc: DoclingDocument,
924
+ elements: list,
925
+ marker: str,
926
+ enumerated: bool,
927
+ level: int,
928
+ ) -> None:
929
+ # This should not happen by construction
930
+ if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
931
+ return
932
+ if len(elements) == 1:
933
+ text, format, hyperlink = elements[0]
934
+ doc.add_list_item(
935
+ marker=marker,
936
+ enumerated=enumerated,
937
+ parent=self.parents[level],
938
+ text=text,
939
+ formatting=format,
940
+ hyperlink=hyperlink,
941
+ )
942
+ else:
943
+ new_item = doc.add_list_item(
944
+ marker=marker,
945
+ enumerated=enumerated,
946
+ parent=self.parents[level],
947
+ text="",
948
+ )
949
+ new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
950
+ for text, format, hyperlink in elements:
951
+ doc.add_text(
952
+ label=DocItemLabel.TEXT,
953
+ parent=new_parent,
954
+ text=text,
955
+ formatting=format,
956
+ hyperlink=hyperlink,
957
+ )
958
+
915
959
  def _add_list_item(
916
960
  self,
917
961
  *,
@@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
921
965
  elements: list,
922
966
  is_numbered: bool = False,
923
967
  ) -> None:
968
+ # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
969
+ if not elements:
970
+ return None
924
971
  enum_marker = ""
925
972
 
926
973
  level = self._get_level()
@@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
937
984
  if is_numbered:
938
985
  enum_marker = str(self.listIter) + "."
939
986
  is_numbered = True
940
- new_parent = self._create_or_reuse_parent(
941
- doc=doc,
942
- prev_parent=self.parents[level],
943
- paragraph_elements=elements,
987
+ self._add_formatted_list_item(
988
+ doc, elements, enum_marker, is_numbered, level
944
989
  )
945
- for text, format, hyperlink in elements:
946
- doc.add_list_item(
947
- marker=enum_marker,
948
- enumerated=is_numbered,
949
- parent=new_parent,
950
- text=text,
951
- formatting=format,
952
- hyperlink=hyperlink,
953
- )
954
-
955
990
  elif (
956
991
  self._prev_numid() == numid
957
992
  and self.level_at_new_list is not None
@@ -981,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
981
1016
  if is_numbered:
982
1017
  enum_marker = str(self.listIter) + "."
983
1018
  is_numbered = True
984
-
985
- new_parent = self._create_or_reuse_parent(
986
- doc=doc,
987
- prev_parent=self.parents[self.level_at_new_list + ilevel],
988
- paragraph_elements=elements,
1019
+ self._add_formatted_list_item(
1020
+ doc,
1021
+ elements,
1022
+ enum_marker,
1023
+ is_numbered,
1024
+ self.level_at_new_list + ilevel,
989
1025
  )
990
- for text, format, hyperlink in elements:
991
- doc.add_list_item(
992
- marker=enum_marker,
993
- enumerated=is_numbered,
994
- parent=new_parent,
995
- text=text,
996
- formatting=format,
997
- hyperlink=hyperlink,
998
- )
999
1026
  elif (
1000
1027
  self._prev_numid() == numid
1001
1028
  and self.level_at_new_list is not None
1002
1029
  and prev_indent is not None
1003
1030
  and ilevel < prev_indent
1004
1031
  ): # Close list
1005
- for k, v in self.parents.items():
1032
+ for k in self.parents:
1006
1033
  if k > self.level_at_new_list + ilevel:
1007
1034
  self.parents[k] = None
1008
1035
 
@@ -1011,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1011
1038
  if is_numbered:
1012
1039
  enum_marker = str(self.listIter) + "."
1013
1040
  is_numbered = True
1014
- new_parent = self._create_or_reuse_parent(
1015
- doc=doc,
1016
- prev_parent=self.parents[self.level_at_new_list + ilevel],
1017
- paragraph_elements=elements,
1041
+ self._add_formatted_list_item(
1042
+ doc,
1043
+ elements,
1044
+ enum_marker,
1045
+ is_numbered,
1046
+ self.level_at_new_list + ilevel,
1018
1047
  )
1019
- for text, format, hyperlink in elements:
1020
- doc.add_list_item(
1021
- marker=enum_marker,
1022
- enumerated=is_numbered,
1023
- parent=new_parent,
1024
- text=text,
1025
- formatting=format,
1026
- hyperlink=hyperlink,
1027
- )
1028
1048
  self.listIter = 0
1029
1049
 
1030
1050
  elif self._prev_numid() == numid or prev_indent == ilevel:
@@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1033
1053
  if is_numbered:
1034
1054
  enum_marker = str(self.listIter) + "."
1035
1055
  is_numbered = True
1036
- new_parent = self._create_or_reuse_parent(
1037
- doc=doc,
1038
- prev_parent=self.parents[level - 1],
1039
- paragraph_elements=elements,
1056
+ self._add_formatted_list_item(
1057
+ doc, elements, enum_marker, is_numbered, level - 1
1040
1058
  )
1041
- for text, format, hyperlink in elements:
1042
- # Add the list item to the parent group
1043
- doc.add_list_item(
1044
- marker=enum_marker,
1045
- enumerated=is_numbered,
1046
- parent=new_parent,
1047
- text=text,
1048
- formatting=format,
1049
- hyperlink=hyperlink,
1050
- )
1059
+
1051
1060
  return
1052
1061
 
1053
1062
  def _handle_tables(
@@ -0,0 +1,51 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Set, Union
5
+
6
+ from docling.backend.abstract_backend import AbstractDocumentBackend
7
+ from docling.datamodel.base_models import InputFormat
8
+ from docling.datamodel.document import InputDocument
9
+
10
+ _log = logging.getLogger(__name__)
11
+
12
+
13
class NoOpBackend(AbstractDocumentBackend):
    """
    A no-op backend that only validates input existence.

    Used e.g. for audio files where actual processing is handled by the ASR pipeline.

    The backend never parses its input. At construction time it records in
    ``self.valid`` whether the input looks usable:

    * a ``BytesIO`` stream is valid when it holds at least one byte;
    * a ``Path`` is valid when it exists on disk;
    * any other input type, or any exception raised while checking, marks
      the input as invalid.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Store the input via the base class and validate that it exists.

        Args:
            in_doc: The input document this backend is attached to.
            path_or_stream: Either an in-memory stream or a filesystem path
                pointing at the input.
        """
        super().__init__(in_doc, path_or_stream)

        _log.debug("NoOpBackend initialized for: %s", path_or_stream)

        # Validate input
        self.valid: bool
        try:
            if isinstance(self.path_or_stream, BytesIO):
                # Check if stream has content. getbuffer() exposes the data
                # without copying it (getvalue() would duplicate the whole
                # buffer); the view is released right away so the stream
                # can still be resized or closed afterwards.
                with self.path_or_stream.getbuffer() as view:
                    stream_length = view.nbytes
                self.valid = stream_length > 0
                _log.debug("BytesIO stream length: %d", stream_length)
            elif isinstance(self.path_or_stream, Path):
                # Check if file exists
                self.valid = self.path_or_stream.exists()
                _log.debug("File exists: %s", self.valid)
            else:
                self.valid = False
        except Exception as e:
            # Deliberately best-effort: a failing check (e.g. a closed stream
            # or an OS error while probing the path) marks the input as
            # invalid instead of aborting construction.
            _log.error("NoOpBackend validation failed: %s", e)
            self.valid = False

    def is_valid(self) -> bool:
        """Return the validation result computed in ``__init__``."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return ``False``: this backend does not support pagination."""
        return False

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of all ``InputFormat`` members."""
        return set(InputFormat)