docling 2.37.0__tar.gz → 2.38.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. {docling-2.37.0 → docling-2.38.1}/PKG-INFO +7 -4
  2. {docling-2.37.0 → docling-2.38.1}/README.md +4 -3
  3. {docling-2.37.0 → docling-2.38.1}/docling/backend/md_backend.py +185 -80
  4. {docling-2.37.0 → docling-2.38.1}/docling/backend/msword_backend.py +76 -63
  5. docling-2.38.1/docling/backend/noop_backend.py +51 -0
  6. {docling-2.37.0 → docling-2.38.1}/docling/cli/main.py +82 -14
  7. docling-2.38.1/docling/datamodel/asr_model_specs.py +92 -0
  8. {docling-2.37.0 → docling-2.38.1}/docling/datamodel/base_models.py +12 -2
  9. {docling-2.37.0 → docling-2.38.1}/docling/datamodel/document.py +3 -1
  10. {docling-2.37.0 → docling-2.38.1}/docling/datamodel/pipeline_options.py +13 -2
  11. docling-2.38.1/docling/datamodel/pipeline_options_asr_model.py +57 -0
  12. {docling-2.37.0 → docling-2.38.1}/docling/datamodel/pipeline_options_vlm_model.py +2 -3
  13. {docling-2.37.0 → docling-2.38.1}/docling/document_converter.py +8 -0
  14. {docling-2.37.0 → docling-2.38.1}/docling/models/api_vlm_model.py +3 -1
  15. {docling-2.37.0 → docling-2.38.1}/docling/models/base_model.py +1 -1
  16. {docling-2.37.0 → docling-2.38.1}/docling/models/readingorder_model.py +1 -1
  17. {docling-2.37.0 → docling-2.38.1}/docling/models/vlm_models_inline/hf_transformers_model.py +3 -1
  18. {docling-2.37.0 → docling-2.38.1}/docling/models/vlm_models_inline/mlx_model.py +3 -1
  19. docling-2.38.1/docling/pipeline/asr_pipeline.py +253 -0
  20. {docling-2.37.0 → docling-2.38.1}/docling/pipeline/base_pipeline.py +11 -0
  21. {docling-2.37.0 → docling-2.38.1}/docling.egg-info/PKG-INFO +7 -4
  22. {docling-2.37.0 → docling-2.38.1}/docling.egg-info/SOURCES.txt +5 -0
  23. {docling-2.37.0 → docling-2.38.1}/docling.egg-info/requires.txt +3 -0
  24. {docling-2.37.0 → docling-2.38.1}/pyproject.toml +7 -1
  25. docling-2.38.1/tests/test_asr_pipeline.py +59 -0
  26. docling-2.38.1/tests/test_backend_markdown.py +52 -0
  27. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_msword.py +44 -4
  28. {docling-2.37.0 → docling-2.38.1}/tests/test_code_formula.py +22 -0
  29. docling-2.37.0/tests/test_backend_markdown.py +0 -41
  30. {docling-2.37.0 → docling-2.38.1}/LICENSE +0 -0
  31. {docling-2.37.0 → docling-2.38.1}/docling/__init__.py +0 -0
  32. {docling-2.37.0 → docling-2.38.1}/docling/backend/__init__.py +0 -0
  33. {docling-2.37.0 → docling-2.38.1}/docling/backend/abstract_backend.py +0 -0
  34. {docling-2.37.0 → docling-2.38.1}/docling/backend/asciidoc_backend.py +0 -0
  35. {docling-2.37.0 → docling-2.38.1}/docling/backend/csv_backend.py +0 -0
  36. {docling-2.37.0 → docling-2.38.1}/docling/backend/docling_parse_backend.py +0 -0
  37. {docling-2.37.0 → docling-2.38.1}/docling/backend/docling_parse_v2_backend.py +0 -0
  38. {docling-2.37.0 → docling-2.38.1}/docling/backend/docling_parse_v4_backend.py +0 -0
  39. {docling-2.37.0 → docling-2.38.1}/docling/backend/docx/__init__.py +0 -0
  40. {docling-2.37.0 → docling-2.38.1}/docling/backend/docx/latex/__init__.py +0 -0
  41. {docling-2.37.0 → docling-2.38.1}/docling/backend/docx/latex/latex_dict.py +0 -0
  42. {docling-2.37.0 → docling-2.38.1}/docling/backend/docx/latex/omml.py +0 -0
  43. {docling-2.37.0 → docling-2.38.1}/docling/backend/html_backend.py +0 -0
  44. {docling-2.37.0 → docling-2.38.1}/docling/backend/json/__init__.py +0 -0
  45. {docling-2.37.0 → docling-2.38.1}/docling/backend/json/docling_json_backend.py +0 -0
  46. {docling-2.37.0 → docling-2.38.1}/docling/backend/msexcel_backend.py +0 -0
  47. {docling-2.37.0 → docling-2.38.1}/docling/backend/mspowerpoint_backend.py +0 -0
  48. {docling-2.37.0 → docling-2.38.1}/docling/backend/pdf_backend.py +0 -0
  49. {docling-2.37.0 → docling-2.38.1}/docling/backend/pypdfium2_backend.py +0 -0
  50. {docling-2.37.0 → docling-2.38.1}/docling/backend/xml/__init__.py +0 -0
  51. {docling-2.37.0 → docling-2.38.1}/docling/backend/xml/jats_backend.py +0 -0
  52. {docling-2.37.0 → docling-2.38.1}/docling/backend/xml/uspto_backend.py +0 -0
  53. {docling-2.37.0 → docling-2.38.1}/docling/chunking/__init__.py +0 -0
  54. {docling-2.37.0 → docling-2.38.1}/docling/cli/__init__.py +0 -0
  55. {docling-2.37.0 → docling-2.38.1}/docling/cli/models.py +0 -0
  56. {docling-2.37.0 → docling-2.38.1}/docling/cli/tools.py +0 -0
  57. {docling-2.37.0 → docling-2.38.1}/docling/datamodel/__init__.py +0 -0
  58. {docling-2.37.0 → docling-2.38.1}/docling/datamodel/accelerator_options.py +0 -0
  59. {docling-2.37.0 → docling-2.38.1}/docling/datamodel/settings.py +0 -0
  60. {docling-2.37.0 → docling-2.38.1}/docling/datamodel/vlm_model_specs.py +0 -0
  61. {docling-2.37.0 → docling-2.38.1}/docling/exceptions.py +0 -0
  62. {docling-2.37.0 → docling-2.38.1}/docling/models/__init__.py +0 -0
  63. {docling-2.37.0 → docling-2.38.1}/docling/models/base_ocr_model.py +0 -0
  64. {docling-2.37.0 → docling-2.38.1}/docling/models/code_formula_model.py +0 -0
  65. {docling-2.37.0 → docling-2.38.1}/docling/models/document_picture_classifier.py +0 -0
  66. {docling-2.37.0 → docling-2.38.1}/docling/models/easyocr_model.py +0 -0
  67. {docling-2.37.0 → docling-2.38.1}/docling/models/factories/__init__.py +0 -0
  68. {docling-2.37.0 → docling-2.38.1}/docling/models/factories/base_factory.py +0 -0
  69. {docling-2.37.0 → docling-2.38.1}/docling/models/factories/ocr_factory.py +0 -0
  70. {docling-2.37.0 → docling-2.38.1}/docling/models/factories/picture_description_factory.py +0 -0
  71. {docling-2.37.0 → docling-2.38.1}/docling/models/layout_model.py +0 -0
  72. {docling-2.37.0 → docling-2.38.1}/docling/models/ocr_mac_model.py +0 -0
  73. {docling-2.37.0 → docling-2.38.1}/docling/models/page_assemble_model.py +0 -0
  74. {docling-2.37.0 → docling-2.38.1}/docling/models/page_preprocessing_model.py +0 -0
  75. {docling-2.37.0 → docling-2.38.1}/docling/models/picture_description_api_model.py +0 -0
  76. {docling-2.37.0 → docling-2.38.1}/docling/models/picture_description_base_model.py +0 -0
  77. {docling-2.37.0 → docling-2.38.1}/docling/models/picture_description_vlm_model.py +0 -0
  78. {docling-2.37.0 → docling-2.38.1}/docling/models/plugins/__init__.py +0 -0
  79. {docling-2.37.0 → docling-2.38.1}/docling/models/plugins/defaults.py +0 -0
  80. {docling-2.37.0 → docling-2.38.1}/docling/models/rapid_ocr_model.py +0 -0
  81. {docling-2.37.0 → docling-2.38.1}/docling/models/table_structure_model.py +0 -0
  82. {docling-2.37.0 → docling-2.38.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
  83. {docling-2.37.0 → docling-2.38.1}/docling/models/tesseract_ocr_model.py +0 -0
  84. {docling-2.37.0 → docling-2.38.1}/docling/models/utils/__init__.py +0 -0
  85. {docling-2.37.0 → docling-2.38.1}/docling/models/utils/hf_model_download.py +0 -0
  86. {docling-2.37.0 → docling-2.38.1}/docling/models/vlm_models_inline/__init__.py +0 -0
  87. {docling-2.37.0 → docling-2.38.1}/docling/pipeline/__init__.py +0 -0
  88. {docling-2.37.0 → docling-2.38.1}/docling/pipeline/simple_pipeline.py +0 -0
  89. {docling-2.37.0 → docling-2.38.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  90. {docling-2.37.0 → docling-2.38.1}/docling/pipeline/vlm_pipeline.py +0 -0
  91. {docling-2.37.0 → docling-2.38.1}/docling/py.typed +0 -0
  92. {docling-2.37.0 → docling-2.38.1}/docling/utils/__init__.py +0 -0
  93. {docling-2.37.0 → docling-2.38.1}/docling/utils/accelerator_utils.py +0 -0
  94. {docling-2.37.0 → docling-2.38.1}/docling/utils/api_image_request.py +0 -0
  95. {docling-2.37.0 → docling-2.38.1}/docling/utils/export.py +0 -0
  96. {docling-2.37.0 → docling-2.38.1}/docling/utils/glm_utils.py +0 -0
  97. {docling-2.37.0 → docling-2.38.1}/docling/utils/layout_postprocessor.py +0 -0
  98. {docling-2.37.0 → docling-2.38.1}/docling/utils/locks.py +0 -0
  99. {docling-2.37.0 → docling-2.38.1}/docling/utils/model_downloader.py +0 -0
  100. {docling-2.37.0 → docling-2.38.1}/docling/utils/ocr_utils.py +0 -0
  101. {docling-2.37.0 → docling-2.38.1}/docling/utils/orientation.py +0 -0
  102. {docling-2.37.0 → docling-2.38.1}/docling/utils/profiling.py +0 -0
  103. {docling-2.37.0 → docling-2.38.1}/docling/utils/utils.py +0 -0
  104. {docling-2.37.0 → docling-2.38.1}/docling/utils/visualization.py +0 -0
  105. {docling-2.37.0 → docling-2.38.1}/docling.egg-info/dependency_links.txt +0 -0
  106. {docling-2.37.0 → docling-2.38.1}/docling.egg-info/entry_points.txt +0 -0
  107. {docling-2.37.0 → docling-2.38.1}/docling.egg-info/top_level.txt +0 -0
  108. {docling-2.37.0 → docling-2.38.1}/setup.cfg +0 -0
  109. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_asciidoc.py +0 -0
  110. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_csv.py +0 -0
  111. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_docling_json.py +0 -0
  112. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_docling_parse.py +0 -0
  113. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_docling_parse_v2.py +0 -0
  114. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_docling_parse_v4.py +0 -0
  115. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_html.py +0 -0
  116. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_jats.py +0 -0
  117. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_msexcel.py +0 -0
  118. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_patent_uspto.py +0 -0
  119. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_pdfium.py +0 -0
  120. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_pptx.py +0 -0
  121. {docling-2.37.0 → docling-2.38.1}/tests/test_backend_webp.py +0 -0
  122. {docling-2.37.0 → docling-2.38.1}/tests/test_cli.py +0 -0
  123. {docling-2.37.0 → docling-2.38.1}/tests/test_data_gen_flag.py +0 -0
  124. {docling-2.37.0 → docling-2.38.1}/tests/test_document_picture_classifier.py +0 -0
  125. {docling-2.37.0 → docling-2.38.1}/tests/test_e2e_conversion.py +0 -0
  126. {docling-2.37.0 → docling-2.38.1}/tests/test_e2e_ocr_conversion.py +0 -0
  127. {docling-2.37.0 → docling-2.38.1}/tests/test_input_doc.py +0 -0
  128. {docling-2.37.0 → docling-2.38.1}/tests/test_interfaces.py +0 -0
  129. {docling-2.37.0 → docling-2.38.1}/tests/test_invalid_input.py +0 -0
  130. {docling-2.37.0 → docling-2.38.1}/tests/test_legacy_format_transform.py +0 -0
  131. {docling-2.37.0 → docling-2.38.1}/tests/test_options.py +0 -0
  132. {docling-2.37.0 → docling-2.38.1}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.37.0
3
+ Version: 2.38.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -61,6 +61,8 @@ Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "d
61
61
  Provides-Extra: rapidocr
62
62
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
63
63
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
64
+ Provides-Extra: asr
65
+ Requires-Dist: openai-whisper>=20240930; extra == "asr"
64
66
  Dynamic: license-file
65
67
 
66
68
  <p align="center">
@@ -93,14 +95,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
93
95
 
94
96
  ## Features
95
97
 
96
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
98
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
97
99
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
98
100
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
99
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
101
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
100
102
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
101
103
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
102
104
  * 🔍 Extensive OCR support for scanned PDFs and images
103
- * 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
105
+ * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
106
+ * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
104
107
  * 💻 Simple and convenient CLI
105
108
 
106
109
  ### Coming soon
@@ -28,14 +28,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
28
28
 
29
29
  ## Features
30
30
 
31
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
31
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
32
32
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
33
33
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
34
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
34
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
35
35
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
36
36
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
37
37
  * 🔍 Extensive OCR support for scanned PDFs and images
38
- * 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
38
+ * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
39
+ * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
39
40
  * 💻 Simple and convenient CLI
40
41
 
41
42
  ### Coming soon
@@ -1,17 +1,16 @@
1
1
  import logging
2
2
  import re
3
3
  import warnings
4
+ from copy import deepcopy
5
+ from enum import Enum
4
6
  from io import BytesIO
5
7
  from pathlib import Path
6
- from typing import List, Optional, Set, Union
8
+ from typing import List, Literal, Optional, Set, Union
7
9
 
8
10
  import marko
9
11
  import marko.element
10
- import marko.ext
11
- import marko.ext.gfm
12
12
  import marko.inline
13
13
  from docling_core.types.doc import (
14
- DocItem,
15
14
  DocItemLabel,
16
15
  DoclingDocument,
17
16
  DocumentOrigin,
@@ -21,7 +20,10 @@ from docling_core.types.doc import (
21
20
  TableData,
22
21
  TextItem,
23
22
  )
23
+ from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
24
24
  from marko import Markdown
25
+ from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
26
+ from typing_extensions import Annotated
25
27
 
26
28
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
29
  from docling.backend.html_backend import HTMLDocumentBackend
@@ -35,6 +37,31 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
35
37
  _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
36
38
 
37
39
 
40
+ class _PendingCreationType(str, Enum):
41
+ """CoordOrigin."""
42
+
43
+ HEADING = "heading"
44
+ LIST_ITEM = "list_item"
45
+
46
+
47
+ class _HeadingCreationPayload(BaseModel):
48
+ kind: Literal["heading"] = "heading"
49
+ level: int
50
+
51
+
52
+ class _ListItemCreationPayload(BaseModel):
53
+ kind: Literal["list_item"] = "list_item"
54
+
55
+
56
+ _CreationPayload = Annotated[
57
+ Union[
58
+ _HeadingCreationPayload,
59
+ _ListItemCreationPayload,
60
+ ],
61
+ Field(discriminator="kind"),
62
+ ]
63
+
64
+
38
65
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
39
66
  def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
40
67
  # This regex will match any sequence of underscores
@@ -71,7 +98,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
71
98
 
72
99
  self.in_table = False
73
100
  self.md_table_buffer: list[str] = []
74
- self.inline_texts: list[str] = []
75
101
  self._html_blocks: int = 0
76
102
 
77
103
  try:
@@ -156,25 +182,65 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
156
182
  doc.add_table(data=table_data)
157
183
  return
158
184
 
159
- def _process_inline_text(
160
- self, parent_item: Optional[NodeItem], doc: DoclingDocument
185
+ def _create_list_item(
186
+ self,
187
+ doc: DoclingDocument,
188
+ parent_item: Optional[NodeItem],
189
+ text: str,
190
+ formatting: Optional[Formatting] = None,
191
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
192
+ ):
193
+ if not isinstance(parent_item, (OrderedList, UnorderedList)):
194
+ _log.warning("ListItem would have not had a list parent, adding one.")
195
+ parent_item = doc.add_unordered_list(parent=parent_item)
196
+ item = doc.add_list_item(
197
+ text=text,
198
+ enumerated=(isinstance(parent_item, OrderedList)),
199
+ parent=parent_item,
200
+ formatting=formatting,
201
+ hyperlink=hyperlink,
202
+ )
203
+ return item
204
+
205
+ def _create_heading_item(
206
+ self,
207
+ doc: DoclingDocument,
208
+ parent_item: Optional[NodeItem],
209
+ text: str,
210
+ level: int,
211
+ formatting: Optional[Formatting] = None,
212
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
161
213
  ):
162
- txt = " ".join(self.inline_texts)
163
- if len(txt) > 0:
164
- doc.add_text(
165
- label=DocItemLabel.PARAGRAPH,
214
+ if level == 1:
215
+ item = doc.add_title(
216
+ text=text,
166
217
  parent=parent_item,
167
- text=txt,
218
+ formatting=formatting,
219
+ hyperlink=hyperlink,
168
220
  )
169
- self.inline_texts = []
221
+ else:
222
+ item = doc.add_heading(
223
+ text=text,
224
+ level=level - 1,
225
+ parent=parent_item,
226
+ formatting=formatting,
227
+ hyperlink=hyperlink,
228
+ )
229
+ return item
170
230
 
171
231
  def _iterate_elements( # noqa: C901
172
232
  self,
233
+ *,
173
234
  element: marko.element.Element,
174
235
  depth: int,
175
236
  doc: DoclingDocument,
176
237
  visited: Set[marko.element.Element],
238
+ creation_stack: list[
239
+ _CreationPayload
240
+ ], # stack for lazy item creation triggered deep in marko's AST (on RawText)
177
241
  parent_item: Optional[NodeItem] = None,
242
+ formatting: Optional[Formatting] = None,
243
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
178
244
  ):
179
245
  if element in visited:
180
246
  return
@@ -183,44 +249,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
183
249
  # Check for different element types and process relevant details
184
250
  if isinstance(element, marko.block.Heading) and len(element.children) > 0:
185
251
  self._close_table(doc)
186
- self._process_inline_text(parent_item, doc)
187
252
  _log.debug(
188
253
  f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
189
254
  )
190
- if element.level == 1:
191
- doc_label = DocItemLabel.TITLE
255
+
256
+ if len(element.children) > 1: # inline group will be created further down
257
+ parent_item = self._create_heading_item(
258
+ doc=doc,
259
+ parent_item=parent_item,
260
+ text="",
261
+ level=element.level,
262
+ formatting=formatting,
263
+ hyperlink=hyperlink,
264
+ )
192
265
  else:
193
- doc_label = DocItemLabel.SECTION_HEADER
194
-
195
- # Header could have arbitrary inclusion of bold, italic or emphasis,
196
- # hence we need to traverse the tree to get full text of a header
197
- strings: List[str] = []
198
-
199
- # Define a recursive function to traverse the tree
200
- def traverse(node: marko.block.BlockElement):
201
- # Check if the node has a "children" attribute
202
- if hasattr(node, "children"):
203
- # If "children" is a list, continue traversal
204
- if isinstance(node.children, list):
205
- for child in node.children:
206
- traverse(child)
207
- # If "children" is text, add it to header text
208
- elif isinstance(node.children, str):
209
- strings.append(node.children)
210
-
211
- traverse(element)
212
- snippet_text = "".join(strings)
213
- if len(snippet_text) > 0:
214
- if doc_label == DocItemLabel.SECTION_HEADER:
215
- parent_item = doc.add_heading(
216
- text=snippet_text,
217
- level=element.level - 1,
218
- parent=parent_item,
219
- )
220
- else:
221
- parent_item = doc.add_text(
222
- label=doc_label, parent=parent_item, text=snippet_text
223
- )
266
+ creation_stack.append(_HeadingCreationPayload(level=element.level))
224
267
 
225
268
  elif isinstance(element, marko.block.List):
226
269
  has_non_empty_list_items = False
@@ -230,7 +273,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
230
273
  break
231
274
 
232
275
  self._close_table(doc)
233
- self._process_inline_text(parent_item, doc)
234
276
  _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
235
277
  if has_non_empty_list_items:
236
278
  label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@@ -240,41 +282,54 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
240
282
 
241
283
  elif (
242
284
  isinstance(element, marko.block.ListItem)
243
- and len(element.children) > 0
244
- and isinstance((first_child := element.children[0]), marko.block.Paragraph)
285
+ and len(element.children) == 1
286
+ and isinstance((child := element.children[0]), marko.block.Paragraph)
287
+ and len(child.children) > 0
245
288
  ):
246
289
  self._close_table(doc)
247
- self._process_inline_text(parent_item, doc)
248
290
  _log.debug(" - List item")
249
291
 
250
- snippet_text = str(first_child.children[0].children) # type: ignore
251
- is_numbered = False
252
- if (
253
- parent_item is not None
254
- and isinstance(parent_item, DocItem)
255
- and parent_item.label == GroupLabel.ORDERED_LIST
256
- ):
257
- is_numbered = True
258
- doc.add_list_item(
259
- enumerated=is_numbered, parent=parent_item, text=snippet_text
260
- )
261
- visited.add(first_child)
292
+ if len(child.children) > 1: # inline group will be created further down
293
+ parent_item = self._create_list_item(
294
+ doc=doc,
295
+ parent_item=parent_item,
296
+ text="",
297
+ formatting=formatting,
298
+ hyperlink=hyperlink,
299
+ )
300
+ else:
301
+ creation_stack.append(_ListItemCreationPayload())
262
302
 
263
303
  elif isinstance(element, marko.inline.Image):
264
304
  self._close_table(doc)
265
- self._process_inline_text(parent_item, doc)
266
305
  _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
267
306
 
268
307
  fig_caption: Optional[TextItem] = None
269
308
  if element.title is not None and element.title != "":
270
309
  fig_caption = doc.add_text(
271
- label=DocItemLabel.CAPTION, text=element.title
310
+ label=DocItemLabel.CAPTION,
311
+ text=element.title,
312
+ formatting=formatting,
313
+ hyperlink=hyperlink,
272
314
  )
273
315
 
274
316
  doc.add_picture(parent=parent_item, caption=fig_caption)
275
317
 
276
- elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
277
- self._process_inline_text(parent_item, doc)
318
+ elif isinstance(element, marko.inline.Emphasis):
319
+ _log.debug(f" - Emphasis: {element.children}")
320
+ formatting = deepcopy(formatting) if formatting else Formatting()
321
+ formatting.italic = True
322
+
323
+ elif isinstance(element, marko.inline.StrongEmphasis):
324
+ _log.debug(f" - StrongEmphasis: {element.children}")
325
+ formatting = deepcopy(formatting) if formatting else Formatting()
326
+ formatting.bold = True
327
+
328
+ elif isinstance(element, marko.inline.Link):
329
+ _log.debug(f" - Link: {element.children}")
330
+ hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
331
+ element.dest
332
+ )
278
333
 
279
334
  elif isinstance(element, marko.inline.RawText):
280
335
  _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -287,28 +342,66 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
287
342
  self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
288
343
  else:
289
344
  self.md_table_buffer.append(snippet_text)
290
- else:
345
+ elif snippet_text:
291
346
  self._close_table(doc)
292
- # most likely just inline text
293
- self.inline_texts.append(str(element.children))
347
+
348
+ if creation_stack:
349
+ while len(creation_stack) > 0:
350
+ to_create = creation_stack.pop()
351
+ if isinstance(to_create, _ListItemCreationPayload):
352
+ parent_item = self._create_list_item(
353
+ doc=doc,
354
+ parent_item=parent_item,
355
+ text=snippet_text,
356
+ formatting=formatting,
357
+ hyperlink=hyperlink,
358
+ )
359
+ elif isinstance(to_create, _HeadingCreationPayload):
360
+ # not keeping as parent_item as logic for correctly tracking
361
+ # that not implemented yet (section components not captured
362
+ # as heading children in marko)
363
+ self._create_heading_item(
364
+ doc=doc,
365
+ parent_item=parent_item,
366
+ text=snippet_text,
367
+ level=to_create.level,
368
+ formatting=formatting,
369
+ hyperlink=hyperlink,
370
+ )
371
+ else:
372
+ doc.add_text(
373
+ label=DocItemLabel.TEXT,
374
+ parent=parent_item,
375
+ text=snippet_text,
376
+ formatting=formatting,
377
+ hyperlink=hyperlink,
378
+ )
294
379
 
295
380
  elif isinstance(element, marko.inline.CodeSpan):
296
381
  self._close_table(doc)
297
- self._process_inline_text(parent_item, doc)
298
382
  _log.debug(f" - Code Span: {element.children}")
299
383
  snippet_text = str(element.children).strip()
300
- doc.add_code(parent=parent_item, text=snippet_text)
384
+ doc.add_code(
385
+ parent=parent_item,
386
+ text=snippet_text,
387
+ formatting=formatting,
388
+ hyperlink=hyperlink,
389
+ )
301
390
 
302
391
  elif (
303
392
  isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
304
393
  and len(element.children) > 0
305
- and isinstance((first_child := element.children[0]), marko.inline.RawText)
306
- and len(snippet_text := (first_child.children.strip())) > 0
394
+ and isinstance((child := element.children[0]), marko.inline.RawText)
395
+ and len(snippet_text := (child.children.strip())) > 0
307
396
  ):
308
397
  self._close_table(doc)
309
- self._process_inline_text(parent_item, doc)
310
398
  _log.debug(f" - Code Block: {element.children}")
311
- doc.add_code(parent=parent_item, text=snippet_text)
399
+ doc.add_code(
400
+ parent=parent_item,
401
+ text=snippet_text,
402
+ formatting=formatting,
403
+ hyperlink=hyperlink,
404
+ )
312
405
 
313
406
  elif isinstance(element, marko.inline.LineBreak):
314
407
  if self.in_table:
@@ -317,7 +410,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
317
410
 
318
411
  elif isinstance(element, marko.block.HTMLBlock):
319
412
  self._html_blocks += 1
320
- self._process_inline_text(parent_item, doc)
321
413
  self._close_table(doc)
322
414
  _log.debug(f"HTML Block: {element}")
323
415
  if (
@@ -327,14 +419,24 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
327
419
 
328
420
  # wrap in markers to enable post-processing in convert()
329
421
  text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
330
- doc.add_code(parent=parent_item, text=text_to_add)
422
+ doc.add_code(
423
+ parent=parent_item,
424
+ text=text_to_add,
425
+ formatting=formatting,
426
+ hyperlink=hyperlink,
427
+ )
331
428
  else:
332
429
  if not isinstance(element, str):
333
430
  self._close_table(doc)
334
431
  _log.debug(f"Some other element: {element}")
335
432
 
433
+ if (
434
+ isinstance(element, (marko.block.Paragraph, marko.block.Heading))
435
+ and len(element.children) > 1
436
+ ):
437
+ parent_item = doc.add_inline_group(parent=parent_item)
438
+
336
439
  processed_block_types = (
337
- marko.block.Heading,
338
440
  marko.block.CodeBlock,
339
441
  marko.block.FencedCode,
340
442
  marko.inline.RawText,
@@ -350,7 +452,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
350
452
  depth=depth + 1,
351
453
  doc=doc,
352
454
  visited=visited,
455
+ creation_stack=creation_stack,
353
456
  parent_item=parent_item,
457
+ formatting=formatting,
458
+ hyperlink=hyperlink,
354
459
  )
355
460
 
356
461
  def is_valid(self) -> bool:
@@ -391,8 +496,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
391
496
  doc=doc,
392
497
  parent_item=None,
393
498
  visited=set(),
499
+ creation_stack=[],
394
500
  )
395
- self._process_inline_text(None, doc) # handle last hanging inline text
396
501
  self._close_table(doc=doc) # handle any last hanging table
397
502
 
398
503
  # if HTML blocks were detected, export to HTML and delegate to HTML backend