docling 2.17.0__tar.gz → 2.19.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {docling-2.17.0 → docling-2.19.0}/PKG-INFO +11 -5
  2. {docling-2.17.0 → docling-2.19.0}/README.md +3 -0
  3. {docling-2.17.0 → docling-2.19.0}/docling/backend/html_backend.py +18 -18
  4. {docling-2.17.0 → docling-2.19.0}/docling/backend/md_backend.py +144 -75
  5. {docling-2.17.0 → docling-2.19.0}/docling/backend/mspowerpoint_backend.py +39 -27
  6. {docling-2.17.0 → docling-2.19.0}/docling/backend/msword_backend.py +173 -131
  7. {docling-2.17.0 → docling-2.19.0}/docling/cli/main.py +8 -0
  8. docling-2.19.0/docling/cli/models.py +105 -0
  9. docling-2.19.0/docling/cli/tools.py +17 -0
  10. {docling-2.17.0 → docling-2.19.0}/docling/datamodel/document.py +2 -0
  11. {docling-2.17.0 → docling-2.19.0}/docling/datamodel/settings.py +18 -1
  12. {docling-2.17.0 → docling-2.19.0}/docling/document_converter.py +12 -2
  13. {docling-2.17.0 → docling-2.19.0}/docling/models/base_model.py +3 -0
  14. {docling-2.17.0 → docling-2.19.0}/docling/models/code_formula_model.py +15 -9
  15. {docling-2.17.0 → docling-2.19.0}/docling/models/document_picture_classifier.py +11 -8
  16. {docling-2.17.0 → docling-2.19.0}/docling/models/easyocr_model.py +50 -3
  17. {docling-2.17.0 → docling-2.19.0}/docling/models/layout_model.py +49 -3
  18. {docling-2.17.0 → docling-2.19.0}/docling/models/table_structure_model.py +53 -7
  19. {docling-2.17.0 → docling-2.19.0}/docling/pipeline/base_pipeline.py +4 -2
  20. {docling-2.17.0 → docling-2.19.0}/docling/pipeline/standard_pdf_pipeline.py +25 -24
  21. {docling-2.17.0 → docling-2.19.0}/docling/utils/glm_utils.py +4 -0
  22. docling-2.19.0/docling/utils/model_downloader.py +72 -0
  23. {docling-2.17.0 → docling-2.19.0}/docling/utils/utils.py +24 -0
  24. {docling-2.17.0 → docling-2.19.0}/pyproject.toml +11 -5
  25. {docling-2.17.0 → docling-2.19.0}/LICENSE +0 -0
  26. {docling-2.17.0 → docling-2.19.0}/docling/__init__.py +0 -0
  27. {docling-2.17.0 → docling-2.19.0}/docling/backend/__init__.py +0 -0
  28. {docling-2.17.0 → docling-2.19.0}/docling/backend/abstract_backend.py +0 -0
  29. {docling-2.17.0 → docling-2.19.0}/docling/backend/asciidoc_backend.py +0 -0
  30. {docling-2.17.0 → docling-2.19.0}/docling/backend/docling_parse_backend.py +0 -0
  31. {docling-2.17.0 → docling-2.19.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  32. {docling-2.17.0 → docling-2.19.0}/docling/backend/json/__init__.py +0 -0
  33. {docling-2.17.0 → docling-2.19.0}/docling/backend/json/docling_json_backend.py +0 -0
  34. {docling-2.17.0 → docling-2.19.0}/docling/backend/msexcel_backend.py +0 -0
  35. {docling-2.17.0 → docling-2.19.0}/docling/backend/pdf_backend.py +0 -0
  36. {docling-2.17.0 → docling-2.19.0}/docling/backend/pypdfium2_backend.py +0 -0
  37. {docling-2.17.0 → docling-2.19.0}/docling/backend/xml/__init__.py +0 -0
  38. {docling-2.17.0 → docling-2.19.0}/docling/backend/xml/pubmed_backend.py +0 -0
  39. {docling-2.17.0 → docling-2.19.0}/docling/backend/xml/uspto_backend.py +0 -0
  40. {docling-2.17.0 → docling-2.19.0}/docling/chunking/__init__.py +0 -0
  41. {docling-2.17.0 → docling-2.19.0}/docling/cli/__init__.py +0 -0
  42. {docling-2.17.0 → docling-2.19.0}/docling/datamodel/__init__.py +0 -0
  43. {docling-2.17.0 → docling-2.19.0}/docling/datamodel/base_models.py +0 -0
  44. {docling-2.17.0 → docling-2.19.0}/docling/datamodel/pipeline_options.py +0 -0
  45. {docling-2.17.0 → docling-2.19.0}/docling/exceptions.py +0 -0
  46. {docling-2.17.0 → docling-2.19.0}/docling/models/__init__.py +0 -0
  47. {docling-2.17.0 → docling-2.19.0}/docling/models/base_ocr_model.py +0 -0
  48. {docling-2.17.0 → docling-2.19.0}/docling/models/ds_glm_model.py +0 -0
  49. {docling-2.17.0 → docling-2.19.0}/docling/models/ocr_mac_model.py +0 -0
  50. {docling-2.17.0 → docling-2.19.0}/docling/models/page_assemble_model.py +0 -0
  51. {docling-2.17.0 → docling-2.19.0}/docling/models/page_preprocessing_model.py +0 -0
  52. {docling-2.17.0 → docling-2.19.0}/docling/models/rapid_ocr_model.py +0 -0
  53. {docling-2.17.0 → docling-2.19.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  54. {docling-2.17.0 → docling-2.19.0}/docling/models/tesseract_ocr_model.py +0 -0
  55. {docling-2.17.0 → docling-2.19.0}/docling/pipeline/__init__.py +0 -0
  56. {docling-2.17.0 → docling-2.19.0}/docling/pipeline/simple_pipeline.py +0 -0
  57. {docling-2.17.0 → docling-2.19.0}/docling/py.typed +0 -0
  58. {docling-2.17.0 → docling-2.19.0}/docling/utils/__init__.py +0 -0
  59. {docling-2.17.0 → docling-2.19.0}/docling/utils/accelerator_utils.py +0 -0
  60. {docling-2.17.0 → docling-2.19.0}/docling/utils/export.py +0 -0
  61. {docling-2.17.0 → docling-2.19.0}/docling/utils/layout_postprocessor.py +0 -0
  62. {docling-2.17.0 → docling-2.19.0}/docling/utils/ocr_utils.py +0 -0
  63. {docling-2.17.0 → docling-2.19.0}/docling/utils/profiling.py +0 -0
  64. {docling-2.17.0 → docling-2.19.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.17.0
3
+ Version: 2.19.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -19,16 +19,17 @@ Classifier: Programming Language :: Python :: 3.9
19
19
  Classifier: Programming Language :: Python :: 3.10
20
20
  Classifier: Programming Language :: Python :: 3.11
21
21
  Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
22
23
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
24
  Provides-Extra: ocrmac
24
25
  Provides-Extra: rapidocr
25
26
  Provides-Extra: tesserocr
26
- Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
27
28
  Requires-Dist: certifi (>=2024.7.4)
28
29
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
- Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
30
+ Requires-Dist: docling-core[chunking] (>=2.17.2,<3.0.0)
30
31
  Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
31
- Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
32
+ Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
32
33
  Requires-Dist: easyocr (>=1.7,<2.0)
33
34
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
34
35
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -48,8 +49,10 @@ Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
48
49
  Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
49
50
  Requires-Dist: requests (>=2.32.2,<3.0.0)
50
51
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
51
- Requires-Dist: scipy (>=1.6.0,<2.0.0)
52
+ Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
53
+ Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
52
54
  Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
55
+ Requires-Dist: tqdm (>=4.65.0,<5.0.0)
53
56
  Requires-Dist: typer (>=0.12.5,<0.13.0)
54
57
  Project-URL: Repository, https://github.com/DS4SD/docling
55
58
  Description-Content-Type: text/markdown
@@ -94,6 +97,9 @@ Docling simplifies document processing, parsing diverse formats — including ad
94
97
  ### Coming soon
95
98
 
96
99
  * 📝 Metadata extraction, including title, authors, references & language
100
+ * 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
101
+ * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
102
+ * 📝 Complex chemistry understanding (Molecular structures)
97
103
 
98
104
  ## Installation
99
105
 
@@ -38,6 +38,9 @@ Docling simplifies document processing, parsing diverse formats — including ad
38
38
  ### Coming soon
39
39
 
40
40
  * 📝 Metadata extraction, including title, authors, references & language
41
+ * 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
42
+ * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
43
+ * 📝 Complex chemistry understanding (Molecular structures)
41
44
 
42
45
  ## Installation
43
46
 
@@ -1,9 +1,9 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Set, Union
4
+ from typing import Optional, Set, Union
5
5
 
6
- from bs4 import BeautifulSoup
6
+ from bs4 import BeautifulSoup, Tag
7
7
  from docling_core.types.doc import (
8
8
  DocItemLabel,
9
9
  DoclingDocument,
@@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
24
24
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
25
25
  super().__init__(in_doc, path_or_stream)
26
26
  _log.debug("About to init HTML backend...")
27
- self.soup = None
27
+ self.soup: Optional[Tag] = None
28
28
  # HTML file:
29
29
  self.path_or_stream = path_or_stream
30
30
  # Initialise the parents for the hierarchy
@@ -89,7 +89,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
89
89
  )
90
90
  return doc
91
91
 
92
- def walk(self, element, doc):
92
+ def walk(self, element: Tag, doc: DoclingDocument):
93
93
  try:
94
94
  # Iterate over elements in the body of the document
95
95
  for idx, element in enumerate(element.children):
@@ -106,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
106
106
 
107
107
  return doc
108
108
 
109
- def analyse_element(self, element, idx, doc):
109
+ def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
110
110
  """
111
111
  if element.name!=None:
112
112
  _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
@@ -136,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
136
136
  else:
137
137
  self.walk(element, doc)
138
138
 
139
- def get_direct_text(self, item):
139
+ def get_direct_text(self, item: Tag):
140
140
  """Get the direct text of the <li> element (ignoring nested lists)."""
141
141
  text = item.find(string=True, recursive=False)
142
142
  if isinstance(text, str):
@@ -145,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
145
145
  return ""
146
146
 
147
147
  # Function to recursively extract text from all child nodes
148
- def extract_text_recursively(self, item):
148
+ def extract_text_recursively(self, item: Tag):
149
149
  result = []
150
150
 
151
151
  if isinstance(item, str):
@@ -166,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
166
166
 
167
167
  return "".join(result) + " "
168
168
 
169
- def handle_header(self, element, idx, doc):
169
+ def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
170
170
  """Handles header tags (h1, h2, etc.)."""
171
171
  hlevel = int(element.name.replace("h", ""))
172
172
  slevel = hlevel - 1
@@ -208,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
208
208
  level=hlevel,
209
209
  )
210
210
 
211
- def handle_code(self, element, idx, doc):
211
+ def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
212
212
  """Handles monospace code snippets (pre)."""
213
213
  if element.text is None:
214
214
  return
@@ -216,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
216
216
  label = DocItemLabel.CODE
217
217
  if len(text) == 0:
218
218
  return
219
- doc.add_code(parent=self.parents[self.level], label=label, text=text)
219
+ doc.add_code(parent=self.parents[self.level], text=text)
220
220
 
221
- def handle_paragraph(self, element, idx, doc):
221
+ def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
222
222
  """Handles paragraph tags (p)."""
223
223
  if element.text is None:
224
224
  return
@@ -228,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
228
228
  return
229
229
  doc.add_text(parent=self.parents[self.level], label=label, text=text)
230
230
 
231
- def handle_list(self, element, idx, doc):
231
+ def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
232
232
  """Handles list tags (ul, ol) and their list items."""
233
233
 
234
234
  if element.name == "ul":
@@ -250,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
250
250
  self.parents[self.level + 1] = None
251
251
  self.level -= 1
252
252
 
253
- def handle_listitem(self, element, idx, doc):
253
+ def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
254
254
  """Handles listitem tags (li)."""
255
255
  nested_lists = element.find(["ul", "ol"])
256
256
 
@@ -304,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
304
304
  else:
305
305
  _log.warn("list-item has no text: ", element)
306
306
 
307
- def handle_table(self, element, idx, doc):
307
+ def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
308
308
  """Handles table tags."""
309
309
 
310
310
  nested_tables = element.find("table")
@@ -377,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
377
377
 
378
378
  doc.add_table(data=data, parent=self.parents[self.level])
379
379
 
380
- def get_list_text(self, list_element, level=0):
380
+ def get_list_text(self, list_element: Tag, level=0):
381
381
  """Recursively extract text from <ul> or <ol> with proper indentation."""
382
382
  result = []
383
383
  bullet_char = "*" # Default bullet character for unordered lists
@@ -403,7 +403,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
403
403
 
404
404
  return result
405
405
 
406
- def extract_table_cell_text(self, cell):
406
+ def extract_table_cell_text(self, cell: Tag):
407
407
  """Extract text from a table cell, including lists with indents."""
408
408
  contains_lists = cell.find(["ul", "ol"])
409
409
  if contains_lists is None:
@@ -414,7 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
414
414
  )
415
415
  return cell.text
416
416
 
417
- def handle_figure(self, element, idx, doc):
417
+ def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
418
418
  """Handles image tags (img)."""
419
419
 
420
420
  # Extract the image URI from the <img> tag
@@ -437,6 +437,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
437
437
  caption=fig_caption,
438
438
  )
439
439
 
440
- def handle_image(self, element, idx, doc):
440
+ def handle_image(self, element: Tag, idx, doc: DoclingDocument):
441
441
  """Handles image tags (img)."""
442
442
  doc.add_picture(parent=self.parents[self.level], caption=None)
@@ -6,6 +6,7 @@ from pathlib import Path
6
6
  from typing import List, Optional, Set, Union
7
7
 
8
8
  import marko
9
+ import marko.element
9
10
  import marko.ext
10
11
  import marko.ext.gfm
11
12
  import marko.inline
@@ -23,14 +24,19 @@ from docling_core.types.doc import (
23
24
  from marko import Markdown
24
25
 
25
26
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
+ from docling.backend.html_backend import HTMLDocumentBackend
26
28
  from docling.datamodel.base_models import InputFormat
27
29
  from docling.datamodel.document import InputDocument
28
30
 
29
31
  _log = logging.getLogger(__name__)
30
32
 
33
+ _MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
34
+ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
35
+ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
36
+
31
37
 
32
38
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
33
- def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
39
+ def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
34
40
  # This regex will match any sequence of underscores
35
41
  pattern = r"_+"
36
42
 
@@ -66,6 +72,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
66
72
  self.in_table = False
67
73
  self.md_table_buffer: list[str] = []
68
74
  self.inline_texts: list[str] = []
75
+ self._html_blocks: int = 0
69
76
 
70
77
  try:
71
78
  if isinstance(self.path_or_stream, BytesIO):
@@ -74,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
74
81
  # very long sequences of underscores will lead to unnecessary long processing times.
75
82
  # In any proper Markdown files, underscores have to be escaped,
76
83
  # otherwise they represent emphasis (bold or italic)
77
- self.markdown = self.shorten_underscore_sequences(text_stream)
84
+ self.markdown = self._shorten_underscore_sequences(text_stream)
78
85
  if isinstance(self.path_or_stream, Path):
79
86
  with open(self.path_or_stream, "r", encoding="utf-8") as f:
80
87
  md_content = f.read()
@@ -82,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
82
89
  # very long sequences of underscores will lead to unnecessary long processing times.
83
90
  # In any proper Markdown files, underscores have to be escaped,
84
91
  # otherwise they represent emphasis (bold or italic)
85
- self.markdown = self.shorten_underscore_sequences(md_content)
92
+ self.markdown = self._shorten_underscore_sequences(md_content)
86
93
  self.valid = True
87
94
 
88
95
  _log.debug(self.markdown)
@@ -92,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
92
99
  ) from e
93
100
  return
94
101
 
95
- def close_table(self, doc: DoclingDocument):
102
+ def _close_table(self, doc: DoclingDocument):
96
103
  if self.in_table:
97
104
  _log.debug("=== TABLE START ===")
98
105
  for md_table_row in self.md_table_buffer:
@@ -149,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
149
156
  doc.add_table(data=table_data)
150
157
  return
151
158
 
152
- def process_inline_text(
153
- self, parent_element: Optional[NodeItem], doc: DoclingDocument
159
+ def _process_inline_text(
160
+ self, parent_item: Optional[NodeItem], doc: DoclingDocument
154
161
  ):
155
162
  txt = " ".join(self.inline_texts)
156
163
  if len(txt) > 0:
157
164
  doc.add_text(
158
165
  label=DocItemLabel.PARAGRAPH,
159
- parent=parent_element,
166
+ parent=parent_item,
160
167
  text=txt,
161
168
  )
162
169
  self.inline_texts = []
163
170
 
164
- def iterate_elements(
171
+ def _iterate_elements(
165
172
  self,
166
- element: marko.block.Element,
173
+ element: marko.element.Element,
167
174
  depth: int,
168
175
  doc: DoclingDocument,
169
- parent_element: Optional[NodeItem] = None,
176
+ visited: Set[marko.element.Element],
177
+ parent_item: Optional[NodeItem] = None,
170
178
  ):
179
+
180
+ if element in visited:
181
+ return
182
+
171
183
  # Iterates over all elements in the AST
172
184
  # Check for different element types and process relevant details
173
- if isinstance(element, marko.block.Heading):
174
- self.close_table(doc)
175
- self.process_inline_text(parent_element, doc)
185
+ if isinstance(element, marko.block.Heading) and len(element.children) > 0:
186
+ self._close_table(doc)
187
+ self._process_inline_text(parent_item, doc)
176
188
  _log.debug(
177
189
  f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
178
190
  )
@@ -200,41 +212,48 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
200
212
  traverse(element)
201
213
  snippet_text = "".join(strings)
202
214
  if len(snippet_text) > 0:
203
- parent_element = doc.add_text(
204
- label=doc_label, parent=parent_element, text=snippet_text
215
+ parent_item = doc.add_text(
216
+ label=doc_label, parent=parent_item, text=snippet_text
205
217
  )
206
218
 
207
219
  elif isinstance(element, marko.block.List):
208
- self.close_table(doc)
209
- self.process_inline_text(parent_element, doc)
220
+ has_non_empty_list_items = False
221
+ for child in element.children:
222
+ if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
223
+ has_non_empty_list_items = True
224
+ break
225
+
226
+ self._close_table(doc)
227
+ self._process_inline_text(parent_item, doc)
210
228
  _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
211
- list_label = GroupLabel.LIST
212
- if element.ordered:
213
- list_label = GroupLabel.ORDERED_LIST
214
- parent_element = doc.add_group(
215
- label=list_label, name=f"list", parent=parent_element
216
- )
229
+ if has_non_empty_list_items:
230
+ label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
231
+ parent_item = doc.add_group(
232
+ label=label, name=f"list", parent=parent_item
233
+ )
217
234
 
218
- elif isinstance(element, marko.block.ListItem):
219
- self.close_table(doc)
220
- self.process_inline_text(parent_element, doc)
235
+ elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
236
+ self._close_table(doc)
237
+ self._process_inline_text(parent_item, doc)
221
238
  _log.debug(" - List item")
222
239
 
223
- snippet_text = str(element.children[0].children[0].children) # type: ignore
240
+ first_child = element.children[0]
241
+ snippet_text = str(first_child.children[0].children) # type: ignore
224
242
  is_numbered = False
225
243
  if (
226
- parent_element is not None
227
- and isinstance(parent_element, DocItem)
228
- and parent_element.label == GroupLabel.ORDERED_LIST
244
+ parent_item is not None
245
+ and isinstance(parent_item, DocItem)
246
+ and parent_item.label == GroupLabel.ORDERED_LIST
229
247
  ):
230
248
  is_numbered = True
231
249
  doc.add_list_item(
232
- enumerated=is_numbered, parent=parent_element, text=snippet_text
250
+ enumerated=is_numbered, parent=parent_item, text=snippet_text
233
251
  )
252
+ visited.add(first_child)
234
253
 
235
254
  elif isinstance(element, marko.inline.Image):
236
- self.close_table(doc)
237
- self.process_inline_text(parent_element, doc)
255
+ self._close_table(doc)
256
+ self._process_inline_text(parent_item, doc)
238
257
  _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
239
258
 
240
259
  fig_caption: Optional[TextItem] = None
@@ -243,50 +262,44 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
243
262
  label=DocItemLabel.CAPTION, text=element.title
244
263
  )
245
264
 
246
- doc.add_picture(parent=parent_element, caption=fig_caption)
265
+ doc.add_picture(parent=parent_item, caption=fig_caption)
247
266
 
248
- elif isinstance(element, marko.block.Paragraph):
249
- self.process_inline_text(parent_element, doc)
267
+ elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
268
+ self._process_inline_text(parent_item, doc)
250
269
 
251
270
  elif isinstance(element, marko.inline.RawText):
252
271
  _log.debug(f" - Paragraph (raw text): {element.children}")
253
- snippet_text = str(element.children).strip()
272
+ snippet_text = element.children.strip()
254
273
  # Detect start of the table:
255
274
  if "|" in snippet_text:
256
275
  # most likely part of the markdown table
257
276
  self.in_table = True
258
277
  if len(self.md_table_buffer) > 0:
259
- self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
260
- snippet_text
261
- )
278
+ self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
262
279
  else:
263
280
  self.md_table_buffer.append(snippet_text)
264
281
  else:
265
- self.close_table(doc)
266
- self.in_table = False
282
+ self._close_table(doc)
267
283
  # most likely just inline text
268
284
  self.inline_texts.append(str(element.children))
269
285
 
270
286
  elif isinstance(element, marko.inline.CodeSpan):
271
- self.close_table(doc)
272
- self.process_inline_text(parent_element, doc)
287
+ self._close_table(doc)
288
+ self._process_inline_text(parent_item, doc)
273
289
  _log.debug(f" - Code Span: {element.children}")
274
290
  snippet_text = str(element.children).strip()
275
- doc.add_code(parent=parent_element, text=snippet_text)
276
-
277
- elif isinstance(element, marko.block.CodeBlock):
278
- self.close_table(doc)
279
- self.process_inline_text(parent_element, doc)
280
- _log.debug(f" - Code Block: {element.children}")
281
- snippet_text = str(element.children[0].children).strip() # type: ignore
282
- doc.add_code(parent=parent_element, text=snippet_text)
283
-
284
- elif isinstance(element, marko.block.FencedCode):
285
- self.close_table(doc)
286
- self.process_inline_text(parent_element, doc)
291
+ doc.add_code(parent=parent_item, text=snippet_text)
292
+
293
+ elif (
294
+ isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
295
+ and len(element.children) > 0
296
+ and isinstance((first_child := element.children[0]), marko.inline.RawText)
297
+ and len(snippet_text := (first_child.children.strip())) > 0
298
+ ):
299
+ self._close_table(doc)
300
+ self._process_inline_text(parent_item, doc)
287
301
  _log.debug(f" - Code Block: {element.children}")
288
- snippet_text = str(element.children[0].children).strip() # type: ignore
289
- doc.add_code(parent=parent_element, text=snippet_text)
302
+ doc.add_code(parent=parent_item, text=snippet_text)
290
303
 
291
304
  elif isinstance(element, marko.inline.LineBreak):
292
305
  if self.in_table:
@@ -294,29 +307,42 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
294
307
  self.md_table_buffer.append("")
295
308
 
296
309
  elif isinstance(element, marko.block.HTMLBlock):
297
- self.process_inline_text(parent_element, doc)
298
- self.close_table(doc)
310
+ self._html_blocks += 1
311
+ self._process_inline_text(parent_item, doc)
312
+ self._close_table(doc)
299
313
  _log.debug("HTML Block: {}".format(element))
300
314
  if (
301
- len(element.children) > 0
315
+ len(element.body) > 0
302
316
  ): # If Marko doesn't return any content for HTML block, skip it
303
- snippet_text = str(element.children).strip()
304
- doc.add_text(
305
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
306
- )
317
+ html_block = element.body.strip()
318
+
319
+ # wrap in markers to enable post-processing in convert()
320
+ text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
321
+ doc.add_code(parent=parent_item, text=text_to_add)
307
322
  else:
308
323
  if not isinstance(element, str):
309
- self.close_table(doc)
324
+ self._close_table(doc)
310
325
  _log.debug("Some other element: {}".format(element))
311
326
 
327
+ processed_block_types = (
328
+ marko.block.Heading,
329
+ marko.block.CodeBlock,
330
+ marko.block.FencedCode,
331
+ marko.inline.RawText,
332
+ )
333
+
312
334
  # Iterate through the element's children (if any)
313
- if not isinstance(element, marko.block.ListItem):
314
- if not isinstance(element, marko.block.Heading):
315
- if not isinstance(element, marko.block.FencedCode):
316
- # if not isinstance(element, marko.block.Paragraph):
317
- if hasattr(element, "children"):
318
- for child in element.children:
319
- self.iterate_elements(child, depth + 1, doc, parent_element)
335
+ if hasattr(element, "children") and not isinstance(
336
+ element, processed_block_types
337
+ ):
338
+ for child in element.children:
339
+ self._iterate_elements(
340
+ element=child,
341
+ depth=depth + 1,
342
+ doc=doc,
343
+ visited=visited,
344
+ parent_item=parent_item,
345
+ )
320
346
 
321
347
  def is_valid(self) -> bool:
322
348
  return self.valid
@@ -350,8 +376,51 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
350
376
  marko_parser = Markdown()
351
377
  parsed_ast = marko_parser.parse(self.markdown)
352
378
  # Start iterating from the root of the AST
353
- self.iterate_elements(parsed_ast, 0, doc, None)
354
- self.process_inline_text(None, doc) # handle last hanging inline text
379
+ self._iterate_elements(
380
+ element=parsed_ast,
381
+ depth=0,
382
+ doc=doc,
383
+ parent_item=None,
384
+ visited=set(),
385
+ )
386
+ self._process_inline_text(None, doc) # handle last hanging inline text
387
+ self._close_table(doc=doc) # handle any last hanging table
388
+
389
+ # if HTML blocks were detected, export to HTML and delegate to HTML backend
390
+ if self._html_blocks > 0:
391
+
392
+ # export to HTML
393
+ html_backend_cls = HTMLDocumentBackend
394
+ html_str = doc.export_to_html()
395
+
396
+ def _restore_original_html(txt, regex):
397
+ _txt, count = re.subn(regex, "", txt)
398
+ if count != self._html_blocks:
399
+ raise RuntimeError(
400
+ "An internal error has occurred during Markdown conversion."
401
+ )
402
+ return _txt
403
+
404
+ # restore original HTML by removing previouly added markers
405
+ for regex in [
406
+ rf"<pre>\s*<code>\s*{_START_MARKER}",
407
+ rf"{_STOP_MARKER}\s*</code>\s*</pre>",
408
+ ]:
409
+ html_str = _restore_original_html(txt=html_str, regex=regex)
410
+ self._html_blocks = 0
411
+
412
+ # delegate to HTML backend
413
+ stream = BytesIO(bytes(html_str, encoding="utf-8"))
414
+ in_doc = InputDocument(
415
+ path_or_stream=stream,
416
+ format=InputFormat.HTML,
417
+ backend=html_backend_cls,
418
+ filename=self.file.name,
419
+ )
420
+ html_backend_obj = html_backend_cls(
421
+ in_doc=in_doc, path_or_stream=stream
422
+ )
423
+ doc = html_backend_obj.convert()
355
424
  else:
356
425
  raise RuntimeError(
357
426
  f"Cannot convert md with {self.document_hash} because the backend failed to init."