docling 2.17.0__tar.gz → 2.18.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {docling-2.17.0 → docling-2.18.0}/PKG-INFO +8 -3
  2. {docling-2.17.0 → docling-2.18.0}/README.md +3 -0
  3. {docling-2.17.0 → docling-2.18.0}/docling/backend/html_backend.py +18 -18
  4. {docling-2.17.0 → docling-2.18.0}/docling/backend/md_backend.py +88 -35
  5. {docling-2.17.0 → docling-2.18.0}/docling/backend/mspowerpoint_backend.py +39 -27
  6. {docling-2.17.0 → docling-2.18.0}/docling/backend/msword_backend.py +172 -130
  7. {docling-2.17.0 → docling-2.18.0}/docling/datamodel/document.py +2 -0
  8. {docling-2.17.0 → docling-2.18.0}/docling/datamodel/settings.py +16 -1
  9. {docling-2.17.0 → docling-2.18.0}/docling/document_converter.py +12 -2
  10. {docling-2.17.0 → docling-2.18.0}/docling/models/table_structure_model.py +9 -5
  11. {docling-2.17.0 → docling-2.18.0}/docling/pipeline/base_pipeline.py +3 -1
  12. {docling-2.17.0 → docling-2.18.0}/docling/utils/glm_utils.py +4 -0
  13. {docling-2.17.0 → docling-2.18.0}/pyproject.toml +6 -3
  14. {docling-2.17.0 → docling-2.18.0}/LICENSE +0 -0
  15. {docling-2.17.0 → docling-2.18.0}/docling/__init__.py +0 -0
  16. {docling-2.17.0 → docling-2.18.0}/docling/backend/__init__.py +0 -0
  17. {docling-2.17.0 → docling-2.18.0}/docling/backend/abstract_backend.py +0 -0
  18. {docling-2.17.0 → docling-2.18.0}/docling/backend/asciidoc_backend.py +0 -0
  19. {docling-2.17.0 → docling-2.18.0}/docling/backend/docling_parse_backend.py +0 -0
  20. {docling-2.17.0 → docling-2.18.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  21. {docling-2.17.0 → docling-2.18.0}/docling/backend/json/__init__.py +0 -0
  22. {docling-2.17.0 → docling-2.18.0}/docling/backend/json/docling_json_backend.py +0 -0
  23. {docling-2.17.0 → docling-2.18.0}/docling/backend/msexcel_backend.py +0 -0
  24. {docling-2.17.0 → docling-2.18.0}/docling/backend/pdf_backend.py +0 -0
  25. {docling-2.17.0 → docling-2.18.0}/docling/backend/pypdfium2_backend.py +0 -0
  26. {docling-2.17.0 → docling-2.18.0}/docling/backend/xml/__init__.py +0 -0
  27. {docling-2.17.0 → docling-2.18.0}/docling/backend/xml/pubmed_backend.py +0 -0
  28. {docling-2.17.0 → docling-2.18.0}/docling/backend/xml/uspto_backend.py +0 -0
  29. {docling-2.17.0 → docling-2.18.0}/docling/chunking/__init__.py +0 -0
  30. {docling-2.17.0 → docling-2.18.0}/docling/cli/__init__.py +0 -0
  31. {docling-2.17.0 → docling-2.18.0}/docling/cli/main.py +0 -0
  32. {docling-2.17.0 → docling-2.18.0}/docling/datamodel/__init__.py +0 -0
  33. {docling-2.17.0 → docling-2.18.0}/docling/datamodel/base_models.py +0 -0
  34. {docling-2.17.0 → docling-2.18.0}/docling/datamodel/pipeline_options.py +0 -0
  35. {docling-2.17.0 → docling-2.18.0}/docling/exceptions.py +0 -0
  36. {docling-2.17.0 → docling-2.18.0}/docling/models/__init__.py +0 -0
  37. {docling-2.17.0 → docling-2.18.0}/docling/models/base_model.py +0 -0
  38. {docling-2.17.0 → docling-2.18.0}/docling/models/base_ocr_model.py +0 -0
  39. {docling-2.17.0 → docling-2.18.0}/docling/models/code_formula_model.py +0 -0
  40. {docling-2.17.0 → docling-2.18.0}/docling/models/document_picture_classifier.py +0 -0
  41. {docling-2.17.0 → docling-2.18.0}/docling/models/ds_glm_model.py +0 -0
  42. {docling-2.17.0 → docling-2.18.0}/docling/models/easyocr_model.py +0 -0
  43. {docling-2.17.0 → docling-2.18.0}/docling/models/layout_model.py +0 -0
  44. {docling-2.17.0 → docling-2.18.0}/docling/models/ocr_mac_model.py +0 -0
  45. {docling-2.17.0 → docling-2.18.0}/docling/models/page_assemble_model.py +0 -0
  46. {docling-2.17.0 → docling-2.18.0}/docling/models/page_preprocessing_model.py +0 -0
  47. {docling-2.17.0 → docling-2.18.0}/docling/models/rapid_ocr_model.py +0 -0
  48. {docling-2.17.0 → docling-2.18.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  49. {docling-2.17.0 → docling-2.18.0}/docling/models/tesseract_ocr_model.py +0 -0
  50. {docling-2.17.0 → docling-2.18.0}/docling/pipeline/__init__.py +0 -0
  51. {docling-2.17.0 → docling-2.18.0}/docling/pipeline/simple_pipeline.py +0 -0
  52. {docling-2.17.0 → docling-2.18.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  53. {docling-2.17.0 → docling-2.18.0}/docling/py.typed +0 -0
  54. {docling-2.17.0 → docling-2.18.0}/docling/utils/__init__.py +0 -0
  55. {docling-2.17.0 → docling-2.18.0}/docling/utils/accelerator_utils.py +0 -0
  56. {docling-2.17.0 → docling-2.18.0}/docling/utils/export.py +0 -0
  57. {docling-2.17.0 → docling-2.18.0}/docling/utils/layout_postprocessor.py +0 -0
  58. {docling-2.17.0 → docling-2.18.0}/docling/utils/ocr_utils.py +0 -0
  59. {docling-2.17.0 → docling-2.18.0}/docling/utils/profiling.py +0 -0
  60. {docling-2.17.0 → docling-2.18.0}/docling/utils/utils.py +0 -0
  61. {docling-2.17.0 → docling-2.18.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.17.0
3
+ Version: 2.18.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.9
19
19
  Classifier: Programming Language :: Python :: 3.10
20
20
  Classifier: Programming Language :: Python :: 3.11
21
21
  Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
22
23
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
24
  Provides-Extra: ocrmac
24
25
  Provides-Extra: rapidocr
@@ -26,7 +27,7 @@ Provides-Extra: tesserocr
26
27
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
28
  Requires-Dist: certifi (>=2024.7.4)
28
29
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
- Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
30
+ Requires-Dist: docling-core[chunking] (>=2.17.0,<3.0.0)
30
31
  Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
31
32
  Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
32
33
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -48,7 +49,8 @@ Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
48
49
  Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
49
50
  Requires-Dist: requests (>=2.32.2,<3.0.0)
50
51
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
51
- Requires-Dist: scipy (>=1.6.0,<2.0.0)
52
+ Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
53
+ Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
52
54
  Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
53
55
  Requires-Dist: typer (>=0.12.5,<0.13.0)
54
56
  Project-URL: Repository, https://github.com/DS4SD/docling
@@ -94,6 +96,9 @@ Docling simplifies document processing, parsing diverse formats — including ad
94
96
  ### Coming soon
95
97
 
96
98
  * 📝 Metadata extraction, including title, authors, references & language
99
+ * 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
100
+ * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
101
+ * 📝 Complex chemistry understanding (Molecular structures)
97
102
 
98
103
  ## Installation
99
104
 
@@ -38,6 +38,9 @@ Docling simplifies document processing, parsing diverse formats — including ad
38
38
  ### Coming soon
39
39
 
40
40
  * 📝 Metadata extraction, including title, authors, references & language
41
+ * 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
42
+ * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
43
+ * 📝 Complex chemistry understanding (Molecular structures)
41
44
 
42
45
  ## Installation
43
46
 
@@ -1,9 +1,9 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Set, Union
4
+ from typing import Optional, Set, Union
5
5
 
6
- from bs4 import BeautifulSoup
6
+ from bs4 import BeautifulSoup, Tag
7
7
  from docling_core.types.doc import (
8
8
  DocItemLabel,
9
9
  DoclingDocument,
@@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
24
24
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
25
25
  super().__init__(in_doc, path_or_stream)
26
26
  _log.debug("About to init HTML backend...")
27
- self.soup = None
27
+ self.soup: Optional[Tag] = None
28
28
  # HTML file:
29
29
  self.path_or_stream = path_or_stream
30
30
  # Initialise the parents for the hierarchy
@@ -89,7 +89,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
89
89
  )
90
90
  return doc
91
91
 
92
- def walk(self, element, doc):
92
+ def walk(self, element: Tag, doc: DoclingDocument):
93
93
  try:
94
94
  # Iterate over elements in the body of the document
95
95
  for idx, element in enumerate(element.children):
@@ -106,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
106
106
 
107
107
  return doc
108
108
 
109
- def analyse_element(self, element, idx, doc):
109
+ def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
110
110
  """
111
111
  if element.name!=None:
112
112
  _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
@@ -136,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
136
136
  else:
137
137
  self.walk(element, doc)
138
138
 
139
- def get_direct_text(self, item):
139
+ def get_direct_text(self, item: Tag):
140
140
  """Get the direct text of the <li> element (ignoring nested lists)."""
141
141
  text = item.find(string=True, recursive=False)
142
142
  if isinstance(text, str):
@@ -145,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
145
145
  return ""
146
146
 
147
147
  # Function to recursively extract text from all child nodes
148
- def extract_text_recursively(self, item):
148
+ def extract_text_recursively(self, item: Tag):
149
149
  result = []
150
150
 
151
151
  if isinstance(item, str):
@@ -166,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
166
166
 
167
167
  return "".join(result) + " "
168
168
 
169
- def handle_header(self, element, idx, doc):
169
+ def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
170
170
  """Handles header tags (h1, h2, etc.)."""
171
171
  hlevel = int(element.name.replace("h", ""))
172
172
  slevel = hlevel - 1
@@ -208,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
208
208
  level=hlevel,
209
209
  )
210
210
 
211
- def handle_code(self, element, idx, doc):
211
+ def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
212
212
  """Handles monospace code snippets (pre)."""
213
213
  if element.text is None:
214
214
  return
@@ -216,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
216
216
  label = DocItemLabel.CODE
217
217
  if len(text) == 0:
218
218
  return
219
- doc.add_code(parent=self.parents[self.level], label=label, text=text)
219
+ doc.add_code(parent=self.parents[self.level], text=text)
220
220
 
221
- def handle_paragraph(self, element, idx, doc):
221
+ def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
222
222
  """Handles paragraph tags (p)."""
223
223
  if element.text is None:
224
224
  return
@@ -228,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
228
228
  return
229
229
  doc.add_text(parent=self.parents[self.level], label=label, text=text)
230
230
 
231
- def handle_list(self, element, idx, doc):
231
+ def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
232
232
  """Handles list tags (ul, ol) and their list items."""
233
233
 
234
234
  if element.name == "ul":
@@ -250,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
250
250
  self.parents[self.level + 1] = None
251
251
  self.level -= 1
252
252
 
253
- def handle_listitem(self, element, idx, doc):
253
+ def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
254
254
  """Handles listitem tags (li)."""
255
255
  nested_lists = element.find(["ul", "ol"])
256
256
 
@@ -304,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
304
304
  else:
305
305
  _log.warn("list-item has no text: ", element)
306
306
 
307
- def handle_table(self, element, idx, doc):
307
+ def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
308
308
  """Handles table tags."""
309
309
 
310
310
  nested_tables = element.find("table")
@@ -377,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
377
377
 
378
378
  doc.add_table(data=data, parent=self.parents[self.level])
379
379
 
380
- def get_list_text(self, list_element, level=0):
380
+ def get_list_text(self, list_element: Tag, level=0):
381
381
  """Recursively extract text from <ul> or <ol> with proper indentation."""
382
382
  result = []
383
383
  bullet_char = "*" # Default bullet character for unordered lists
@@ -403,7 +403,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
403
403
 
404
404
  return result
405
405
 
406
- def extract_table_cell_text(self, cell):
406
+ def extract_table_cell_text(self, cell: Tag):
407
407
  """Extract text from a table cell, including lists with indents."""
408
408
  contains_lists = cell.find(["ul", "ol"])
409
409
  if contains_lists is None:
@@ -414,7 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
414
414
  )
415
415
  return cell.text
416
416
 
417
- def handle_figure(self, element, idx, doc):
417
+ def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
418
418
  """Handles image tags (img)."""
419
419
 
420
420
  # Extract the image URI from the <img> tag
@@ -437,6 +437,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
437
437
  caption=fig_caption,
438
438
  )
439
439
 
440
- def handle_image(self, element, idx, doc):
440
+ def handle_image(self, element: Tag, idx, doc: DoclingDocument):
441
441
  """Handles image tags (img)."""
442
442
  doc.add_picture(parent=self.parents[self.level], caption=None)
@@ -6,6 +6,7 @@ from pathlib import Path
6
6
  from typing import List, Optional, Set, Union
7
7
 
8
8
  import marko
9
+ import marko.element
9
10
  import marko.ext
10
11
  import marko.ext.gfm
11
12
  import marko.inline
@@ -23,11 +24,16 @@ from docling_core.types.doc import (
23
24
  from marko import Markdown
24
25
 
25
26
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
+ from docling.backend.html_backend import HTMLDocumentBackend
26
28
  from docling.datamodel.base_models import InputFormat
27
29
  from docling.datamodel.document import InputDocument
28
30
 
29
31
  _log = logging.getLogger(__name__)
30
32
 
33
+ _MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
34
+ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
35
+ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
36
+
31
37
 
32
38
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
33
39
  def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
@@ -66,6 +72,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
66
72
  self.in_table = False
67
73
  self.md_table_buffer: list[str] = []
68
74
  self.inline_texts: list[str] = []
75
+ self._html_blocks: int = 0
69
76
 
70
77
  try:
71
78
  if isinstance(self.path_or_stream, BytesIO):
@@ -163,14 +170,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
163
170
 
164
171
  def iterate_elements(
165
172
  self,
166
- element: marko.block.Element,
173
+ element: marko.element.Element,
167
174
  depth: int,
168
175
  doc: DoclingDocument,
169
176
  parent_element: Optional[NodeItem] = None,
170
177
  ):
171
178
  # Iterates over all elements in the AST
172
179
  # Check for different element types and process relevant details
173
- if isinstance(element, marko.block.Heading):
180
+ if isinstance(element, marko.block.Heading) and len(element.children) > 0:
174
181
  self.close_table(doc)
175
182
  self.process_inline_text(parent_element, doc)
176
183
  _log.debug(
@@ -205,17 +212,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
205
212
  )
206
213
 
207
214
  elif isinstance(element, marko.block.List):
215
+ has_non_empty_list_items = False
216
+ for child in element.children:
217
+ if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
218
+ has_non_empty_list_items = True
219
+ break
220
+
208
221
  self.close_table(doc)
209
222
  self.process_inline_text(parent_element, doc)
210
223
  _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
211
- list_label = GroupLabel.LIST
212
- if element.ordered:
213
- list_label = GroupLabel.ORDERED_LIST
214
- parent_element = doc.add_group(
215
- label=list_label, name=f"list", parent=parent_element
216
- )
224
+ if has_non_empty_list_items:
225
+ label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
226
+ parent_element = doc.add_group(
227
+ label=label, name=f"list", parent=parent_element
228
+ )
217
229
 
218
- elif isinstance(element, marko.block.ListItem):
230
+ elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
219
231
  self.close_table(doc)
220
232
  self.process_inline_text(parent_element, doc)
221
233
  _log.debug(" - List item")
@@ -245,20 +257,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
245
257
 
246
258
  doc.add_picture(parent=parent_element, caption=fig_caption)
247
259
 
248
- elif isinstance(element, marko.block.Paragraph):
260
+ elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
249
261
  self.process_inline_text(parent_element, doc)
250
262
 
251
263
  elif isinstance(element, marko.inline.RawText):
252
264
  _log.debug(f" - Paragraph (raw text): {element.children}")
253
- snippet_text = str(element.children).strip()
265
+ snippet_text = element.children.strip()
254
266
  # Detect start of the table:
255
267
  if "|" in snippet_text:
256
268
  # most likely part of the markdown table
257
269
  self.in_table = True
258
270
  if len(self.md_table_buffer) > 0:
259
- self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
260
- snippet_text
261
- )
271
+ self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
262
272
  else:
263
273
  self.md_table_buffer.append(snippet_text)
264
274
  else:
@@ -274,18 +284,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
274
284
  snippet_text = str(element.children).strip()
275
285
  doc.add_code(parent=parent_element, text=snippet_text)
276
286
 
277
- elif isinstance(element, marko.block.CodeBlock):
287
+ elif (
288
+ isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
289
+ and len(element.children) > 0
290
+ and isinstance((first_child := element.children[0]), marko.inline.RawText)
291
+ and len(snippet_text := (first_child.children.strip())) > 0
292
+ ):
278
293
  self.close_table(doc)
279
294
  self.process_inline_text(parent_element, doc)
280
295
  _log.debug(f" - Code Block: {element.children}")
281
- snippet_text = str(element.children[0].children).strip() # type: ignore
282
- doc.add_code(parent=parent_element, text=snippet_text)
283
-
284
- elif isinstance(element, marko.block.FencedCode):
285
- self.close_table(doc)
286
- self.process_inline_text(parent_element, doc)
287
- _log.debug(f" - Code Block: {element.children}")
288
- snippet_text = str(element.children[0].children).strip() # type: ignore
289
296
  doc.add_code(parent=parent_element, text=snippet_text)
290
297
 
291
298
  elif isinstance(element, marko.inline.LineBreak):
@@ -294,29 +301,38 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
294
301
  self.md_table_buffer.append("")
295
302
 
296
303
  elif isinstance(element, marko.block.HTMLBlock):
304
+ self._html_blocks += 1
297
305
  self.process_inline_text(parent_element, doc)
298
306
  self.close_table(doc)
299
307
  _log.debug("HTML Block: {}".format(element))
300
308
  if (
301
- len(element.children) > 0
309
+ len(element.body) > 0
302
310
  ): # If Marko doesn't return any content for HTML block, skip it
303
- snippet_text = str(element.children).strip()
304
- doc.add_text(
305
- label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
306
- )
311
+ html_block = element.body.strip()
312
+
313
+ # wrap in markers to enable post-processing in convert()
314
+ text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
315
+ doc.add_code(parent=parent_element, text=text_to_add)
307
316
  else:
308
317
  if not isinstance(element, str):
309
318
  self.close_table(doc)
310
319
  _log.debug("Some other element: {}".format(element))
311
320
 
321
+ processed_block_types = (
322
+ marko.block.ListItem,
323
+ marko.block.Heading,
324
+ marko.block.CodeBlock,
325
+ marko.block.FencedCode,
326
+ # marko.block.Paragraph,
327
+ marko.inline.RawText,
328
+ )
329
+
312
330
  # Iterate through the element's children (if any)
313
- if not isinstance(element, marko.block.ListItem):
314
- if not isinstance(element, marko.block.Heading):
315
- if not isinstance(element, marko.block.FencedCode):
316
- # if not isinstance(element, marko.block.Paragraph):
317
- if hasattr(element, "children"):
318
- for child in element.children:
319
- self.iterate_elements(child, depth + 1, doc, parent_element)
331
+ if hasattr(element, "children") and not isinstance(
332
+ element, processed_block_types
333
+ ):
334
+ for child in element.children:
335
+ self.iterate_elements(child, depth + 1, doc, parent_element)
320
336
 
321
337
  def is_valid(self) -> bool:
322
338
  return self.valid
@@ -352,6 +368,43 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
352
368
  # Start iterating from the root of the AST
353
369
  self.iterate_elements(parsed_ast, 0, doc, None)
354
370
  self.process_inline_text(None, doc) # handle last hanging inline text
371
+ self.close_table(doc=doc) # handle any last hanging table
372
+
373
+ # if HTML blocks were detected, export to HTML and delegate to HTML backend
374
+ if self._html_blocks > 0:
375
+
376
+ # export to HTML
377
+ html_backend_cls = HTMLDocumentBackend
378
+ html_str = doc.export_to_html()
379
+
380
+ def _restore_original_html(txt, regex):
381
+ _txt, count = re.subn(regex, "", txt)
382
+ if count != self._html_blocks:
383
+ raise RuntimeError(
384
+ "An internal error has occurred during Markdown conversion."
385
+ )
386
+ return _txt
387
+
388
+ # restore original HTML by removing previouly added markers
389
+ for regex in [
390
+ rf"<pre>\s*<code>\s*{_START_MARKER}",
391
+ rf"{_STOP_MARKER}\s*</code>\s*</pre>",
392
+ ]:
393
+ html_str = _restore_original_html(txt=html_str, regex=regex)
394
+ self._html_blocks = 0
395
+
396
+ # delegate to HTML backend
397
+ stream = BytesIO(bytes(html_str, encoding="utf-8"))
398
+ in_doc = InputDocument(
399
+ path_or_stream=stream,
400
+ format=InputFormat.HTML,
401
+ backend=html_backend_cls,
402
+ filename=self.file.name,
403
+ )
404
+ html_backend_obj = html_backend_cls(
405
+ in_doc=in_doc, path_or_stream=stream
406
+ )
407
+ doc = html_backend_obj.convert()
355
408
  else:
356
409
  raise RuntimeError(
357
410
  f"Cannot convert md with {self.document_hash} because the backend failed to init."
@@ -98,21 +98,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
98
98
 
99
99
  return doc
100
100
 
101
- def generate_prov(self, shape, slide_ind, text=""):
102
- left = shape.left
103
- top = shape.top
104
- width = shape.width
105
- height = shape.height
101
+ def generate_prov(
102
+ self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
103
+ ):
104
+ if shape.left:
105
+ left = shape.left
106
+ top = shape.top
107
+ width = shape.width
108
+ height = shape.height
109
+ else:
110
+ left = 0
111
+ top = 0
112
+ width = slide_size.width
113
+ height = slide_size.height
106
114
  shape_bbox = [left, top, left + width, top + height]
107
115
  shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
108
- # prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
109
116
  prov = ProvenanceItem(
110
117
  page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
111
118
  )
112
119
 
113
120
  return prov
114
121
 
115
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
122
+ def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
116
123
  is_a_list = False
117
124
  is_list_group_created = False
118
125
  enum_list_item_value = 0
@@ -121,7 +128,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
121
128
  list_text = ""
122
129
  list_label = GroupLabel.LIST
123
130
  doc_label = DocItemLabel.LIST_ITEM
124
- prov = self.generate_prov(shape, slide_ind, shape.text.strip())
131
+ prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
125
132
 
126
133
  # Identify if shape contains lists
127
134
  for paragraph in shape.text_frame.paragraphs:
@@ -270,18 +277,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
270
277
  )
271
278
  return
272
279
 
273
- def handle_pictures(self, shape, parent_slide, slide_ind, doc):
274
- # Get the image bytes
275
- image = shape.image
276
- image_bytes = image.blob
277
- im_dpi, _ = image.dpi
278
-
280
+ def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
279
281
  # Open it with PIL
280
282
  try:
283
+ # Get the image bytes
284
+ image = shape.image
285
+ image_bytes = image.blob
286
+ im_dpi, _ = image.dpi
281
287
  pil_image = Image.open(BytesIO(image_bytes))
282
288
 
283
289
  # shape has picture
284
- prov = self.generate_prov(shape, slide_ind, "")
290
+ prov = self.generate_prov(shape, slide_ind, "", slide_size)
285
291
  doc.add_picture(
286
292
  parent=parent_slide,
287
293
  image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
@@ -292,13 +298,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
292
298
  _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
293
299
  return
294
300
 
295
- def handle_tables(self, shape, parent_slide, slide_ind, doc):
301
+ def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
296
302
  # Handling tables, images, charts
297
303
  if shape.has_table:
298
304
  table = shape.table
299
305
  table_xml = shape._element
300
306
 
301
- prov = self.generate_prov(shape, slide_ind, "")
307
+ prov = self.generate_prov(shape, slide_ind, "", slide_size)
302
308
 
303
309
  num_cols = 0
304
310
  num_rows = len(table.rows)
@@ -375,17 +381,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
375
381
  name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
376
382
  )
377
383
 
378
- size = Size(width=slide_width, height=slide_height)
379
- parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
384
+ slide_size = Size(width=slide_width, height=slide_height)
385
+ parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
380
386
 
381
- def handle_shapes(shape, parent_slide, slide_ind, doc):
382
- handle_groups(shape, parent_slide, slide_ind, doc)
387
+ def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
388
+ handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
383
389
  if shape.has_table:
384
390
  # Handle Tables
385
- self.handle_tables(shape, parent_slide, slide_ind, doc)
391
+ self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
386
392
  if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
387
393
  # Handle Pictures
388
- self.handle_pictures(shape, parent_slide, slide_ind, doc)
394
+ self.handle_pictures(
395
+ shape, parent_slide, slide_ind, doc, slide_size
396
+ )
389
397
  # If shape doesn't have any text, move on to the next shape
390
398
  if not hasattr(shape, "text"):
391
399
  return
@@ -397,16 +405,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
397
405
  _log.warning("Warning: shape has text but not text_frame")
398
406
  return
399
407
  # Handle other text elements, including lists (bullet lists, numbered lists)
400
- self.handle_text_elements(shape, parent_slide, slide_ind, doc)
408
+ self.handle_text_elements(
409
+ shape, parent_slide, slide_ind, doc, slide_size
410
+ )
401
411
  return
402
412
 
403
- def handle_groups(shape, parent_slide, slide_ind, doc):
413
+ def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
404
414
  if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
405
415
  for groupedshape in shape.shapes:
406
- handle_shapes(groupedshape, parent_slide, slide_ind, doc)
416
+ handle_shapes(
417
+ groupedshape, parent_slide, slide_ind, doc, slide_size
418
+ )
407
419
 
408
420
  # Loop through each shape in the slide
409
421
  for shape in slide.shapes:
410
- handle_shapes(shape, parent_slide, slide_ind, doc)
422
+ handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
411
423
 
412
424
  return doc