docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,618 @@
1
+ import logging
2
+ import re
3
+ import warnings
4
+ from copy import deepcopy
5
+ from enum import Enum
6
+ from html import unescape
7
+ from io import BytesIO
8
+ from pathlib import Path
9
+ from typing import Literal, Optional, Union, cast
10
+
11
+ import marko
12
+ import marko.element
13
+ import marko.inline
14
+ from docling_core.types.doc import (
15
+ ContentLayer,
16
+ DocItem,
17
+ DocItemLabel,
18
+ DoclingDocument,
19
+ DocumentOrigin,
20
+ ListItem,
21
+ NodeItem,
22
+ TableCell,
23
+ TableData,
24
+ TextItem,
25
+ )
26
+ from docling_core.types.doc.document import Formatting
27
+ from marko import Markdown
28
+ from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
29
+ from typing_extensions import Annotated, override
30
+
31
+ from docling.backend.abstract_backend import (
32
+ DeclarativeDocumentBackend,
33
+ )
34
+ from docling.backend.html_backend import HTMLDocumentBackend
35
+ from docling.datamodel.backend_options import (
36
+ HTMLBackendOptions,
37
+ MarkdownBackendOptions,
38
+ )
39
+ from docling.datamodel.base_models import InputFormat
40
+ from docling.datamodel.document import InputDocument
41
+
42
+ _log = logging.getLogger(__name__)
43
+
44
+ _MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
45
+ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
46
+ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
47
+
48
+
49
+ class _PendingCreationType(str, Enum):
50
+ """CoordOrigin."""
51
+
52
+ HEADING = "heading"
53
+ LIST_ITEM = "list_item"
54
+
55
+
56
class _HeadingCreationPayload(BaseModel):
    """Request to create a heading later, once its text has been reached."""

    # Discriminator tag; always "heading" for this payload.
    kind: Literal["heading"] = "heading"
    # Heading level taken from the marko heading element.
    level: int
59
+
60
+
61
class _ListItemCreationPayload(BaseModel):
    """Request to create a list item later, once its text has been reached."""

    # Discriminator tag; always "list_item" for this payload.
    kind: Literal["list_item"] = "list_item"
    # Whether the enclosing list was recorded as ordered.
    enumerated: bool
64
+
65
+
66
# Tagged union of all deferred-creation payloads; the ``kind`` field is the
# discriminator that tells the member models apart.
_CreationPayload = Annotated[
    Union[_HeadingCreationPayload, _ListItemCreationPayload],
    Field(discriminator="kind"),
]
73
+
74
+
75
+ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
76
+ def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
77
+ # This regex will match any sequence of underscores
78
+ pattern = r"_+"
79
+
80
+ def replace_match(match):
81
+ underscore_sequence = match.group(
82
+ 0
83
+ ) # Get the full match (sequence of underscores)
84
+
85
+ # Shorten the sequence if it exceeds max_length
86
+ if len(underscore_sequence) > max_length:
87
+ return "_" * max_length
88
+ else:
89
+ return underscore_sequence # Leave it unchanged if it is shorter or equal to max_length
90
+
91
+ # Use re.sub to replace long underscore sequences
92
+ shortened_text = re.sub(pattern, replace_match, markdown_text)
93
+
94
+ if len(shortened_text) != len(markdown_text):
95
+ warnings.warn("Detected potentially incorrect Markdown, correcting...")
96
+
97
+ return shortened_text
98
+
99
@override
def __init__(
    self,
    in_doc: InputDocument,
    path_or_stream: Union[BytesIO, Path],
    options: MarkdownBackendOptions = MarkdownBackendOptions(),
):
    """Read the Markdown source and initialise the parsing state.

    The source is decoded as UTF-8 (from the in-memory stream or from the
    file on disk) and passed through ``_shorten_underscore_sequences``
    before being stored in ``self.markdown``.

    Args:
        in_doc: Input document descriptor, forwarded to the base class.
        path_or_stream: In-memory stream or filesystem path of the source.
        options: Backend options, forwarded to the base class.

    Raises:
        RuntimeError: If reading, decoding or normalizing the source raises
            any exception; the original exception is chained as the cause.
    """
    super().__init__(in_doc, path_or_stream, options)

    _log.debug("Starting MarkdownDocumentBackend...")

    # Markdown file:
    self.path_or_stream = path_or_stream
    self.valid = True
    self.markdown = ""  # To store original Markdown string

    # Table accumulation state and counter of raw HTML blocks seen so far.
    self.in_table = False
    self.md_table_buffer: list[str] = []
    self._html_blocks: int = 0

    try:
        source = self.path_or_stream
        raw_text: Optional[str] = None
        if isinstance(source, BytesIO):
            raw_text = source.getvalue().decode("utf-8")
        elif isinstance(source, Path):
            with open(source, encoding="utf-8") as handle:
                raw_text = handle.read()

        if raw_text is not None:
            # Remove invalid sequences: very long runs of underscores lead
            # to unnecessarily long processing times. In any proper
            # Markdown file underscores have to be escaped, otherwise they
            # represent emphasis (bold or italic).
            self.markdown = self._shorten_underscore_sequences(raw_text)

        _log.debug(self.markdown)
    except Exception as e:
        raise RuntimeError(
            f"Could not initialize MD backend for file with hash {self.document_hash}."
        ) from e
143
+
144
def _close_table(self, doc: DoclingDocument):
    """Turn the buffered Markdown table rows into a table item of ``doc``.

    Does nothing unless a table is currently being accumulated
    (``self.in_table``). Otherwise the buffer is parsed, the table state is
    reset, and a table is added to ``doc`` if at least one cell was found.

    Buffer row 0 is used as the header row, row 1 (the delimiter row of a
    Markdown table) is skipped, and all following rows are body rows. Each
    row is split on "|" and the first and last fragments are discarded, so
    rows are expected to start and end with a pipe. Only simple tables are
    supported: every cell spans exactly one row and one column.

    Args:
        doc: Document that receives the table.
    """
    if not self.in_table:
        return

    _log.debug("=== TABLE START ===")
    for md_table_row in self.md_table_buffer:
        _log.debug(md_table_row)
    _log.debug("=== TABLE END ===")

    # Parse header (index 0) and body rows (index > 1) into stripped cells.
    result_table: list[list[str]] = [
        [fragment.strip() for fragment in md_table_row.split("|")[1:-1]]
        for n, md_table_row in enumerate(self.md_table_buffer)
        if n != 1
    ]

    # Reset the accumulation state: the buffer has been consumed.
    self.in_table = False
    self.md_table_buffer = []  # clean table markdown buffer

    tcells: list[TableCell] = []
    for trow_ind, trow in enumerate(result_table):
        for tcol_ind, cellval in enumerate(trow):
            tcells.append(
                TableCell(
                    text=cellval,
                    row_span=1,  # simple tables only (no spans)
                    col_span=1,  # simple tables only (no spans)
                    start_row_offset_idx=trow_ind,
                    end_row_offset_idx=trow_ind + 1,
                    start_col_offset_idx=tcol_ind,
                    end_col_offset_idx=tcol_ind + 1,
                    column_header=trow_ind == 0,
                    row_header=False,
                )
            )

    # Nothing parseable: no table is added. Returning here also avoids
    # indexing result_table[0] when there are no rows at all.
    if not tcells:
        return

    # The cells are handed over exactly once, via the constructor. Appending
    # them to table_data.table_cells again afterwards would store every cell
    # twice (or never terminate if the model kept the very same list object).
    table_data = TableData(
        num_rows=len(result_table),
        num_cols=len(result_table[0]),
        table_cells=tcells,
    )
    doc.add_table(data=table_data)
200
+
201
def _create_list_item(
    self,
    doc: DoclingDocument,
    parent_item: Optional[NodeItem],
    text: str,
    enumerated: bool,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
    """Add a list item to ``doc`` under ``parent_item`` and return it.

    Thin wrapper around ``doc.add_list_item`` that maps ``parent_item`` to
    the ``parent`` keyword and forwards everything else unchanged.
    """
    return doc.add_list_item(
        parent=parent_item,
        text=text,
        enumerated=enumerated,
        formatting=formatting,
        hyperlink=hyperlink,
    )
218
+
219
def _create_heading_item(
    self,
    doc: DoclingDocument,
    parent_item: Optional[NodeItem],
    text: str,
    level: int,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
    """Add a title or heading to ``doc`` and return the created item.

    A ``level`` of 1 becomes the document title (``doc.add_title``). Any
    other level becomes a heading created with ``level - 1``
    (``doc.add_heading``).
    """
    common_kwargs = dict(
        text=text,
        parent=parent_item,
        formatting=formatting,
        hyperlink=hyperlink,
    )
    if level == 1:
        return doc.add_title(**common_kwargs)
    # Level 1 is taken by the title, so heading levels are shifted down by one.
    return doc.add_heading(level=level - 1, **common_kwargs)
244
+
245
def _iterate_elements(  # noqa: C901
    self,
    *,
    element: marko.element.Element,
    depth: int,
    doc: DoclingDocument,
    visited: set[marko.element.Element],
    creation_stack: list[
        _CreationPayload
    ],  # stack for lazy item creation triggered deep in marko's AST (on RawText)
    list_ordered_flag_by_ref: dict[str, bool],
    list_last_item_by_ref: dict[str, ListItem],
    parent_item: Optional[NodeItem] = None,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
    """Process one marko AST node, then recurse into its children.

    Depending on the node type this adds headings, list groups, list items,
    pictures, text, code or marked-up HTML blocks to ``doc``, or updates the
    ``formatting`` / ``hyperlink`` that is handed down to the children.
    Text containing "|" is collected in ``self.md_table_buffer`` instead and
    turned into a table by ``_close_table``.

    Headings and list items whose content is a single inline child are not
    created immediately: a payload is pushed on ``creation_stack`` and the
    item is created when the text node carrying the content is reached.

    Args:
        element: AST node to process.
        depth: Nesting depth; only incremented and passed on to children.
        doc: Document being populated.
        visited: Nodes to skip; this method only reads the set.
        creation_stack: Pending heading / list item creations.
        list_ordered_flag_by_ref: Ordered flag per list group ``self_ref``.
        list_last_item_by_ref: Most recent list item per list group
            ``self_ref``; used to attach nested lists to that item.
        parent_item: Item under which new items are created.
        formatting: Formatting inherited from enclosing emphasis nodes.
        hyperlink: Link target inherited from an enclosing link node.
    """
    if element in visited:
        return

    heading_types = (marko.block.Heading, marko.block.SetextHeading)

    # Dispatch on the element type; the order of the branches matters.
    if isinstance(element, heading_types) and len(element.children) > 0:
        self._close_table(doc)
        _log.debug(
            f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
        )

        if len(element.children) > 1:  # inline group will be created further down
            parent_item = self._create_heading_item(
                doc=doc,
                parent_item=parent_item,
                text="",
                level=element.level,
                formatting=formatting,
                hyperlink=hyperlink,
            )
        else:
            creation_stack.append(_HeadingCreationPayload(level=element.level))

    elif isinstance(element, marko.block.List):
        has_non_empty_list_items = any(
            isinstance(entry, marko.block.ListItem) and len(entry.children) > 0
            for entry in element.children
        )

        self._close_table(doc)
        _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
        if has_non_empty_list_items:
            parent_item = doc.add_list_group(name="list", parent=parent_item)
            list_ordered_flag_by_ref[parent_item.self_ref] = element.ordered

    elif (
        isinstance(element, marko.block.ListItem)
        and len(element.children) > 0
        and isinstance(
            (first_block := element.children[0]), marko.block.Paragraph
        )
        and len(first_block.children) > 0
    ):
        self._close_table(doc)
        _log.debug(" - List item")

        enumerated = False
        if parent_item:
            enumerated = list_ordered_flag_by_ref.get(parent_item.self_ref, False)

        non_list_children: list[marko.element.Element] = [
            item
            for item in first_block.children
            if not isinstance(item, marko.block.ListItem)
        ]
        if len(non_list_children) > 1:  # inline group will be created further down
            parent_ref: Optional[str] = None
            if parent_item:
                parent_ref = parent_item.self_ref
            parent_item = self._create_list_item(
                doc=doc,
                parent_item=parent_item,
                text="",
                enumerated=enumerated,
                formatting=formatting,
                hyperlink=hyperlink,
            )
            if parent_ref:
                list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
        else:
            creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))

    elif isinstance(element, marko.inline.Image):
        self._close_table(doc)
        _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")

        fig_caption: Optional[TextItem] = None
        if element.title is not None and element.title != "":
            fig_caption = doc.add_text(
                label=DocItemLabel.CAPTION,
                text=unescape(element.title),
                formatting=formatting,
                hyperlink=hyperlink,
            )

        doc.add_picture(parent=parent_item, caption=fig_caption)

    elif isinstance(element, marko.inline.Emphasis):
        _log.debug(f" - Emphasis: {element.children}")
        # Copy so that the caller's formatting object is not modified.
        formatting = deepcopy(formatting) if formatting else Formatting()
        formatting.italic = True

    elif isinstance(element, marko.inline.StrongEmphasis):
        _log.debug(f" - StrongEmphasis: {element.children}")
        # Copy so that the caller's formatting object is not modified.
        formatting = deepcopy(formatting) if formatting else Formatting()
        formatting.bold = True

    elif isinstance(element, marko.inline.Link):
        _log.debug(f" - Link: {element.children}")
        link_adapter = TypeAdapter(Optional[Union[AnyUrl, Path]])
        hyperlink = link_adapter.validate_python(element.dest)

    elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
        _log.debug(f" - RawText/Literal: {element.children}")
        raw_content = element.children
        snippet_text = unescape(
            raw_content.strip() if isinstance(raw_content, str) else ""
        )
        # Detect start of the table:
        if "|" in snippet_text or self.in_table:
            # most likely part of the markdown table
            self.in_table = True
            if self.md_table_buffer:
                # continue the row that is currently being collected
                self.md_table_buffer[-1] += snippet_text
            else:
                self.md_table_buffer.append(snippet_text)
        elif snippet_text:
            self._close_table(doc)

            if not creation_stack:
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    parent=parent_item,
                    text=snippet_text,
                    formatting=formatting,
                    hyperlink=hyperlink,
                )
            else:
                # Create every pending item, most recently pushed first.
                while creation_stack:
                    to_create = creation_stack.pop()
                    if isinstance(to_create, _ListItemCreationPayload):
                        enumerated = False
                        if parent_item:
                            enumerated = list_ordered_flag_by_ref.get(
                                parent_item.self_ref, False
                            )
                        parent_ref = parent_item.self_ref if parent_item else None
                        parent_item = self._create_list_item(
                            doc=doc,
                            parent_item=parent_item,
                            text=snippet_text,
                            enumerated=enumerated,
                            formatting=formatting,
                            hyperlink=hyperlink,
                        )
                        if parent_ref:
                            list_last_item_by_ref[parent_ref] = cast(
                                ListItem, parent_item
                            )

                    elif isinstance(to_create, _HeadingCreationPayload):
                        # not keeping as parent_item as logic for correctly tracking
                        # that not implemented yet (section components not captured
                        # as heading children in marko)
                        self._create_heading_item(
                            doc=doc,
                            parent_item=parent_item,
                            text=snippet_text,
                            level=to_create.level,
                            formatting=formatting,
                            hyperlink=hyperlink,
                        )

    elif isinstance(element, marko.inline.CodeSpan):
        self._close_table(doc)
        _log.debug(f" - Code Span: {element.children}")
        doc.add_code(
            parent=parent_item,
            text=str(element.children).strip(),
            formatting=formatting,
            hyperlink=hyperlink,
        )

    elif (
        isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
        and len(element.children) > 0
        and isinstance((code_child := element.children[0]), marko.inline.RawText)
        and len(code_text := (code_child.children.strip())) > 0
    ):
        self._close_table(doc)
        _log.debug(f" - Code Block: {element.children}")
        doc.add_code(
            parent=parent_item,
            text=code_text,
            formatting=formatting,
            hyperlink=hyperlink,
        )

    elif isinstance(element, marko.inline.LineBreak):
        if self.in_table:
            _log.debug("Line break in a table")
            # start a new (still empty) table row
            self.md_table_buffer.append("")

    elif isinstance(element, marko.block.HTMLBlock):
        self._html_blocks += 1
        self._close_table(doc)
        _log.debug(f"HTML Block: {element}")
        # If Marko doesn't return any content for HTML block, skip it
        if len(element.body) > 0:
            html_block = element.body.strip()

            # wrap in markers to enable post-processing in convert()
            doc.add_code(
                parent=parent_item,
                text=f"{_START_MARKER}{html_block}{_STOP_MARKER}",
                formatting=formatting,
                hyperlink=hyperlink,
            )

    elif not isinstance(element, str):
        self._close_table(doc)
        _log.debug(f"Some other element: {element}")

    if (
        isinstance(element, (marko.block.Paragraph, marko.block.Heading))
        and len(element.children) > 1
    ):
        parent_item = doc.add_inline_group(parent=parent_item)

    # Element types whose content has been consumed above: no descent.
    processed_block_types = (
        marko.block.CodeBlock,
        marko.block.FencedCode,
        marko.inline.RawText,
    )
    if not hasattr(element, "children") or isinstance(
        element, processed_block_types
    ):
        return

    # Iterate through the element's children
    for child in element.children:
        if (
            isinstance(element, marko.block.ListItem)
            and isinstance(child, marko.block.List)
            and parent_item
            and list_last_item_by_ref.get(parent_item.self_ref, None)
        ):
            _log.debug(
                f"walking into new List hanging from item of parent list {parent_item.self_ref}"
            )
            # nested list: hang it from the last item of the parent list
            parent_item = list_last_item_by_ref[parent_item.self_ref]

        self._iterate_elements(
            element=child,
            depth=depth + 1,
            doc=doc,
            visited=visited,
            creation_stack=creation_stack,
            list_ordered_flag_by_ref=list_ordered_flag_by_ref,
            list_last_item_by_ref=list_last_item_by_ref,
            parent_item=parent_item,
            formatting=formatting,
            hyperlink=hyperlink,
        )
524
+
525
def is_valid(self) -> bool:
    """Return the backend's ``valid`` flag."""
    is_usable = self.valid
    return is_usable
527
+
528
def unload(self):
    """Close an in-memory source stream and drop the reference to the source."""
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()
    # Paths need no closing; in every case the reference is cleared.
    self.path_or_stream = None
532
+
533
+ @classmethod
534
+ def supports_pagination(cls) -> bool:
535
+ return False
536
+
537
@classmethod
def supported_formats(cls) -> set[InputFormat]:
    """Return the input formats handled by this backend (Markdown only)."""
    formats = {InputFormat.MD}
    return formats
540
+
541
def convert(self) -> DoclingDocument:
    """Convert the loaded Markdown into a ``DoclingDocument``.

    The Markdown is parsed with marko and the resulting AST is walked to
    populate the document. If raw HTML blocks were encountered, the document
    is exported to HTML, the marker wrappers placed around those blocks are
    removed, and the result is converted by ``HTMLDocumentBackend``; that
    backend's document is then returned instead.

    Returns:
        The converted document.

    Raises:
        RuntimeError: If the backend is not valid, or if the number of
            marker wrappers found in the exported HTML differs from the
            number of HTML blocks counted while walking the AST.
    """
    _log.debug("converting Markdown...")

    origin = DocumentOrigin(
        filename=self.file.name or "file",
        mimetype="text/markdown",
        binary_hash=self.document_hash,
    )
    doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

    if not self.is_valid():
        raise RuntimeError(
            f"Cannot convert md with {self.document_hash} because the backend failed to init."
        )

    # Parse the markdown into an abstract syntax tree (AST) and walk it
    # starting from the root.
    parsed_ast = Markdown().parse(self.markdown)
    self._iterate_elements(
        element=parsed_ast,
        depth=0,
        doc=doc,
        parent_item=None,
        visited=set(),
        creation_stack=[],
        list_ordered_flag_by_ref={},
        list_last_item_by_ref={},
    )
    self._close_table(doc=doc)  # handle any last hanging table

    # Without HTML blocks the document built above is the final result.
    if self._html_blocks <= 0:
        return doc

    # HTML blocks were detected: export to HTML and delegate to HTML backend
    html_backend_cls = HTMLDocumentBackend
    html_str = doc.export_to_html()

    # restore original HTML by removing previously added markers
    marker_wrappers = (
        rf"<pre>\s*<code>\s*{_START_MARKER}",
        rf"{_STOP_MARKER}\s*</code>\s*</pre>",
    )
    for wrapper_regex in marker_wrappers:
        html_str, removed = re.subn(wrapper_regex, "", html_str)
        # each HTML block must yield exactly one start and one stop wrapper
        if removed != self._html_blocks:
            raise RuntimeError(
                "An internal error has occurred during Markdown conversion."
            )
    self._html_blocks = 0

    # delegate to HTML backend, carrying over the relevant fetch options
    stream = BytesIO(bytes(html_str, encoding="utf-8"))
    md_options = cast(MarkdownBackendOptions, self.options)
    html_options = HTMLBackendOptions(
        enable_local_fetch=md_options.enable_local_fetch,
        enable_remote_fetch=md_options.enable_remote_fetch,
        fetch_images=md_options.fetch_images,
        source_uri=md_options.source_uri,
        infer_furniture=False,
        add_title=False,
    )
    in_doc = InputDocument(
        path_or_stream=stream,
        format=InputFormat.HTML,
        backend=html_backend_cls,
        filename=self.file.name,
        backend_options=html_options,
    )
    html_backend_obj = html_backend_cls(
        in_doc=in_doc,
        path_or_stream=stream,
        options=html_options,
    )
    return html_backend_obj.convert()