docling 2.17.0__tar.gz → 2.19.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.17.0 → docling-2.19.0}/PKG-INFO +11 -5
- {docling-2.17.0 → docling-2.19.0}/README.md +3 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/html_backend.py +18 -18
- {docling-2.17.0 → docling-2.19.0}/docling/backend/md_backend.py +144 -75
- {docling-2.17.0 → docling-2.19.0}/docling/backend/mspowerpoint_backend.py +39 -27
- {docling-2.17.0 → docling-2.19.0}/docling/backend/msword_backend.py +173 -131
- {docling-2.17.0 → docling-2.19.0}/docling/cli/main.py +8 -0
- docling-2.19.0/docling/cli/models.py +105 -0
- docling-2.19.0/docling/cli/tools.py +17 -0
- {docling-2.17.0 → docling-2.19.0}/docling/datamodel/document.py +2 -0
- {docling-2.17.0 → docling-2.19.0}/docling/datamodel/settings.py +18 -1
- {docling-2.17.0 → docling-2.19.0}/docling/document_converter.py +12 -2
- {docling-2.17.0 → docling-2.19.0}/docling/models/base_model.py +3 -0
- {docling-2.17.0 → docling-2.19.0}/docling/models/code_formula_model.py +15 -9
- {docling-2.17.0 → docling-2.19.0}/docling/models/document_picture_classifier.py +11 -8
- {docling-2.17.0 → docling-2.19.0}/docling/models/easyocr_model.py +50 -3
- {docling-2.17.0 → docling-2.19.0}/docling/models/layout_model.py +49 -3
- {docling-2.17.0 → docling-2.19.0}/docling/models/table_structure_model.py +53 -7
- {docling-2.17.0 → docling-2.19.0}/docling/pipeline/base_pipeline.py +4 -2
- {docling-2.17.0 → docling-2.19.0}/docling/pipeline/standard_pdf_pipeline.py +25 -24
- {docling-2.17.0 → docling-2.19.0}/docling/utils/glm_utils.py +4 -0
- docling-2.19.0/docling/utils/model_downloader.py +72 -0
- {docling-2.17.0 → docling-2.19.0}/docling/utils/utils.py +24 -0
- {docling-2.17.0 → docling-2.19.0}/pyproject.toml +11 -5
- {docling-2.17.0 → docling-2.19.0}/LICENSE +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/__init__.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/__init__.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/xml/pubmed_backend.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/chunking/__init__.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/cli/__init__.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/exceptions.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/models/__init__.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/models/ds_glm_model.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/py.typed +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/utils/__init__.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/utils/export.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/utils/profiling.py +0 -0
- {docling-2.17.0 → docling-2.19.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.17.0
|
3
|
+
Version: 2.19.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -19,16 +19,17 @@ Classifier: Programming Language :: Python :: 3.9
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.10
|
20
20
|
Classifier: Programming Language :: Python :: 3.11
|
21
21
|
Classifier: Programming Language :: Python :: 3.12
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
22
23
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
23
24
|
Provides-Extra: ocrmac
|
24
25
|
Provides-Extra: rapidocr
|
25
26
|
Provides-Extra: tesserocr
|
26
|
-
Requires-Dist: beautifulsoup4 (>=4.12.3,<
|
27
|
+
Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
|
27
28
|
Requires-Dist: certifi (>=2024.7.4)
|
28
29
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
29
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
30
|
+
Requires-Dist: docling-core[chunking] (>=2.17.2,<3.0.0)
|
30
31
|
Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
|
31
|
-
Requires-Dist: docling-parse (>=3.
|
32
|
+
Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
|
32
33
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
33
34
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
34
35
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -48,8 +49,10 @@ Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
|
48
49
|
Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
|
49
50
|
Requires-Dist: requests (>=2.32.2,<3.0.0)
|
50
51
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
51
|
-
Requires-Dist: scipy (>=1.6.0,<
|
52
|
+
Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
|
53
|
+
Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
|
52
54
|
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
55
|
+
Requires-Dist: tqdm (>=4.65.0,<5.0.0)
|
53
56
|
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
54
57
|
Project-URL: Repository, https://github.com/DS4SD/docling
|
55
58
|
Description-Content-Type: text/markdown
|
@@ -94,6 +97,9 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
94
97
|
### Coming soon
|
95
98
|
|
96
99
|
* 📝 Metadata extraction, including title, authors, references & language
|
100
|
+
* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
|
101
|
+
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
102
|
+
* 📝 Complex chemistry understanding (Molecular structures)
|
97
103
|
|
98
104
|
## Installation
|
99
105
|
|
@@ -38,6 +38,9 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
38
38
|
### Coming soon
|
39
39
|
|
40
40
|
* 📝 Metadata extraction, including title, authors, references & language
|
41
|
+
* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
|
42
|
+
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
43
|
+
* 📝 Complex chemistry understanding (Molecular structures)
|
41
44
|
|
42
45
|
## Installation
|
43
46
|
|
@@ -1,9 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Set, Union
|
4
|
+
from typing import Optional, Set, Union
|
5
5
|
|
6
|
-
from bs4 import BeautifulSoup
|
6
|
+
from bs4 import BeautifulSoup, Tag
|
7
7
|
from docling_core.types.doc import (
|
8
8
|
DocItemLabel,
|
9
9
|
DoclingDocument,
|
@@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
24
24
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
25
25
|
super().__init__(in_doc, path_or_stream)
|
26
26
|
_log.debug("About to init HTML backend...")
|
27
|
-
self.soup = None
|
27
|
+
self.soup: Optional[Tag] = None
|
28
28
|
# HTML file:
|
29
29
|
self.path_or_stream = path_or_stream
|
30
30
|
# Initialise the parents for the hierarchy
|
@@ -89,7 +89,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
89
89
|
)
|
90
90
|
return doc
|
91
91
|
|
92
|
-
def walk(self, element, doc):
|
92
|
+
def walk(self, element: Tag, doc: DoclingDocument):
|
93
93
|
try:
|
94
94
|
# Iterate over elements in the body of the document
|
95
95
|
for idx, element in enumerate(element.children):
|
@@ -106,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
106
106
|
|
107
107
|
return doc
|
108
108
|
|
109
|
-
def analyse_element(self, element, idx, doc):
|
109
|
+
def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
|
110
110
|
"""
|
111
111
|
if element.name!=None:
|
112
112
|
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
@@ -136,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
136
136
|
else:
|
137
137
|
self.walk(element, doc)
|
138
138
|
|
139
|
-
def get_direct_text(self, item):
|
139
|
+
def get_direct_text(self, item: Tag):
|
140
140
|
"""Get the direct text of the <li> element (ignoring nested lists)."""
|
141
141
|
text = item.find(string=True, recursive=False)
|
142
142
|
if isinstance(text, str):
|
@@ -145,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
145
145
|
return ""
|
146
146
|
|
147
147
|
# Function to recursively extract text from all child nodes
|
148
|
-
def extract_text_recursively(self, item):
|
148
|
+
def extract_text_recursively(self, item: Tag):
|
149
149
|
result = []
|
150
150
|
|
151
151
|
if isinstance(item, str):
|
@@ -166,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
166
166
|
|
167
167
|
return "".join(result) + " "
|
168
168
|
|
169
|
-
def handle_header(self, element, idx, doc):
|
169
|
+
def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
|
170
170
|
"""Handles header tags (h1, h2, etc.)."""
|
171
171
|
hlevel = int(element.name.replace("h", ""))
|
172
172
|
slevel = hlevel - 1
|
@@ -208,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
208
208
|
level=hlevel,
|
209
209
|
)
|
210
210
|
|
211
|
-
def handle_code(self, element, idx, doc):
|
211
|
+
def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
|
212
212
|
"""Handles monospace code snippets (pre)."""
|
213
213
|
if element.text is None:
|
214
214
|
return
|
@@ -216,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
216
216
|
label = DocItemLabel.CODE
|
217
217
|
if len(text) == 0:
|
218
218
|
return
|
219
|
-
doc.add_code(parent=self.parents[self.level],
|
219
|
+
doc.add_code(parent=self.parents[self.level], text=text)
|
220
220
|
|
221
|
-
def handle_paragraph(self, element, idx, doc):
|
221
|
+
def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
|
222
222
|
"""Handles paragraph tags (p)."""
|
223
223
|
if element.text is None:
|
224
224
|
return
|
@@ -228,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
228
228
|
return
|
229
229
|
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
230
230
|
|
231
|
-
def handle_list(self, element, idx, doc):
|
231
|
+
def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
|
232
232
|
"""Handles list tags (ul, ol) and their list items."""
|
233
233
|
|
234
234
|
if element.name == "ul":
|
@@ -250,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
250
250
|
self.parents[self.level + 1] = None
|
251
251
|
self.level -= 1
|
252
252
|
|
253
|
-
def handle_listitem(self, element, idx, doc):
|
253
|
+
def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
|
254
254
|
"""Handles listitem tags (li)."""
|
255
255
|
nested_lists = element.find(["ul", "ol"])
|
256
256
|
|
@@ -304,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
304
304
|
else:
|
305
305
|
_log.warn("list-item has no text: ", element)
|
306
306
|
|
307
|
-
def handle_table(self, element, idx, doc):
|
307
|
+
def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
|
308
308
|
"""Handles table tags."""
|
309
309
|
|
310
310
|
nested_tables = element.find("table")
|
@@ -377,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
377
377
|
|
378
378
|
doc.add_table(data=data, parent=self.parents[self.level])
|
379
379
|
|
380
|
-
def get_list_text(self, list_element, level=0):
|
380
|
+
def get_list_text(self, list_element: Tag, level=0):
|
381
381
|
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
382
382
|
result = []
|
383
383
|
bullet_char = "*" # Default bullet character for unordered lists
|
@@ -403,7 +403,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
403
403
|
|
404
404
|
return result
|
405
405
|
|
406
|
-
def extract_table_cell_text(self, cell):
|
406
|
+
def extract_table_cell_text(self, cell: Tag):
|
407
407
|
"""Extract text from a table cell, including lists with indents."""
|
408
408
|
contains_lists = cell.find(["ul", "ol"])
|
409
409
|
if contains_lists is None:
|
@@ -414,7 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
414
414
|
)
|
415
415
|
return cell.text
|
416
416
|
|
417
|
-
def handle_figure(self, element, idx, doc):
|
417
|
+
def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
|
418
418
|
"""Handles image tags (img)."""
|
419
419
|
|
420
420
|
# Extract the image URI from the <img> tag
|
@@ -437,6 +437,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
437
437
|
caption=fig_caption,
|
438
438
|
)
|
439
439
|
|
440
|
-
def handle_image(self, element, idx, doc):
|
440
|
+
def handle_image(self, element: Tag, idx, doc: DoclingDocument):
|
441
441
|
"""Handles image tags (img)."""
|
442
442
|
doc.add_picture(parent=self.parents[self.level], caption=None)
|
@@ -6,6 +6,7 @@ from pathlib import Path
|
|
6
6
|
from typing import List, Optional, Set, Union
|
7
7
|
|
8
8
|
import marko
|
9
|
+
import marko.element
|
9
10
|
import marko.ext
|
10
11
|
import marko.ext.gfm
|
11
12
|
import marko.inline
|
@@ -23,14 +24,19 @@ from docling_core.types.doc import (
|
|
23
24
|
from marko import Markdown
|
24
25
|
|
25
26
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
27
|
+
from docling.backend.html_backend import HTMLDocumentBackend
|
26
28
|
from docling.datamodel.base_models import InputFormat
|
27
29
|
from docling.datamodel.document import InputDocument
|
28
30
|
|
29
31
|
_log = logging.getLogger(__name__)
|
30
32
|
|
33
|
+
_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
|
34
|
+
_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
35
|
+
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
36
|
+
|
31
37
|
|
32
38
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
33
|
-
def
|
39
|
+
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
34
40
|
# This regex will match any sequence of underscores
|
35
41
|
pattern = r"_+"
|
36
42
|
|
@@ -66,6 +72,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
66
72
|
self.in_table = False
|
67
73
|
self.md_table_buffer: list[str] = []
|
68
74
|
self.inline_texts: list[str] = []
|
75
|
+
self._html_blocks: int = 0
|
69
76
|
|
70
77
|
try:
|
71
78
|
if isinstance(self.path_or_stream, BytesIO):
|
@@ -74,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
74
81
|
# very long sequences of underscores will lead to unnecessarily long processing times.
|
75
82
|
# In any proper Markdown files, underscores have to be escaped,
|
76
83
|
# otherwise they represent emphasis (bold or italic)
|
77
|
-
self.markdown = self.
|
84
|
+
self.markdown = self._shorten_underscore_sequences(text_stream)
|
78
85
|
if isinstance(self.path_or_stream, Path):
|
79
86
|
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
80
87
|
md_content = f.read()
|
@@ -82,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
82
89
|
# very long sequences of underscores will lead to unnecessarily long processing times.
|
83
90
|
# In any proper Markdown files, underscores have to be escaped,
|
84
91
|
# otherwise they represent emphasis (bold or italic)
|
85
|
-
self.markdown = self.
|
92
|
+
self.markdown = self._shorten_underscore_sequences(md_content)
|
86
93
|
self.valid = True
|
87
94
|
|
88
95
|
_log.debug(self.markdown)
|
@@ -92,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
92
99
|
) from e
|
93
100
|
return
|
94
101
|
|
95
|
-
def
|
102
|
+
def _close_table(self, doc: DoclingDocument):
|
96
103
|
if self.in_table:
|
97
104
|
_log.debug("=== TABLE START ===")
|
98
105
|
for md_table_row in self.md_table_buffer:
|
@@ -149,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
149
156
|
doc.add_table(data=table_data)
|
150
157
|
return
|
151
158
|
|
152
|
-
def
|
153
|
-
self,
|
159
|
+
def _process_inline_text(
|
160
|
+
self, parent_item: Optional[NodeItem], doc: DoclingDocument
|
154
161
|
):
|
155
162
|
txt = " ".join(self.inline_texts)
|
156
163
|
if len(txt) > 0:
|
157
164
|
doc.add_text(
|
158
165
|
label=DocItemLabel.PARAGRAPH,
|
159
|
-
parent=
|
166
|
+
parent=parent_item,
|
160
167
|
text=txt,
|
161
168
|
)
|
162
169
|
self.inline_texts = []
|
163
170
|
|
164
|
-
def
|
171
|
+
def _iterate_elements(
|
165
172
|
self,
|
166
|
-
element: marko.
|
173
|
+
element: marko.element.Element,
|
167
174
|
depth: int,
|
168
175
|
doc: DoclingDocument,
|
169
|
-
|
176
|
+
visited: Set[marko.element.Element],
|
177
|
+
parent_item: Optional[NodeItem] = None,
|
170
178
|
):
|
179
|
+
|
180
|
+
if element in visited:
|
181
|
+
return
|
182
|
+
|
171
183
|
# Iterates over all elements in the AST
|
172
184
|
# Check for different element types and process relevant details
|
173
|
-
if isinstance(element, marko.block.Heading):
|
174
|
-
self.
|
175
|
-
self.
|
185
|
+
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
186
|
+
self._close_table(doc)
|
187
|
+
self._process_inline_text(parent_item, doc)
|
176
188
|
_log.debug(
|
177
189
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
178
190
|
)
|
@@ -200,41 +212,48 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
200
212
|
traverse(element)
|
201
213
|
snippet_text = "".join(strings)
|
202
214
|
if len(snippet_text) > 0:
|
203
|
-
|
204
|
-
label=doc_label, parent=
|
215
|
+
parent_item = doc.add_text(
|
216
|
+
label=doc_label, parent=parent_item, text=snippet_text
|
205
217
|
)
|
206
218
|
|
207
219
|
elif isinstance(element, marko.block.List):
|
208
|
-
|
209
|
-
|
220
|
+
has_non_empty_list_items = False
|
221
|
+
for child in element.children:
|
222
|
+
if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
|
223
|
+
has_non_empty_list_items = True
|
224
|
+
break
|
225
|
+
|
226
|
+
self._close_table(doc)
|
227
|
+
self._process_inline_text(parent_item, doc)
|
210
228
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
)
|
229
|
+
if has_non_empty_list_items:
|
230
|
+
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
231
|
+
parent_item = doc.add_group(
|
232
|
+
label=label, name=f"list", parent=parent_item
|
233
|
+
)
|
217
234
|
|
218
|
-
elif isinstance(element, marko.block.ListItem):
|
219
|
-
self.
|
220
|
-
self.
|
235
|
+
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
|
236
|
+
self._close_table(doc)
|
237
|
+
self._process_inline_text(parent_item, doc)
|
221
238
|
_log.debug(" - List item")
|
222
239
|
|
223
|
-
|
240
|
+
first_child = element.children[0]
|
241
|
+
snippet_text = str(first_child.children[0].children) # type: ignore
|
224
242
|
is_numbered = False
|
225
243
|
if (
|
226
|
-
|
227
|
-
and isinstance(
|
228
|
-
and
|
244
|
+
parent_item is not None
|
245
|
+
and isinstance(parent_item, DocItem)
|
246
|
+
and parent_item.label == GroupLabel.ORDERED_LIST
|
229
247
|
):
|
230
248
|
is_numbered = True
|
231
249
|
doc.add_list_item(
|
232
|
-
enumerated=is_numbered, parent=
|
250
|
+
enumerated=is_numbered, parent=parent_item, text=snippet_text
|
233
251
|
)
|
252
|
+
visited.add(first_child)
|
234
253
|
|
235
254
|
elif isinstance(element, marko.inline.Image):
|
236
|
-
self.
|
237
|
-
self.
|
255
|
+
self._close_table(doc)
|
256
|
+
self._process_inline_text(parent_item, doc)
|
238
257
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
239
258
|
|
240
259
|
fig_caption: Optional[TextItem] = None
|
@@ -243,50 +262,44 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
243
262
|
label=DocItemLabel.CAPTION, text=element.title
|
244
263
|
)
|
245
264
|
|
246
|
-
doc.add_picture(parent=
|
265
|
+
doc.add_picture(parent=parent_item, caption=fig_caption)
|
247
266
|
|
248
|
-
elif isinstance(element, marko.block.Paragraph):
|
249
|
-
self.
|
267
|
+
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
|
268
|
+
self._process_inline_text(parent_item, doc)
|
250
269
|
|
251
270
|
elif isinstance(element, marko.inline.RawText):
|
252
271
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
253
|
-
snippet_text =
|
272
|
+
snippet_text = element.children.strip()
|
254
273
|
# Detect start of the table:
|
255
274
|
if "|" in snippet_text:
|
256
275
|
# most likely part of the markdown table
|
257
276
|
self.in_table = True
|
258
277
|
if len(self.md_table_buffer) > 0:
|
259
|
-
self.md_table_buffer[len(self.md_table_buffer) - 1] +=
|
260
|
-
snippet_text
|
261
|
-
)
|
278
|
+
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
262
279
|
else:
|
263
280
|
self.md_table_buffer.append(snippet_text)
|
264
281
|
else:
|
265
|
-
self.
|
266
|
-
self.in_table = False
|
282
|
+
self._close_table(doc)
|
267
283
|
# most likely just inline text
|
268
284
|
self.inline_texts.append(str(element.children))
|
269
285
|
|
270
286
|
elif isinstance(element, marko.inline.CodeSpan):
|
271
|
-
self.
|
272
|
-
self.
|
287
|
+
self._close_table(doc)
|
288
|
+
self._process_inline_text(parent_item, doc)
|
273
289
|
_log.debug(f" - Code Span: {element.children}")
|
274
290
|
snippet_text = str(element.children).strip()
|
275
|
-
doc.add_code(parent=
|
276
|
-
|
277
|
-
elif
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
snippet_text
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
self.close_table(doc)
|
286
|
-
self.process_inline_text(parent_element, doc)
|
291
|
+
doc.add_code(parent=parent_item, text=snippet_text)
|
292
|
+
|
293
|
+
elif (
|
294
|
+
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
295
|
+
and len(element.children) > 0
|
296
|
+
and isinstance((first_child := element.children[0]), marko.inline.RawText)
|
297
|
+
and len(snippet_text := (first_child.children.strip())) > 0
|
298
|
+
):
|
299
|
+
self._close_table(doc)
|
300
|
+
self._process_inline_text(parent_item, doc)
|
287
301
|
_log.debug(f" - Code Block: {element.children}")
|
288
|
-
|
289
|
-
doc.add_code(parent=parent_element, text=snippet_text)
|
302
|
+
doc.add_code(parent=parent_item, text=snippet_text)
|
290
303
|
|
291
304
|
elif isinstance(element, marko.inline.LineBreak):
|
292
305
|
if self.in_table:
|
@@ -294,29 +307,42 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
294
307
|
self.md_table_buffer.append("")
|
295
308
|
|
296
309
|
elif isinstance(element, marko.block.HTMLBlock):
|
297
|
-
self.
|
298
|
-
self.
|
310
|
+
self._html_blocks += 1
|
311
|
+
self._process_inline_text(parent_item, doc)
|
312
|
+
self._close_table(doc)
|
299
313
|
_log.debug("HTML Block: {}".format(element))
|
300
314
|
if (
|
301
|
-
len(element.
|
315
|
+
len(element.body) > 0
|
302
316
|
): # If Marko doesn't return any content for HTML block, skip it
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
317
|
+
html_block = element.body.strip()
|
318
|
+
|
319
|
+
# wrap in markers to enable post-processing in convert()
|
320
|
+
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
321
|
+
doc.add_code(parent=parent_item, text=text_to_add)
|
307
322
|
else:
|
308
323
|
if not isinstance(element, str):
|
309
|
-
self.
|
324
|
+
self._close_table(doc)
|
310
325
|
_log.debug("Some other element: {}".format(element))
|
311
326
|
|
327
|
+
processed_block_types = (
|
328
|
+
marko.block.Heading,
|
329
|
+
marko.block.CodeBlock,
|
330
|
+
marko.block.FencedCode,
|
331
|
+
marko.inline.RawText,
|
332
|
+
)
|
333
|
+
|
312
334
|
# Iterate through the element's children (if any)
|
313
|
-
if
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
335
|
+
if hasattr(element, "children") and not isinstance(
|
336
|
+
element, processed_block_types
|
337
|
+
):
|
338
|
+
for child in element.children:
|
339
|
+
self._iterate_elements(
|
340
|
+
element=child,
|
341
|
+
depth=depth + 1,
|
342
|
+
doc=doc,
|
343
|
+
visited=visited,
|
344
|
+
parent_item=parent_item,
|
345
|
+
)
|
320
346
|
|
321
347
|
def is_valid(self) -> bool:
|
322
348
|
return self.valid
|
@@ -350,8 +376,51 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
350
376
|
marko_parser = Markdown()
|
351
377
|
parsed_ast = marko_parser.parse(self.markdown)
|
352
378
|
# Start iterating from the root of the AST
|
353
|
-
self.
|
354
|
-
|
379
|
+
self._iterate_elements(
|
380
|
+
element=parsed_ast,
|
381
|
+
depth=0,
|
382
|
+
doc=doc,
|
383
|
+
parent_item=None,
|
384
|
+
visited=set(),
|
385
|
+
)
|
386
|
+
self._process_inline_text(None, doc) # handle last hanging inline text
|
387
|
+
self._close_table(doc=doc) # handle any last hanging table
|
388
|
+
|
389
|
+
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
390
|
+
if self._html_blocks > 0:
|
391
|
+
|
392
|
+
# export to HTML
|
393
|
+
html_backend_cls = HTMLDocumentBackend
|
394
|
+
html_str = doc.export_to_html()
|
395
|
+
|
396
|
+
def _restore_original_html(txt, regex):
|
397
|
+
_txt, count = re.subn(regex, "", txt)
|
398
|
+
if count != self._html_blocks:
|
399
|
+
raise RuntimeError(
|
400
|
+
"An internal error has occurred during Markdown conversion."
|
401
|
+
)
|
402
|
+
return _txt
|
403
|
+
|
404
|
+
# restore original HTML by removing previously added markers
|
405
|
+
for regex in [
|
406
|
+
rf"<pre>\s*<code>\s*{_START_MARKER}",
|
407
|
+
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
|
408
|
+
]:
|
409
|
+
html_str = _restore_original_html(txt=html_str, regex=regex)
|
410
|
+
self._html_blocks = 0
|
411
|
+
|
412
|
+
# delegate to HTML backend
|
413
|
+
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
414
|
+
in_doc = InputDocument(
|
415
|
+
path_or_stream=stream,
|
416
|
+
format=InputFormat.HTML,
|
417
|
+
backend=html_backend_cls,
|
418
|
+
filename=self.file.name,
|
419
|
+
)
|
420
|
+
html_backend_obj = html_backend_cls(
|
421
|
+
in_doc=in_doc, path_or_stream=stream
|
422
|
+
)
|
423
|
+
doc = html_backend_obj.convert()
|
355
424
|
else:
|
356
425
|
raise RuntimeError(
|
357
426
|
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|