docling 2.37.0__tar.gz → 2.38.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.37.0 → docling-2.38.1}/PKG-INFO +7 -4
- {docling-2.37.0 → docling-2.38.1}/README.md +4 -3
- {docling-2.37.0 → docling-2.38.1}/docling/backend/md_backend.py +185 -80
- {docling-2.37.0 → docling-2.38.1}/docling/backend/msword_backend.py +76 -63
- docling-2.38.1/docling/backend/noop_backend.py +51 -0
- {docling-2.37.0 → docling-2.38.1}/docling/cli/main.py +82 -14
- docling-2.38.1/docling/datamodel/asr_model_specs.py +92 -0
- {docling-2.37.0 → docling-2.38.1}/docling/datamodel/base_models.py +12 -2
- {docling-2.37.0 → docling-2.38.1}/docling/datamodel/document.py +3 -1
- {docling-2.37.0 → docling-2.38.1}/docling/datamodel/pipeline_options.py +13 -2
- docling-2.38.1/docling/datamodel/pipeline_options_asr_model.py +57 -0
- {docling-2.37.0 → docling-2.38.1}/docling/datamodel/pipeline_options_vlm_model.py +2 -3
- {docling-2.37.0 → docling-2.38.1}/docling/document_converter.py +8 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/api_vlm_model.py +3 -1
- {docling-2.37.0 → docling-2.38.1}/docling/models/base_model.py +1 -1
- {docling-2.37.0 → docling-2.38.1}/docling/models/readingorder_model.py +1 -1
- {docling-2.37.0 → docling-2.38.1}/docling/models/vlm_models_inline/hf_transformers_model.py +3 -1
- {docling-2.37.0 → docling-2.38.1}/docling/models/vlm_models_inline/mlx_model.py +3 -1
- docling-2.38.1/docling/pipeline/asr_pipeline.py +253 -0
- {docling-2.37.0 → docling-2.38.1}/docling/pipeline/base_pipeline.py +11 -0
- {docling-2.37.0 → docling-2.38.1}/docling.egg-info/PKG-INFO +7 -4
- {docling-2.37.0 → docling-2.38.1}/docling.egg-info/SOURCES.txt +5 -0
- {docling-2.37.0 → docling-2.38.1}/docling.egg-info/requires.txt +3 -0
- {docling-2.37.0 → docling-2.38.1}/pyproject.toml +7 -1
- docling-2.38.1/tests/test_asr_pipeline.py +59 -0
- docling-2.38.1/tests/test_backend_markdown.py +52 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_msword.py +44 -4
- {docling-2.37.0 → docling-2.38.1}/tests/test_code_formula.py +22 -0
- docling-2.37.0/tests/test_backend_markdown.py +0 -41
- {docling-2.37.0 → docling-2.38.1}/LICENSE +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/abstract_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/csv_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/docx/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/html_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/json/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/pdf_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/xml/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/chunking/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/cli/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/cli/models.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/cli/tools.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/datamodel/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/datamodel/settings.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/exceptions.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/base_ocr_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/code_formula_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/easyocr_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/factories/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/factories/base_factory.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/layout_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/page_assemble_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/plugins/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/plugins/defaults.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/table_structure_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/utils/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/pipeline/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/py.typed +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/api_image_request.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/export.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/glm_utils.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/locks.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/model_downloader.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/ocr_utils.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/orientation.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/profiling.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/utils.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling/utils/visualization.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.37.0 → docling-2.38.1}/docling.egg-info/top_level.txt +0 -0
- {docling-2.37.0 → docling-2.38.1}/setup.cfg +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_csv.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_docling_json.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_html.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_jats.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_msexcel.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_pdfium.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_pptx.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_backend_webp.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_cli.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_data_gen_flag.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_e2e_conversion.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_input_doc.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_interfaces.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_invalid_input.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_options.py +0 -0
- {docling-2.37.0 → docling-2.38.1}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.37.0
|
3
|
+
Version: 2.38.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -61,6 +61,8 @@ Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "d
|
|
61
61
|
Provides-Extra: rapidocr
|
62
62
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
63
63
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
64
|
+
Provides-Extra: asr
|
65
|
+
Requires-Dist: openai-whisper>=20240930; extra == "asr"
|
64
66
|
Dynamic: license-file
|
65
67
|
|
66
68
|
<p align="center">
|
@@ -93,14 +95,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
93
95
|
|
94
96
|
## Features
|
95
97
|
|
96
|
-
* 🗂️
|
98
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
97
99
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
98
100
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
99
|
-
* ↪️
|
101
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
100
102
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
101
103
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
102
104
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
103
|
-
*
|
105
|
+
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
106
|
+
* 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
|
104
107
|
* 💻 Simple and convenient CLI
|
105
108
|
|
106
109
|
### Coming soon
|
@@ -28,14 +28,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
28
28
|
|
29
29
|
## Features
|
30
30
|
|
31
|
-
* 🗂️
|
31
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
32
32
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
33
33
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
34
|
-
* ↪️
|
34
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
35
35
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
36
36
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
37
37
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
38
|
-
*
|
38
|
+
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
39
|
+
* 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
|
39
40
|
* 💻 Simple and convenient CLI
|
40
41
|
|
41
42
|
### Coming soon
|
@@ -1,17 +1,16 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
3
|
import warnings
|
4
|
+
from copy import deepcopy
|
5
|
+
from enum import Enum
|
4
6
|
from io import BytesIO
|
5
7
|
from pathlib import Path
|
6
|
-
from typing import List, Optional, Set, Union
|
8
|
+
from typing import List, Literal, Optional, Set, Union
|
7
9
|
|
8
10
|
import marko
|
9
11
|
import marko.element
|
10
|
-
import marko.ext
|
11
|
-
import marko.ext.gfm
|
12
12
|
import marko.inline
|
13
13
|
from docling_core.types.doc import (
|
14
|
-
DocItem,
|
15
14
|
DocItemLabel,
|
16
15
|
DoclingDocument,
|
17
16
|
DocumentOrigin,
|
@@ -21,7 +20,10 @@ from docling_core.types.doc import (
|
|
21
20
|
TableData,
|
22
21
|
TextItem,
|
23
22
|
)
|
23
|
+
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
24
24
|
from marko import Markdown
|
25
|
+
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
|
26
|
+
from typing_extensions import Annotated
|
25
27
|
|
26
28
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
27
29
|
from docling.backend.html_backend import HTMLDocumentBackend
|
@@ -35,6 +37,31 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
|
35
37
|
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
36
38
|
|
37
39
|
|
40
|
+
class _PendingCreationType(str, Enum):
|
41
|
+
"""CoordOrigin."""
|
42
|
+
|
43
|
+
HEADING = "heading"
|
44
|
+
LIST_ITEM = "list_item"
|
45
|
+
|
46
|
+
|
47
|
+
class _HeadingCreationPayload(BaseModel):
|
48
|
+
kind: Literal["heading"] = "heading"
|
49
|
+
level: int
|
50
|
+
|
51
|
+
|
52
|
+
class _ListItemCreationPayload(BaseModel):
|
53
|
+
kind: Literal["list_item"] = "list_item"
|
54
|
+
|
55
|
+
|
56
|
+
_CreationPayload = Annotated[
|
57
|
+
Union[
|
58
|
+
_HeadingCreationPayload,
|
59
|
+
_ListItemCreationPayload,
|
60
|
+
],
|
61
|
+
Field(discriminator="kind"),
|
62
|
+
]
|
63
|
+
|
64
|
+
|
38
65
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
39
66
|
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
40
67
|
# This regex will match any sequence of underscores
|
@@ -71,7 +98,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
71
98
|
|
72
99
|
self.in_table = False
|
73
100
|
self.md_table_buffer: list[str] = []
|
74
|
-
self.inline_texts: list[str] = []
|
75
101
|
self._html_blocks: int = 0
|
76
102
|
|
77
103
|
try:
|
@@ -156,25 +182,65 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
156
182
|
doc.add_table(data=table_data)
|
157
183
|
return
|
158
184
|
|
159
|
-
def
|
160
|
-
self,
|
185
|
+
def _create_list_item(
|
186
|
+
self,
|
187
|
+
doc: DoclingDocument,
|
188
|
+
parent_item: Optional[NodeItem],
|
189
|
+
text: str,
|
190
|
+
formatting: Optional[Formatting] = None,
|
191
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
192
|
+
):
|
193
|
+
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
194
|
+
_log.warning("ListItem would have not had a list parent, adding one.")
|
195
|
+
parent_item = doc.add_unordered_list(parent=parent_item)
|
196
|
+
item = doc.add_list_item(
|
197
|
+
text=text,
|
198
|
+
enumerated=(isinstance(parent_item, OrderedList)),
|
199
|
+
parent=parent_item,
|
200
|
+
formatting=formatting,
|
201
|
+
hyperlink=hyperlink,
|
202
|
+
)
|
203
|
+
return item
|
204
|
+
|
205
|
+
def _create_heading_item(
|
206
|
+
self,
|
207
|
+
doc: DoclingDocument,
|
208
|
+
parent_item: Optional[NodeItem],
|
209
|
+
text: str,
|
210
|
+
level: int,
|
211
|
+
formatting: Optional[Formatting] = None,
|
212
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
161
213
|
):
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
label=DocItemLabel.PARAGRAPH,
|
214
|
+
if level == 1:
|
215
|
+
item = doc.add_title(
|
216
|
+
text=text,
|
166
217
|
parent=parent_item,
|
167
|
-
|
218
|
+
formatting=formatting,
|
219
|
+
hyperlink=hyperlink,
|
168
220
|
)
|
169
|
-
|
221
|
+
else:
|
222
|
+
item = doc.add_heading(
|
223
|
+
text=text,
|
224
|
+
level=level - 1,
|
225
|
+
parent=parent_item,
|
226
|
+
formatting=formatting,
|
227
|
+
hyperlink=hyperlink,
|
228
|
+
)
|
229
|
+
return item
|
170
230
|
|
171
231
|
def _iterate_elements( # noqa: C901
|
172
232
|
self,
|
233
|
+
*,
|
173
234
|
element: marko.element.Element,
|
174
235
|
depth: int,
|
175
236
|
doc: DoclingDocument,
|
176
237
|
visited: Set[marko.element.Element],
|
238
|
+
creation_stack: list[
|
239
|
+
_CreationPayload
|
240
|
+
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
177
241
|
parent_item: Optional[NodeItem] = None,
|
242
|
+
formatting: Optional[Formatting] = None,
|
243
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
178
244
|
):
|
179
245
|
if element in visited:
|
180
246
|
return
|
@@ -183,44 +249,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
183
249
|
# Check for different element types and process relevant details
|
184
250
|
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
185
251
|
self._close_table(doc)
|
186
|
-
self._process_inline_text(parent_item, doc)
|
187
252
|
_log.debug(
|
188
253
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
189
254
|
)
|
190
|
-
|
191
|
-
|
255
|
+
|
256
|
+
if len(element.children) > 1: # inline group will be created further down
|
257
|
+
parent_item = self._create_heading_item(
|
258
|
+
doc=doc,
|
259
|
+
parent_item=parent_item,
|
260
|
+
text="",
|
261
|
+
level=element.level,
|
262
|
+
formatting=formatting,
|
263
|
+
hyperlink=hyperlink,
|
264
|
+
)
|
192
265
|
else:
|
193
|
-
|
194
|
-
|
195
|
-
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
196
|
-
# hence we need to traverse the tree to get full text of a header
|
197
|
-
strings: List[str] = []
|
198
|
-
|
199
|
-
# Define a recursive function to traverse the tree
|
200
|
-
def traverse(node: marko.block.BlockElement):
|
201
|
-
# Check if the node has a "children" attribute
|
202
|
-
if hasattr(node, "children"):
|
203
|
-
# If "children" is a list, continue traversal
|
204
|
-
if isinstance(node.children, list):
|
205
|
-
for child in node.children:
|
206
|
-
traverse(child)
|
207
|
-
# If "children" is text, add it to header text
|
208
|
-
elif isinstance(node.children, str):
|
209
|
-
strings.append(node.children)
|
210
|
-
|
211
|
-
traverse(element)
|
212
|
-
snippet_text = "".join(strings)
|
213
|
-
if len(snippet_text) > 0:
|
214
|
-
if doc_label == DocItemLabel.SECTION_HEADER:
|
215
|
-
parent_item = doc.add_heading(
|
216
|
-
text=snippet_text,
|
217
|
-
level=element.level - 1,
|
218
|
-
parent=parent_item,
|
219
|
-
)
|
220
|
-
else:
|
221
|
-
parent_item = doc.add_text(
|
222
|
-
label=doc_label, parent=parent_item, text=snippet_text
|
223
|
-
)
|
266
|
+
creation_stack.append(_HeadingCreationPayload(level=element.level))
|
224
267
|
|
225
268
|
elif isinstance(element, marko.block.List):
|
226
269
|
has_non_empty_list_items = False
|
@@ -230,7 +273,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
230
273
|
break
|
231
274
|
|
232
275
|
self._close_table(doc)
|
233
|
-
self._process_inline_text(parent_item, doc)
|
234
276
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
235
277
|
if has_non_empty_list_items:
|
236
278
|
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
@@ -240,41 +282,54 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
240
282
|
|
241
283
|
elif (
|
242
284
|
isinstance(element, marko.block.ListItem)
|
243
|
-
and len(element.children)
|
244
|
-
and isinstance((
|
285
|
+
and len(element.children) == 1
|
286
|
+
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
287
|
+
and len(child.children) > 0
|
245
288
|
):
|
246
289
|
self._close_table(doc)
|
247
|
-
self._process_inline_text(parent_item, doc)
|
248
290
|
_log.debug(" - List item")
|
249
291
|
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
)
|
261
|
-
visited.add(first_child)
|
292
|
+
if len(child.children) > 1: # inline group will be created further down
|
293
|
+
parent_item = self._create_list_item(
|
294
|
+
doc=doc,
|
295
|
+
parent_item=parent_item,
|
296
|
+
text="",
|
297
|
+
formatting=formatting,
|
298
|
+
hyperlink=hyperlink,
|
299
|
+
)
|
300
|
+
else:
|
301
|
+
creation_stack.append(_ListItemCreationPayload())
|
262
302
|
|
263
303
|
elif isinstance(element, marko.inline.Image):
|
264
304
|
self._close_table(doc)
|
265
|
-
self._process_inline_text(parent_item, doc)
|
266
305
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
267
306
|
|
268
307
|
fig_caption: Optional[TextItem] = None
|
269
308
|
if element.title is not None and element.title != "":
|
270
309
|
fig_caption = doc.add_text(
|
271
|
-
label=DocItemLabel.CAPTION,
|
310
|
+
label=DocItemLabel.CAPTION,
|
311
|
+
text=element.title,
|
312
|
+
formatting=formatting,
|
313
|
+
hyperlink=hyperlink,
|
272
314
|
)
|
273
315
|
|
274
316
|
doc.add_picture(parent=parent_item, caption=fig_caption)
|
275
317
|
|
276
|
-
elif isinstance(element, marko.
|
277
|
-
|
318
|
+
elif isinstance(element, marko.inline.Emphasis):
|
319
|
+
_log.debug(f" - Emphasis: {element.children}")
|
320
|
+
formatting = deepcopy(formatting) if formatting else Formatting()
|
321
|
+
formatting.italic = True
|
322
|
+
|
323
|
+
elif isinstance(element, marko.inline.StrongEmphasis):
|
324
|
+
_log.debug(f" - StrongEmphasis: {element.children}")
|
325
|
+
formatting = deepcopy(formatting) if formatting else Formatting()
|
326
|
+
formatting.bold = True
|
327
|
+
|
328
|
+
elif isinstance(element, marko.inline.Link):
|
329
|
+
_log.debug(f" - Link: {element.children}")
|
330
|
+
hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
|
331
|
+
element.dest
|
332
|
+
)
|
278
333
|
|
279
334
|
elif isinstance(element, marko.inline.RawText):
|
280
335
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
@@ -287,28 +342,66 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
287
342
|
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
288
343
|
else:
|
289
344
|
self.md_table_buffer.append(snippet_text)
|
290
|
-
|
345
|
+
elif snippet_text:
|
291
346
|
self._close_table(doc)
|
292
|
-
|
293
|
-
|
347
|
+
|
348
|
+
if creation_stack:
|
349
|
+
while len(creation_stack) > 0:
|
350
|
+
to_create = creation_stack.pop()
|
351
|
+
if isinstance(to_create, _ListItemCreationPayload):
|
352
|
+
parent_item = self._create_list_item(
|
353
|
+
doc=doc,
|
354
|
+
parent_item=parent_item,
|
355
|
+
text=snippet_text,
|
356
|
+
formatting=formatting,
|
357
|
+
hyperlink=hyperlink,
|
358
|
+
)
|
359
|
+
elif isinstance(to_create, _HeadingCreationPayload):
|
360
|
+
# not keeping as parent_item as logic for correctly tracking
|
361
|
+
# that not implemented yet (section components not captured
|
362
|
+
# as heading children in marko)
|
363
|
+
self._create_heading_item(
|
364
|
+
doc=doc,
|
365
|
+
parent_item=parent_item,
|
366
|
+
text=snippet_text,
|
367
|
+
level=to_create.level,
|
368
|
+
formatting=formatting,
|
369
|
+
hyperlink=hyperlink,
|
370
|
+
)
|
371
|
+
else:
|
372
|
+
doc.add_text(
|
373
|
+
label=DocItemLabel.TEXT,
|
374
|
+
parent=parent_item,
|
375
|
+
text=snippet_text,
|
376
|
+
formatting=formatting,
|
377
|
+
hyperlink=hyperlink,
|
378
|
+
)
|
294
379
|
|
295
380
|
elif isinstance(element, marko.inline.CodeSpan):
|
296
381
|
self._close_table(doc)
|
297
|
-
self._process_inline_text(parent_item, doc)
|
298
382
|
_log.debug(f" - Code Span: {element.children}")
|
299
383
|
snippet_text = str(element.children).strip()
|
300
|
-
doc.add_code(
|
384
|
+
doc.add_code(
|
385
|
+
parent=parent_item,
|
386
|
+
text=snippet_text,
|
387
|
+
formatting=formatting,
|
388
|
+
hyperlink=hyperlink,
|
389
|
+
)
|
301
390
|
|
302
391
|
elif (
|
303
392
|
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
304
393
|
and len(element.children) > 0
|
305
|
-
and isinstance((
|
306
|
-
and len(snippet_text := (
|
394
|
+
and isinstance((child := element.children[0]), marko.inline.RawText)
|
395
|
+
and len(snippet_text := (child.children.strip())) > 0
|
307
396
|
):
|
308
397
|
self._close_table(doc)
|
309
|
-
self._process_inline_text(parent_item, doc)
|
310
398
|
_log.debug(f" - Code Block: {element.children}")
|
311
|
-
doc.add_code(
|
399
|
+
doc.add_code(
|
400
|
+
parent=parent_item,
|
401
|
+
text=snippet_text,
|
402
|
+
formatting=formatting,
|
403
|
+
hyperlink=hyperlink,
|
404
|
+
)
|
312
405
|
|
313
406
|
elif isinstance(element, marko.inline.LineBreak):
|
314
407
|
if self.in_table:
|
@@ -317,7 +410,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
317
410
|
|
318
411
|
elif isinstance(element, marko.block.HTMLBlock):
|
319
412
|
self._html_blocks += 1
|
320
|
-
self._process_inline_text(parent_item, doc)
|
321
413
|
self._close_table(doc)
|
322
414
|
_log.debug(f"HTML Block: {element}")
|
323
415
|
if (
|
@@ -327,14 +419,24 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
327
419
|
|
328
420
|
# wrap in markers to enable post-processing in convert()
|
329
421
|
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
330
|
-
doc.add_code(
|
422
|
+
doc.add_code(
|
423
|
+
parent=parent_item,
|
424
|
+
text=text_to_add,
|
425
|
+
formatting=formatting,
|
426
|
+
hyperlink=hyperlink,
|
427
|
+
)
|
331
428
|
else:
|
332
429
|
if not isinstance(element, str):
|
333
430
|
self._close_table(doc)
|
334
431
|
_log.debug(f"Some other element: {element}")
|
335
432
|
|
433
|
+
if (
|
434
|
+
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
|
435
|
+
and len(element.children) > 1
|
436
|
+
):
|
437
|
+
parent_item = doc.add_inline_group(parent=parent_item)
|
438
|
+
|
336
439
|
processed_block_types = (
|
337
|
-
marko.block.Heading,
|
338
440
|
marko.block.CodeBlock,
|
339
441
|
marko.block.FencedCode,
|
340
442
|
marko.inline.RawText,
|
@@ -350,7 +452,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
350
452
|
depth=depth + 1,
|
351
453
|
doc=doc,
|
352
454
|
visited=visited,
|
455
|
+
creation_stack=creation_stack,
|
353
456
|
parent_item=parent_item,
|
457
|
+
formatting=formatting,
|
458
|
+
hyperlink=hyperlink,
|
354
459
|
)
|
355
460
|
|
356
461
|
def is_valid(self) -> bool:
|
@@ -391,8 +496,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
391
496
|
doc=doc,
|
392
497
|
parent_item=None,
|
393
498
|
visited=set(),
|
499
|
+
creation_stack=[],
|
394
500
|
)
|
395
|
-
self._process_inline_text(None, doc) # handle last hanging inline text
|
396
501
|
self._close_table(doc=doc) # handle any last hanging table
|
397
502
|
|
398
503
|
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|