docling 2.37.0__tar.gz → 2.38.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.37.0 → docling-2.38.0}/PKG-INFO +7 -4
- {docling-2.37.0 → docling-2.38.0}/README.md +4 -3
- {docling-2.37.0 → docling-2.38.0}/docling/backend/md_backend.py +101 -81
- {docling-2.37.0 → docling-2.38.0}/docling/backend/msword_backend.py +71 -62
- docling-2.38.0/docling/backend/noop_backend.py +51 -0
- {docling-2.37.0 → docling-2.38.0}/docling/cli/main.py +82 -14
- docling-2.38.0/docling/datamodel/asr_model_specs.py +92 -0
- {docling-2.37.0 → docling-2.38.0}/docling/datamodel/base_models.py +11 -1
- {docling-2.37.0 → docling-2.38.0}/docling/datamodel/document.py +3 -1
- {docling-2.37.0 → docling-2.38.0}/docling/datamodel/pipeline_options.py +12 -1
- docling-2.38.0/docling/datamodel/pipeline_options_asr_model.py +57 -0
- {docling-2.37.0 → docling-2.38.0}/docling/datamodel/pipeline_options_vlm_model.py +2 -3
- {docling-2.37.0 → docling-2.38.0}/docling/document_converter.py +8 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/api_vlm_model.py +3 -1
- {docling-2.37.0 → docling-2.38.0}/docling/models/base_model.py +1 -1
- {docling-2.37.0 → docling-2.38.0}/docling/models/readingorder_model.py +1 -1
- {docling-2.37.0 → docling-2.38.0}/docling/models/vlm_models_inline/hf_transformers_model.py +3 -1
- {docling-2.37.0 → docling-2.38.0}/docling/models/vlm_models_inline/mlx_model.py +3 -1
- docling-2.38.0/docling/pipeline/asr_pipeline.py +253 -0
- {docling-2.37.0 → docling-2.38.0}/docling/pipeline/base_pipeline.py +11 -0
- {docling-2.37.0 → docling-2.38.0}/docling.egg-info/PKG-INFO +7 -4
- {docling-2.37.0 → docling-2.38.0}/docling.egg-info/SOURCES.txt +5 -0
- {docling-2.37.0 → docling-2.38.0}/docling.egg-info/requires.txt +3 -0
- {docling-2.37.0 → docling-2.38.0}/pyproject.toml +7 -1
- docling-2.38.0/tests/test_asr_pipeline.py +59 -0
- docling-2.38.0/tests/test_backend_markdown.py +52 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_msword.py +44 -4
- {docling-2.37.0 → docling-2.38.0}/tests/test_code_formula.py +22 -0
- docling-2.37.0/tests/test_backend_markdown.py +0 -41
- {docling-2.37.0 → docling-2.38.0}/LICENSE +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/html_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/chunking/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/cli/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/cli/models.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/cli/tools.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/datamodel/settings.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/exceptions.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/layout_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/py.typed +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/__init__.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/export.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/locks.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/orientation.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/profiling.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/utils.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling/utils/visualization.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.37.0 → docling-2.38.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.37.0 → docling-2.38.0}/setup.cfg +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_csv.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_html.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_jats.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_backend_webp.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_cli.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_input_doc.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_interfaces.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_invalid_input.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_options.py +0 -0
- {docling-2.37.0 → docling-2.38.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.37.0
|
3
|
+
Version: 2.38.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -61,6 +61,8 @@ Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "d
|
|
61
61
|
Provides-Extra: rapidocr
|
62
62
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
63
63
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
64
|
+
Provides-Extra: asr
|
65
|
+
Requires-Dist: openai-whisper>=20240930; extra == "asr"
|
64
66
|
Dynamic: license-file
|
65
67
|
|
66
68
|
<p align="center">
|
@@ -93,14 +95,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
93
95
|
|
94
96
|
## Features
|
95
97
|
|
96
|
-
* 🗂️
|
98
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
97
99
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
98
100
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
99
|
-
* ↪️
|
101
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
100
102
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
101
103
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
102
104
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
103
|
-
*
|
105
|
+
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
106
|
+
* 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
|
104
107
|
* 💻 Simple and convenient CLI
|
105
108
|
|
106
109
|
### Coming soon
|
@@ -28,14 +28,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
28
28
|
|
29
29
|
## Features
|
30
30
|
|
31
|
-
* 🗂️
|
31
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
32
32
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
33
33
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
34
|
-
* ↪️
|
34
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
35
35
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
36
36
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
37
37
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
38
|
-
*
|
38
|
+
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
39
|
+
* 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
|
39
40
|
* 💻 Simple and convenient CLI
|
40
41
|
|
41
42
|
### Coming soon
|
@@ -1,17 +1,15 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
3
|
import warnings
|
4
|
+
from copy import deepcopy
|
4
5
|
from io import BytesIO
|
5
6
|
from pathlib import Path
|
6
7
|
from typing import List, Optional, Set, Union
|
7
8
|
|
8
9
|
import marko
|
9
10
|
import marko.element
|
10
|
-
import marko.ext
|
11
|
-
import marko.ext.gfm
|
12
11
|
import marko.inline
|
13
12
|
from docling_core.types.doc import (
|
14
|
-
DocItem,
|
15
13
|
DocItemLabel,
|
16
14
|
DoclingDocument,
|
17
15
|
DocumentOrigin,
|
@@ -21,7 +19,9 @@ from docling_core.types.doc import (
|
|
21
19
|
TableData,
|
22
20
|
TextItem,
|
23
21
|
)
|
22
|
+
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
24
23
|
from marko import Markdown
|
24
|
+
from pydantic import AnyUrl, TypeAdapter
|
25
25
|
|
26
26
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
27
27
|
from docling.backend.html_backend import HTMLDocumentBackend
|
@@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
71
71
|
|
72
72
|
self.in_table = False
|
73
73
|
self.md_table_buffer: list[str] = []
|
74
|
-
self.inline_texts: list[str] = []
|
75
74
|
self._html_blocks: int = 0
|
76
75
|
|
77
76
|
try:
|
@@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
156
155
|
doc.add_table(data=table_data)
|
157
156
|
return
|
158
157
|
|
159
|
-
def _process_inline_text(
|
160
|
-
self, parent_item: Optional[NodeItem], doc: DoclingDocument
|
161
|
-
):
|
162
|
-
txt = " ".join(self.inline_texts)
|
163
|
-
if len(txt) > 0:
|
164
|
-
doc.add_text(
|
165
|
-
label=DocItemLabel.PARAGRAPH,
|
166
|
-
parent=parent_item,
|
167
|
-
text=txt,
|
168
|
-
)
|
169
|
-
self.inline_texts = []
|
170
|
-
|
171
158
|
def _iterate_elements( # noqa: C901
|
172
159
|
self,
|
160
|
+
*,
|
173
161
|
element: marko.element.Element,
|
174
162
|
depth: int,
|
175
163
|
doc: DoclingDocument,
|
176
164
|
visited: Set[marko.element.Element],
|
177
165
|
parent_item: Optional[NodeItem] = None,
|
166
|
+
formatting: Optional[Formatting] = None,
|
167
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
178
168
|
):
|
179
169
|
if element in visited:
|
180
170
|
return
|
@@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
183
173
|
# Check for different element types and process relevant details
|
184
174
|
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
185
175
|
self._close_table(doc)
|
186
|
-
self._process_inline_text(parent_item, doc)
|
187
176
|
_log.debug(
|
188
177
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
189
178
|
)
|
179
|
+
|
180
|
+
if len(element.children) == 1:
|
181
|
+
child = element.children[0]
|
182
|
+
snippet_text = str(child.children) # type: ignore
|
183
|
+
visited.add(child)
|
184
|
+
else:
|
185
|
+
snippet_text = "" # inline group will be created
|
186
|
+
|
190
187
|
if element.level == 1:
|
191
|
-
|
188
|
+
parent_item = doc.add_title(
|
189
|
+
text=snippet_text,
|
190
|
+
parent=parent_item,
|
191
|
+
formatting=formatting,
|
192
|
+
hyperlink=hyperlink,
|
193
|
+
)
|
192
194
|
else:
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
def traverse(node: marko.block.BlockElement):
|
201
|
-
# Check if the node has a "children" attribute
|
202
|
-
if hasattr(node, "children"):
|
203
|
-
# If "children" is a list, continue traversal
|
204
|
-
if isinstance(node.children, list):
|
205
|
-
for child in node.children:
|
206
|
-
traverse(child)
|
207
|
-
# If "children" is text, add it to header text
|
208
|
-
elif isinstance(node.children, str):
|
209
|
-
strings.append(node.children)
|
210
|
-
|
211
|
-
traverse(element)
|
212
|
-
snippet_text = "".join(strings)
|
213
|
-
if len(snippet_text) > 0:
|
214
|
-
if doc_label == DocItemLabel.SECTION_HEADER:
|
215
|
-
parent_item = doc.add_heading(
|
216
|
-
text=snippet_text,
|
217
|
-
level=element.level - 1,
|
218
|
-
parent=parent_item,
|
219
|
-
)
|
220
|
-
else:
|
221
|
-
parent_item = doc.add_text(
|
222
|
-
label=doc_label, parent=parent_item, text=snippet_text
|
223
|
-
)
|
195
|
+
parent_item = doc.add_heading(
|
196
|
+
text=snippet_text,
|
197
|
+
level=element.level - 1,
|
198
|
+
parent=parent_item,
|
199
|
+
formatting=formatting,
|
200
|
+
hyperlink=hyperlink,
|
201
|
+
)
|
224
202
|
|
225
203
|
elif isinstance(element, marko.block.List):
|
226
204
|
has_non_empty_list_items = False
|
@@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
230
208
|
break
|
231
209
|
|
232
210
|
self._close_table(doc)
|
233
|
-
self._process_inline_text(parent_item, doc)
|
234
211
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
235
212
|
if has_non_empty_list_items:
|
236
213
|
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
@@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
240
217
|
|
241
218
|
elif (
|
242
219
|
isinstance(element, marko.block.ListItem)
|
243
|
-
and len(element.children)
|
244
|
-
and isinstance((
|
220
|
+
and len(element.children) == 1
|
221
|
+
and isinstance((child := element.children[0]), marko.block.Paragraph)
|
222
|
+
and len(child.children) > 0
|
245
223
|
):
|
246
224
|
self._close_table(doc)
|
247
|
-
self._process_inline_text(parent_item, doc)
|
248
225
|
_log.debug(" - List item")
|
249
226
|
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
):
|
257
|
-
|
258
|
-
|
259
|
-
|
227
|
+
if len(child.children) == 1:
|
228
|
+
snippet_text = str(child.children[0].children) # type: ignore
|
229
|
+
visited.add(child)
|
230
|
+
else:
|
231
|
+
snippet_text = "" # inline group will be created
|
232
|
+
is_numbered = isinstance(parent_item, OrderedList)
|
233
|
+
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
234
|
+
_log.warning("ListItem would have not had a list parent, adding one.")
|
235
|
+
parent_item = doc.add_unordered_list(parent=parent_item)
|
236
|
+
parent_item = doc.add_list_item(
|
237
|
+
enumerated=is_numbered,
|
238
|
+
parent=parent_item,
|
239
|
+
text=snippet_text,
|
240
|
+
formatting=formatting,
|
241
|
+
hyperlink=hyperlink,
|
260
242
|
)
|
261
|
-
visited.add(first_child)
|
262
243
|
|
263
244
|
elif isinstance(element, marko.inline.Image):
|
264
245
|
self._close_table(doc)
|
265
|
-
self._process_inline_text(parent_item, doc)
|
266
246
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
267
247
|
|
268
248
|
fig_caption: Optional[TextItem] = None
|
269
249
|
if element.title is not None and element.title != "":
|
270
250
|
fig_caption = doc.add_text(
|
271
|
-
label=DocItemLabel.CAPTION,
|
251
|
+
label=DocItemLabel.CAPTION,
|
252
|
+
text=element.title,
|
253
|
+
formatting=formatting,
|
254
|
+
hyperlink=hyperlink,
|
272
255
|
)
|
273
256
|
|
274
257
|
doc.add_picture(parent=parent_item, caption=fig_caption)
|
275
258
|
|
276
|
-
elif isinstance(element, marko.
|
277
|
-
|
259
|
+
elif isinstance(element, marko.inline.Emphasis):
|
260
|
+
_log.debug(f" - Emphasis: {element.children}")
|
261
|
+
formatting = deepcopy(formatting) if formatting else Formatting()
|
262
|
+
formatting.italic = True
|
263
|
+
|
264
|
+
elif isinstance(element, marko.inline.StrongEmphasis):
|
265
|
+
_log.debug(f" - StrongEmphasis: {element.children}")
|
266
|
+
formatting = deepcopy(formatting) if formatting else Formatting()
|
267
|
+
formatting.bold = True
|
268
|
+
|
269
|
+
elif isinstance(element, marko.inline.Link):
|
270
|
+
_log.debug(f" - Link: {element.children}")
|
271
|
+
hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
|
272
|
+
element.dest
|
273
|
+
)
|
278
274
|
|
279
275
|
elif isinstance(element, marko.inline.RawText):
|
280
276
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
@@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
287
283
|
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
288
284
|
else:
|
289
285
|
self.md_table_buffer.append(snippet_text)
|
290
|
-
|
286
|
+
elif snippet_text:
|
291
287
|
self._close_table(doc)
|
292
|
-
|
293
|
-
|
288
|
+
doc.add_text(
|
289
|
+
label=DocItemLabel.TEXT,
|
290
|
+
parent=parent_item,
|
291
|
+
text=snippet_text,
|
292
|
+
formatting=formatting,
|
293
|
+
hyperlink=hyperlink,
|
294
|
+
)
|
294
295
|
|
295
296
|
elif isinstance(element, marko.inline.CodeSpan):
|
296
297
|
self._close_table(doc)
|
297
|
-
self._process_inline_text(parent_item, doc)
|
298
298
|
_log.debug(f" - Code Span: {element.children}")
|
299
299
|
snippet_text = str(element.children).strip()
|
300
|
-
doc.add_code(
|
300
|
+
doc.add_code(
|
301
|
+
parent=parent_item,
|
302
|
+
text=snippet_text,
|
303
|
+
formatting=formatting,
|
304
|
+
hyperlink=hyperlink,
|
305
|
+
)
|
301
306
|
|
302
307
|
elif (
|
303
308
|
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
304
309
|
and len(element.children) > 0
|
305
|
-
and isinstance((
|
306
|
-
and len(snippet_text := (
|
310
|
+
and isinstance((child := element.children[0]), marko.inline.RawText)
|
311
|
+
and len(snippet_text := (child.children.strip())) > 0
|
307
312
|
):
|
308
313
|
self._close_table(doc)
|
309
|
-
self._process_inline_text(parent_item, doc)
|
310
314
|
_log.debug(f" - Code Block: {element.children}")
|
311
|
-
doc.add_code(
|
315
|
+
doc.add_code(
|
316
|
+
parent=parent_item,
|
317
|
+
text=snippet_text,
|
318
|
+
formatting=formatting,
|
319
|
+
hyperlink=hyperlink,
|
320
|
+
)
|
312
321
|
|
313
322
|
elif isinstance(element, marko.inline.LineBreak):
|
314
323
|
if self.in_table:
|
@@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
317
326
|
|
318
327
|
elif isinstance(element, marko.block.HTMLBlock):
|
319
328
|
self._html_blocks += 1
|
320
|
-
self._process_inline_text(parent_item, doc)
|
321
329
|
self._close_table(doc)
|
322
330
|
_log.debug(f"HTML Block: {element}")
|
323
331
|
if (
|
@@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
327
335
|
|
328
336
|
# wrap in markers to enable post-processing in convert()
|
329
337
|
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
330
|
-
doc.add_code(
|
338
|
+
doc.add_code(
|
339
|
+
parent=parent_item,
|
340
|
+
text=text_to_add,
|
341
|
+
formatting=formatting,
|
342
|
+
hyperlink=hyperlink,
|
343
|
+
)
|
331
344
|
else:
|
332
345
|
if not isinstance(element, str):
|
333
346
|
self._close_table(doc)
|
334
347
|
_log.debug(f"Some other element: {element}")
|
335
348
|
|
349
|
+
if (
|
350
|
+
isinstance(element, (marko.block.Paragraph, marko.block.Heading))
|
351
|
+
and len(element.children) > 1
|
352
|
+
):
|
353
|
+
parent_item = doc.add_inline_group(parent=parent_item)
|
354
|
+
|
336
355
|
processed_block_types = (
|
337
|
-
marko.block.Heading,
|
356
|
+
# marko.block.Heading,
|
338
357
|
marko.block.CodeBlock,
|
339
358
|
marko.block.FencedCode,
|
340
359
|
marko.inline.RawText,
|
@@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
351
370
|
doc=doc,
|
352
371
|
visited=visited,
|
353
372
|
parent_item=parent_item,
|
373
|
+
formatting=formatting,
|
374
|
+
hyperlink=hyperlink,
|
354
375
|
)
|
355
376
|
|
356
377
|
def is_valid(self) -> bool:
|
@@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
392
413
|
parent_item=None,
|
393
414
|
visited=set(),
|
394
415
|
)
|
395
|
-
self._process_inline_text(None, doc) # handle last hanging inline text
|
396
416
|
self._close_table(doc=doc) # handle any last hanging table
|
397
417
|
|
398
418
|
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
@@ -14,7 +14,7 @@ from docling_core.types.doc import (
|
|
14
14
|
TableCell,
|
15
15
|
TableData,
|
16
16
|
)
|
17
|
-
from docling_core.types.doc.document import Formatting
|
17
|
+
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
18
18
|
from docx import Document
|
19
19
|
from docx.document import Document as DocxDocument
|
20
20
|
from docx.oxml.table import CT_Tc
|
@@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
84
84
|
self.valid = True
|
85
85
|
except Exception as e:
|
86
86
|
raise RuntimeError(
|
87
|
-
f"
|
87
|
+
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
88
88
|
) from e
|
89
89
|
|
90
90
|
@override
|
@@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
251
251
|
self._handle_tables(element, docx_obj, doc)
|
252
252
|
except Exception:
|
253
253
|
_log.debug("could not parse a table, broken docx table")
|
254
|
-
|
254
|
+
# Check for Image
|
255
255
|
elif drawing_blip:
|
256
256
|
self._handle_pictures(docx_obj, drawing_blip, doc)
|
257
|
+
# Check for Text after the Image
|
258
|
+
if (
|
259
|
+
tag_name in ["p"]
|
260
|
+
and element.find(".//w:t", namespaces=namespaces) is not None
|
261
|
+
):
|
262
|
+
self._handle_text_elements(element, docx_obj, doc)
|
257
263
|
# Check for the sdt containers, like table of contents
|
258
264
|
elif tag_name in ["sdt"]:
|
259
265
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
@@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
268
274
|
self._handle_text_elements(element, docx_obj, doc)
|
269
275
|
else:
|
270
276
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
277
|
+
|
271
278
|
return doc
|
272
279
|
|
273
280
|
def _str_to_int(
|
@@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
578
585
|
all_paragraphs = []
|
579
586
|
|
580
587
|
# Sort paragraphs within each container, then process containers
|
581
|
-
for
|
588
|
+
for paragraphs in container_paragraphs.values():
|
582
589
|
# Sort by vertical position within each container
|
583
590
|
sorted_container_paragraphs = sorted(
|
584
591
|
paragraphs,
|
@@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
689
696
|
doc: DoclingDocument,
|
690
697
|
) -> None:
|
691
698
|
paragraph = Paragraph(element, docx_obj)
|
692
|
-
|
699
|
+
paragraph_elements = self._get_paragraph_elements(paragraph)
|
693
700
|
text, equations = self._handle_equations_in_text(
|
694
701
|
element=element, text=paragraph.text
|
695
702
|
)
|
696
703
|
|
697
704
|
if text is None:
|
698
705
|
return
|
699
|
-
paragraph_elements = self._get_paragraph_elements(paragraph)
|
700
706
|
text = text.strip()
|
701
707
|
|
702
708
|
# Common styles for bullet and numbered lists.
|
@@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
912
918
|
)
|
913
919
|
return
|
914
920
|
|
921
|
+
def _add_formatted_list_item(
|
922
|
+
self,
|
923
|
+
doc: DoclingDocument,
|
924
|
+
elements: list,
|
925
|
+
marker: str,
|
926
|
+
enumerated: bool,
|
927
|
+
level: int,
|
928
|
+
) -> None:
|
929
|
+
# This should not happen by construction
|
930
|
+
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
|
931
|
+
return
|
932
|
+
if len(elements) == 1:
|
933
|
+
text, format, hyperlink = elements[0]
|
934
|
+
doc.add_list_item(
|
935
|
+
marker=marker,
|
936
|
+
enumerated=enumerated,
|
937
|
+
parent=self.parents[level],
|
938
|
+
text=text,
|
939
|
+
formatting=format,
|
940
|
+
hyperlink=hyperlink,
|
941
|
+
)
|
942
|
+
else:
|
943
|
+
new_item = doc.add_list_item(
|
944
|
+
marker=marker,
|
945
|
+
enumerated=enumerated,
|
946
|
+
parent=self.parents[level],
|
947
|
+
text="",
|
948
|
+
)
|
949
|
+
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
|
950
|
+
for text, format, hyperlink in elements:
|
951
|
+
doc.add_text(
|
952
|
+
label=DocItemLabel.TEXT,
|
953
|
+
parent=new_parent,
|
954
|
+
text=text,
|
955
|
+
formatting=format,
|
956
|
+
hyperlink=hyperlink,
|
957
|
+
)
|
958
|
+
|
915
959
|
def _add_list_item(
|
916
960
|
self,
|
917
961
|
*,
|
@@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
921
965
|
elements: list,
|
922
966
|
is_numbered: bool = False,
|
923
967
|
) -> None:
|
968
|
+
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
|
969
|
+
if not elements:
|
970
|
+
return None
|
924
971
|
enum_marker = ""
|
925
972
|
|
926
973
|
level = self._get_level()
|
@@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
937
984
|
if is_numbered:
|
938
985
|
enum_marker = str(self.listIter) + "."
|
939
986
|
is_numbered = True
|
940
|
-
|
941
|
-
doc
|
942
|
-
prev_parent=self.parents[level],
|
943
|
-
paragraph_elements=elements,
|
987
|
+
self._add_formatted_list_item(
|
988
|
+
doc, elements, enum_marker, is_numbered, level
|
944
989
|
)
|
945
|
-
for text, format, hyperlink in elements:
|
946
|
-
doc.add_list_item(
|
947
|
-
marker=enum_marker,
|
948
|
-
enumerated=is_numbered,
|
949
|
-
parent=new_parent,
|
950
|
-
text=text,
|
951
|
-
formatting=format,
|
952
|
-
hyperlink=hyperlink,
|
953
|
-
)
|
954
|
-
|
955
990
|
elif (
|
956
991
|
self._prev_numid() == numid
|
957
992
|
and self.level_at_new_list is not None
|
@@ -981,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
981
1016
|
if is_numbered:
|
982
1017
|
enum_marker = str(self.listIter) + "."
|
983
1018
|
is_numbered = True
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
|
1019
|
+
self._add_formatted_list_item(
|
1020
|
+
doc,
|
1021
|
+
elements,
|
1022
|
+
enum_marker,
|
1023
|
+
is_numbered,
|
1024
|
+
self.level_at_new_list + ilevel,
|
989
1025
|
)
|
990
|
-
for text, format, hyperlink in elements:
|
991
|
-
doc.add_list_item(
|
992
|
-
marker=enum_marker,
|
993
|
-
enumerated=is_numbered,
|
994
|
-
parent=new_parent,
|
995
|
-
text=text,
|
996
|
-
formatting=format,
|
997
|
-
hyperlink=hyperlink,
|
998
|
-
)
|
999
1026
|
elif (
|
1000
1027
|
self._prev_numid() == numid
|
1001
1028
|
and self.level_at_new_list is not None
|
1002
1029
|
and prev_indent is not None
|
1003
1030
|
and ilevel < prev_indent
|
1004
1031
|
): # Close list
|
1005
|
-
for k
|
1032
|
+
for k in self.parents:
|
1006
1033
|
if k > self.level_at_new_list + ilevel:
|
1007
1034
|
self.parents[k] = None
|
1008
1035
|
|
@@ -1011,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1011
1038
|
if is_numbered:
|
1012
1039
|
enum_marker = str(self.listIter) + "."
|
1013
1040
|
is_numbered = True
|
1014
|
-
|
1015
|
-
doc
|
1016
|
-
|
1017
|
-
|
1041
|
+
self._add_formatted_list_item(
|
1042
|
+
doc,
|
1043
|
+
elements,
|
1044
|
+
enum_marker,
|
1045
|
+
is_numbered,
|
1046
|
+
self.level_at_new_list + ilevel,
|
1018
1047
|
)
|
1019
|
-
for text, format, hyperlink in elements:
|
1020
|
-
doc.add_list_item(
|
1021
|
-
marker=enum_marker,
|
1022
|
-
enumerated=is_numbered,
|
1023
|
-
parent=new_parent,
|
1024
|
-
text=text,
|
1025
|
-
formatting=format,
|
1026
|
-
hyperlink=hyperlink,
|
1027
|
-
)
|
1028
1048
|
self.listIter = 0
|
1029
1049
|
|
1030
1050
|
elif self._prev_numid() == numid or prev_indent == ilevel:
|
@@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1033
1053
|
if is_numbered:
|
1034
1054
|
enum_marker = str(self.listIter) + "."
|
1035
1055
|
is_numbered = True
|
1036
|
-
|
1037
|
-
doc
|
1038
|
-
prev_parent=self.parents[level - 1],
|
1039
|
-
paragraph_elements=elements,
|
1056
|
+
self._add_formatted_list_item(
|
1057
|
+
doc, elements, enum_marker, is_numbered, level - 1
|
1040
1058
|
)
|
1041
|
-
|
1042
|
-
# Add the list item to the parent group
|
1043
|
-
doc.add_list_item(
|
1044
|
-
marker=enum_marker,
|
1045
|
-
enumerated=is_numbered,
|
1046
|
-
parent=new_parent,
|
1047
|
-
text=text,
|
1048
|
-
formatting=format,
|
1049
|
-
hyperlink=hyperlink,
|
1050
|
-
)
|
1059
|
+
|
1051
1060
|
return
|
1052
1061
|
|
1053
1062
|
def _handle_tables(
|
@@ -0,0 +1,51 @@
|
|
1
|
+
import logging
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Set, Union
|
5
|
+
|
6
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
7
|
+
from docling.datamodel.base_models import InputFormat
|
8
|
+
from docling.datamodel.document import InputDocument
|
9
|
+
|
10
|
+
_log = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class NoOpBackend(AbstractDocumentBackend):
|
14
|
+
"""
|
15
|
+
A no-op backend that only validates input existence.
|
16
|
+
Used e.g. for audio files where actual processing is handled by the ASR pipeline.
|
17
|
+
"""
|
18
|
+
|
19
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
20
|
+
super().__init__(in_doc, path_or_stream)
|
21
|
+
|
22
|
+
_log.debug(f"NoOpBackend initialized for: {path_or_stream}")
|
23
|
+
|
24
|
+
# Validate input
|
25
|
+
try:
|
26
|
+
if isinstance(self.path_or_stream, BytesIO):
|
27
|
+
# Check if stream has content
|
28
|
+
self.valid = len(self.path_or_stream.getvalue()) > 0
|
29
|
+
_log.debug(
|
30
|
+
f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
|
31
|
+
)
|
32
|
+
elif isinstance(self.path_or_stream, Path):
|
33
|
+
# Check if file exists
|
34
|
+
self.valid = self.path_or_stream.exists()
|
35
|
+
_log.debug(f"File exists: {self.valid}")
|
36
|
+
else:
|
37
|
+
self.valid = False
|
38
|
+
except Exception as e:
|
39
|
+
_log.error(f"NoOpBackend validation failed: {e}")
|
40
|
+
self.valid = False
|
41
|
+
|
42
|
+
def is_valid(self) -> bool:
|
43
|
+
return self.valid
|
44
|
+
|
45
|
+
@classmethod
|
46
|
+
def supports_pagination(cls) -> bool:
|
47
|
+
return False
|
48
|
+
|
49
|
+
@classmethod
|
50
|
+
def supported_formats(cls) -> Set[InputFormat]:
|
51
|
+
return set(InputFormat)
|