docling 2.42.0__tar.gz → 2.42.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.42.0 → docling-2.42.2}/PKG-INFO +2 -1
- {docling-2.42.0 → docling-2.42.2}/README.md +1 -0
- docling-2.42.2/docling/backend/html_backend.py +570 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/msword_backend.py +10 -1
- {docling-2.42.0 → docling-2.42.2}/docling/backend/pdf_backend.py +25 -1
- {docling-2.42.0 → docling-2.42.2}/docling/pipeline/base_pipeline.py +7 -1
- {docling-2.42.0 → docling-2.42.2}/docling/utils/layout_postprocessor.py +7 -2
- {docling-2.42.0 → docling-2.42.2}/docling.egg-info/PKG-INFO +2 -1
- {docling-2.42.0 → docling-2.42.2}/pyproject.toml +1 -1
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_html.py +2 -6
- {docling-2.42.0 → docling-2.42.2}/tests/test_input_doc.py +23 -0
- docling-2.42.0/docling/backend/html_backend.py +0 -585
- {docling-2.42.0 → docling-2.42.2}/LICENSE +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/abstract_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/csv_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/docx/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/json/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/md_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/noop_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/xml/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/chunking/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/cli/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/cli/main.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/cli/models.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/cli/tools.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/datamodel/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/datamodel/base_models.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/datamodel/document.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/datamodel/settings.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/document_converter.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/exceptions.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/api_vlm_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/base_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/base_ocr_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/code_formula_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/easyocr_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/factories/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/factories/base_factory.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/layout_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/page_assemble_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/plugins/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/plugins/defaults.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/readingorder_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/table_structure_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/utils/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/pipeline/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/py.typed +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/__init__.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/api_image_request.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/export.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/glm_utils.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/locks.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/model_downloader.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/ocr_utils.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/orientation.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/profiling.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/utils.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling/utils/visualization.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling.egg-info/SOURCES.txt +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling.egg-info/requires.txt +0 -0
- {docling-2.42.0 → docling-2.42.2}/docling.egg-info/top_level.txt +0 -0
- {docling-2.42.0 → docling-2.42.2}/setup.cfg +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_asr_pipeline.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_csv.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_docling_json.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_jats.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_markdown.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_msexcel.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_msword.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_pdfium.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_pptx.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_backend_webp.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_cli.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_code_formula.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_data_gen_flag.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_e2e_conversion.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_interfaces.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_invalid_input.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_ocr_utils.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_options.py +0 -0
- {docling-2.42.0 → docling-2.42.2}/tests/test_settings_load.py +0 -0
{docling-2.42.0 → docling-2.42.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.42.0
+Version: 2.42.2
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -89,6 +89,7 @@ Dynamic: license-file
 [](https://opensource.org/licenses/MIT)
 [](https://pepy.tech/projects/docling)
 [](https://apify.com/vancura/docling)
+[](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
 [](https://www.bestpractices.dev/projects/10101)
 [](https://lfaidata.foundation/projects/)
 
{docling-2.42.0 → docling-2.42.2}/README.md

@@ -21,6 +21,7 @@
 [](https://opensource.org/licenses/MIT)
 [](https://pepy.tech/projects/docling)
 [](https://apify.com/vancura/docling)
+[](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
 [](https://www.bestpractices.dev/projects/10101)
 [](https://lfaidata.foundation/projects/)
 
docling-2.42.2/docling/backend/html_backend.py (new file; every line below is an addition)

@@ -0,0 +1,570 @@
import logging
import re
import traceback
from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union, cast

from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4.element import PreformattedString
from docling_core.types.doc import (
    DocItem,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
    GroupItem,
    GroupLabel,
    TableCell,
    TableData,
    TextItem,
)
from docling_core.types.doc.document import ContentLayer
from pydantic import BaseModel
from typing_extensions import override

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)

DEFAULT_IMAGE_WIDTH = 128
DEFAULT_IMAGE_HEIGHT = 128

# Tags that initiate distinct Docling items
_BLOCK_TAGS: Final = {
    "address",
    "details",
    "figure",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "p",
    "pre",
    "code",
    "ul",
    "ol",
    "summary",
    "table",
}


class _Context(BaseModel):
    list_ordered_flag_by_ref: dict[str, bool] = {}
    list_start_by_ref: dict[str, int] = {}


class HTMLDocumentBackend(DeclarativeDocumentBackend):
    @override
    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
    ):
        super().__init__(in_doc, path_or_stream)
        self.soup: Optional[Tag] = None
        self.path_or_stream = path_or_stream

        # Initialize the parents for the hierarchy
        self.max_levels = 10
        self.level = 0
        self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
        self.ctx = _Context()
        for i in range(self.max_levels):
            self.parents[i] = None

        try:
            raw = (
                path_or_stream.getvalue()
                if isinstance(path_or_stream, BytesIO)
                else Path(path_or_stream).read_bytes()
            )
            self.soup = BeautifulSoup(raw, "html.parser")
        except Exception as e:
            raise RuntimeError(
                "Could not initialize HTML backend for file with "
                f"hash {self.document_hash}."
            ) from e

    @override
    def is_valid(self) -> bool:
        return self.soup is not None

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        return False

    @override
    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.HTML}

    @override
    def convert(self) -> DoclingDocument:
        _log.debug("Starting HTML conversion...")
        if not self.is_valid():
            raise RuntimeError("Invalid HTML document.")

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/html",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        assert self.soup is not None
        # set the title as furniture, since it is part of the document metadata
        title = self.soup.title
        if title:
            doc.add_title(
                text=title.get_text(separator=" ", strip=True),
                content_layer=ContentLayer.FURNITURE,
            )
        # remove scripts/styles
        for tag in self.soup(["script", "style"]):
            tag.decompose()
        content = self.soup.body or self.soup
        # normalize <br> tags
        for br in content("br"):
            br.replace_with(NavigableString("\n"))
        # set default content layer
        headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
        self.content_layer = (
            ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
        )
        # reset context
        self.ctx = _Context()

        try:
            self._walk(content, doc)
        except Exception:
            print(traceback.format_exc())

        return doc

    def _walk(self, element: Tag, doc: DoclingDocument) -> None:
        """Parse an XML tag by recursively walking its content.

        While walking, the method buffers inline text across tags like <b> or <span>,
        emitting text nodes only at block boundaries.

        Args:
            element: The XML tag to parse.
            doc: The Docling document to be updated with the parsed content.
        """
        buffer: list[str] = []

        def flush_buffer():
            if not buffer:
                return
            text = "".join(buffer).strip()
            buffer.clear()
            if not text:
                return
            for part in text.split("\n"):
                seg = part.strip()
                if seg:
                    doc.add_text(
                        DocItemLabel.TEXT,
                        seg,
                        parent=self.parents[self.level],
                        content_layer=self.content_layer,
                    )

        for node in element.contents:
            if isinstance(node, Tag):
                name = node.name.lower()
                if name == "img":
                    flush_buffer()
                    self._emit_image(node, doc)
                elif name in _BLOCK_TAGS:
                    flush_buffer()
                    self._handle_block(node, doc)
                elif node.find(_BLOCK_TAGS):
                    flush_buffer()
                    self._walk(node, doc)
                else:
                    buffer.append(node.text)
            elif isinstance(node, NavigableString) and not isinstance(
                node, PreformattedString
            ):
                buffer.append(str(node))

        flush_buffer()

    def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
        tag_name = tag.name.lower()
        # set default content layer to BODY as soon as we encounter a heading
        self.content_layer = ContentLayer.BODY
        level = int(tag_name[1])
        text = tag.get_text(strip=True, separator=" ")
        # the first level is for the title item
        if level == 1:
            for key in self.parents.keys():
                self.parents[key] = None
            self.level = 0
            self.parents[self.level + 1] = doc.add_title(
                text, content_layer=self.content_layer
            )
        # the other levels need to be lowered by 1 if a title was set
        else:
            level -= 1
            if level > self.level:
                # add invisible group
                for i in range(self.level, level):
                    _log.debug(f"Adding invisible group to level {i}")
                    self.parents[i + 1] = doc.add_group(
                        name=f"header-{i + 1}",
                        label=GroupLabel.SECTION,
                        parent=self.parents[i],
                        content_layer=self.content_layer,
                    )
                self.level = level
            elif level < self.level:
                # remove the tail
                for key in self.parents.keys():
                    if key > level + 1:
                        _log.debug(f"Remove the tail of level {key}")
                        self.parents[key] = None
                self.level = level
            self.parents[self.level + 1] = doc.add_heading(
                parent=self.parents[self.level],
                text=text,
                level=self.level,
                content_layer=self.content_layer,
            )
            self.level += 1
        for img_tag in tag("img"):
            if isinstance(img_tag, Tag):
                self._emit_image(img_tag, doc)

    def _handle_list(self, tag: Tag, doc: DoclingDocument) -> None:
        tag_name = tag.name.lower()
        start: Optional[int] = None
        name: str = ""
        is_ordered = tag_name == "ol"
        if is_ordered:
            start_attr = tag.get("start")
            if isinstance(start_attr, str) and start_attr.isnumeric():
                start = int(start_attr)
            name = "ordered list" + (f" start {start}" if start is not None else "")
        else:
            name = "list"
        # Create the list container
        list_group = doc.add_list_group(
            name=name,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )
        self.parents[self.level + 1] = list_group
        self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
        if is_ordered and start is not None:
            self.ctx.list_start_by_ref[list_group.self_ref] = start
        self.level += 1

        # For each top-level <li> in this list
        for li in tag.find_all({"li", "ul", "ol"}, recursive=False):
            if not isinstance(li, Tag):
                continue

            # sub-list items should be indented under main list items, but temporarily
            # addressing invalid HTML (docling-core/issues/357)
            if li.name in {"ul", "ol"}:
                self._handle_block(li, doc)

            else:
                # 1) determine the marker
                if is_ordered and start is not None:
                    marker = f"{start + len(list_group.children)}."
                else:
                    marker = ""

                # 2) extract only the "direct" text from this <li>
                parts: list[str] = []
                for child in li.contents:
                    if isinstance(child, NavigableString) and not isinstance(
                        child, PreformattedString
                    ):
                        parts.append(child)
                    elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
                        text_part = HTMLDocumentBackend.get_text(child)
                        if text_part:
                            parts.append(text_part)
                li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()

                # 3) add the list item
                if li_text:
                    self.parents[self.level + 1] = doc.add_list_item(
                        text=li_text,
                        enumerated=is_ordered,
                        marker=marker,
                        parent=list_group,
                        content_layer=self.content_layer,
                    )

                    # 4) recurse into any nested lists, attaching them to this <li> item
                    for sublist in li({"ul", "ol"}, recursive=False):
                        if isinstance(sublist, Tag):
                            self.level += 1
                            self._handle_block(sublist, doc)
                            self.parents[self.level + 1] = None
                            self.level -= 1
                else:
                    for sublist in li({"ul", "ol"}, recursive=False):
                        if isinstance(sublist, Tag):
                            self._handle_block(sublist, doc)

                # 5) extract any images under this <li>
                for img_tag in li("img"):
                    if isinstance(img_tag, Tag):
                        self._emit_image(img_tag, doc)

        self.parents[self.level + 1] = None
        self.level -= 1

    def _handle_block(self, tag: Tag, doc: DoclingDocument) -> None:
        tag_name = tag.name.lower()

        if tag_name == "figure":
            img_tag = tag.find("img")
            if isinstance(img_tag, Tag):
                self._emit_image(img_tag, doc)

        elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            self._handle_heading(tag, doc)

        elif tag_name in {"ul", "ol"}:
            self._handle_list(tag, doc)

        elif tag_name in {"p", "address", "summary"}:
            for part in tag.text.split("\n"):
                seg = part.strip()
                if seg:
                    doc.add_text(
                        parent=self.parents[self.level],
                        label=DocItemLabel.TEXT,
                        text=seg,
                        content_layer=self.content_layer,
                    )
            for img_tag in tag("img"):
                if isinstance(img_tag, Tag):
                    self._emit_image(img_tag, doc)

        elif tag_name == "table":
            data = HTMLDocumentBackend.parse_table_data(tag)
            for img_tag in tag("img"):
                if isinstance(img_tag, Tag):
                    self._emit_image(tag, doc)
            if data is not None:
                doc.add_table(
                    data=data,
                    parent=self.parents[self.level],
                    content_layer=self.content_layer,
                )

        elif tag_name in {"pre", "code"}:
            # handle monospace code snippets (pre).
            text = tag.get_text(strip=True)
            if text:
                doc.add_code(
                    parent=self.parents[self.level],
                    text=text,
                    content_layer=self.content_layer,
                )

        elif tag_name == "details":
            # handle details and its content.
            self.parents[self.level + 1] = doc.add_group(
                name="details",
                label=GroupLabel.SECTION,
                parent=self.parents[self.level],
                content_layer=self.content_layer,
            )
            self.level += 1
            self._walk(tag, doc)
            self.parents[self.level + 1] = None
            self.level -= 1

    def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
        figure = img_tag.find_parent("figure")
        caption: str = ""
        if isinstance(figure, Tag):
            caption_tag = figure.find("figcaption", recursive=False)
            if isinstance(caption_tag, Tag):
                caption = caption_tag.get_text()
        if not caption:
            caption = str(img_tag.get("alt", "")).strip()

        caption_item: Optional[TextItem] = None
        if caption:
            caption_item = doc.add_text(
                DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
            )

        doc.add_picture(
            caption=caption_item,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )

    @staticmethod
    def get_text(item: PageElement) -> str:
        """Concatenate all child strings of a PageElement.

        This method is equivalent to `PageElement.get_text()` but also considers
        certain tags. When called on a <p> or <li> tags, it returns the text with a
        trailing space, otherwise the text is concatenated without separators.
        """

        def _extract_text_recursively(item: PageElement) -> list[str]:
            """Recursively extract text from all child nodes."""
            result: list[str] = []

            if isinstance(item, NavigableString):
                result = [item]
            elif isinstance(item, Tag):
                tag = cast(Tag, item)
                parts: list[str] = []
                for child in tag:
                    parts.extend(_extract_text_recursively(child))
                result.append(
                    "".join(parts) + " " if tag.name in {"p", "li"} else "".join(parts)
                )

            return result

        parts: list[str] = _extract_text_recursively(item)

        return "".join(parts)

    @staticmethod
    def _get_cell_spans(cell: Tag) -> tuple[int, int]:
        """Extract colspan and rowspan values from a table cell tag.

        This function retrieves the 'colspan' and 'rowspan' attributes from a given
        table cell tag.
        If the attribute does not exist or it is not numeric, it defaults to 1.
        """
        raw_spans: tuple[str, str] = (
            str(cell.get("colspan", "1")),
            str(cell.get("rowspan", "1")),
        )
        int_spans: tuple[int, int] = (
            int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
            int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
        )

        return int_spans

    @staticmethod
    def parse_table_data(element: Tag) -> Optional[TableData]:  # noqa: C901
        nested_tables = element.find("table")
        if nested_tables is not None:
            _log.debug("Skipping nested table.")
            return None

        # Find the number of rows and columns (taking into account spans)
        num_rows = 0
        num_cols = 0
        for row in element("tr"):
            col_count = 0
            is_row_header = True
            if not isinstance(row, Tag):
                continue
            for cell in row(["td", "th"]):
                if not isinstance(row, Tag):
                    continue
                cell_tag = cast(Tag, cell)
                col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
                col_count += col_span
                if cell_tag.name == "td" or row_span == 1:
                    is_row_header = False
            num_cols = max(num_cols, col_count)
            if not is_row_header:
                num_rows += 1

        _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")

        grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

        # Iterate over the rows in the table
        start_row_span = 0
        row_idx = -1
        for row in element("tr"):
            if not isinstance(row, Tag):
                continue

            # For each row, find all the column cells (both <td> and <th>)
            cells = row(["td", "th"])

            # Check if cell is in a column header or row header
            col_header = True
            row_header = True
            for html_cell in cells:
                if isinstance(html_cell, Tag):
                    _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                    if html_cell.name == "td":
                        col_header = False
                        row_header = False
                    elif row_span == 1:
                        row_header = False
            if not row_header:
                row_idx += 1
                start_row_span = 0
            else:
                start_row_span += 1

            # Extract the text content of each cell
            col_idx = 0
            for html_cell in cells:
                if not isinstance(html_cell, Tag):
                    continue

                # extract inline formulas
                for formula in html_cell("inline-formula"):
                    math_parts = formula.text.split("$$")
                    if len(math_parts) == 3:
                        math_formula = f"$${math_parts[1]}$$"
                        formula.replace_with(NavigableString(math_formula))

                # TODO: extract content correctly from table-cells with lists
                text = HTMLDocumentBackend.get_text(html_cell).strip()
                col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if row_header:
                    row_span -= 1
                while (
                    col_idx < num_cols
                    and grid[row_idx + start_row_span][col_idx] is not None
                ):
                    col_idx += 1
                for r in range(start_row_span, start_row_span + row_span):
                    for c in range(col_span):
                        if row_idx + r < num_rows and col_idx + c < num_cols:
                            grid[row_idx + r][col_idx + c] = text

                table_cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=start_row_span + row_idx,
                    end_row_offset_idx=start_row_span + row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    column_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
                data.table_cells.append(table_cell)

        return data
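For orientation, a minimal usage sketch (not part of the package diff) of how the rewritten HTML backend is typically exercised through docling's public converter API; the input file name is a placeholder:

from docling.document_converter import DocumentConverter

# HTML inputs are routed to HTMLDocumentBackend via the detected InputFormat.HTML.
converter = DocumentConverter()
result = converter.convert("example.html")  # placeholder path

# The backend produces a unified DoclingDocument, exportable e.g. to Markdown.
print(result.document.export_to_markdown())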
{docling-2.42.0 → docling-2.42.2}/docling/backend/msword_backend.py

@@ -1104,8 +1104,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     )
                     _log.debug(f" spanned before row {spanned_idx}")
 
+                # Detect equations in cell text
+                text, equations = self._handle_equations_in_text(
+                    element=cell._element, text=cell.text
+                )
+                if len(equations) == 0:
+                    text = cell.text
+                else:
+                    text = text.replace("<eq>", "$").replace("</eq>", "$")
+
                 table_cell = TableCell(
-                    text=cell.text,
+                    text=text,
                     row_span=spanned_idx - row_idx,
                     col_span=cell.grid_span,
                     start_row_offset_idx=row.grid_cols_before + row_idx,
{docling-2.42.0 → docling-2.42.2}/docling/backend/pdf_backend.py

@@ -57,7 +57,31 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
         if self.input_format is InputFormat.IMAGE:
             buf = BytesIO()
             img = Image.open(self.path_or_stream)
-            img.convert("RGB").save(buf, "PDF")
+
+            # Handle multi-page TIFF images
+            if hasattr(img, "n_frames") and img.n_frames > 1:
+                # Extract all frames from multi-page image
+                frames = []
+                try:
+                    for i in range(img.n_frames):
+                        img.seek(i)
+                        frame = img.copy().convert("RGB")
+                        frames.append(frame)
+                except EOFError:
+                    pass
+
+                # Save as multi-page PDF
+                if frames:
+                    frames[0].save(
+                        buf, "PDF", save_all=True, append_images=frames[1:]
+                    )
+                else:
+                    # Fallback to single page if frame extraction fails
+                    img.convert("RGB").save(buf, "PDF")
+            else:
+                # Single page image - convert to RGB and save
+                img.convert("RGB").save(buf, "PDF")
+
             buf.seek(0)
             self.path_or_stream = buf
         else:
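As a standalone illustration (not part of the package diff), the same Pillow technique the hunk above introduces, flattening a multi-frame TIFF into one in-memory PDF; the file name is a placeholder and the frame loop mirrors the diff:

from io import BytesIO

from PIL import Image

buf = BytesIO()
img = Image.open("scan.tiff")  # placeholder multi-page TIFF

frames = []
try:
    # Multi-frame images expose n_frames; iterate and normalize each frame to RGB.
    for i in range(getattr(img, "n_frames", 1)):
        img.seek(i)
        frames.append(img.copy().convert("RGB"))
except EOFError:
    pass  # stop gracefully if a frame cannot be read

if frames:
    # Pillow writes a multi-page PDF when save_all/append_images are provided.
    frames[0].save(buf, "PDF", save_all=True, append_images=frames[1:])
else:
    img.convert("RGB").save(buf, "PDF")
buf.seek(0)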
{docling-2.42.0 → docling-2.42.2}/docling/pipeline/base_pipeline.py

@@ -217,7 +217,13 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
         return conv_res
 
     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
-        status = ConversionStatus.SUCCESS
+        status = conv_res.status
+        if status in [
+            ConversionStatus.PENDING,
+            ConversionStatus.STARTED,
+        ]:  # preserves ConversionStatus.PARTIAL_SUCCESS
+            status = ConversionStatus.SUCCESS
+
         for page in conv_res.pages:
             if page._backend is None or not page._backend.is_valid():
                 conv_res.errors.append(
{docling-2.42.0 → docling-2.42.2}/docling/utils/layout_postprocessor.py

@@ -267,9 +267,14 @@ class LayoutPostprocessor:
         # Initial cell assignment
         clusters = self._assign_cells_to_clusters(clusters)
 
-        # Remove clusters with no cells (if keep_empty_clusters is False)
+        # Remove clusters with no cells (if keep_empty_clusters is False),
+        # but always keep clusters with label DocItemLabel.FORMULA
         if not self.options.keep_empty_clusters:
-            clusters = [cluster for cluster in clusters if cluster.cells]
+            clusters = [
+                cluster
+                for cluster in clusters
+                if cluster.cells or cluster.label == DocItemLabel.FORMULA
+            ]
 
         # Handle orphaned cells
         unassigned = self._find_unassigned_cells(clusters)