docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1502 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import warnings
|
|
6
|
+
from contextlib import contextmanager
|
|
7
|
+
from copy import deepcopy
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Final, Optional, Union, cast
|
|
11
|
+
from urllib.parse import urljoin, urlparse
|
|
12
|
+
|
|
13
|
+
import requests
|
|
14
|
+
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
|
15
|
+
from bs4.element import PreformattedString
|
|
16
|
+
from docling_core.types.doc import (
|
|
17
|
+
DocItem,
|
|
18
|
+
DocItemLabel,
|
|
19
|
+
DoclingDocument,
|
|
20
|
+
DocumentOrigin,
|
|
21
|
+
GroupItem,
|
|
22
|
+
GroupLabel,
|
|
23
|
+
PictureItem,
|
|
24
|
+
RefItem,
|
|
25
|
+
RichTableCell,
|
|
26
|
+
TableCell,
|
|
27
|
+
TableData,
|
|
28
|
+
TableItem,
|
|
29
|
+
TextItem,
|
|
30
|
+
)
|
|
31
|
+
from docling_core.types.doc.document import ContentLayer, Formatting, ImageRef, Script
|
|
32
|
+
from PIL import Image, UnidentifiedImageError
|
|
33
|
+
from pydantic import AnyUrl, BaseModel, ValidationError
|
|
34
|
+
from typing_extensions import override
|
|
35
|
+
|
|
36
|
+
from docling.backend.abstract_backend import (
|
|
37
|
+
DeclarativeDocumentBackend,
|
|
38
|
+
)
|
|
39
|
+
from docling.datamodel.backend_options import HTMLBackendOptions
|
|
40
|
+
from docling.datamodel.base_models import InputFormat
|
|
41
|
+
from docling.datamodel.document import InputDocument
|
|
42
|
+
from docling.exceptions import OperationNotAllowed
|
|
43
|
+
|
|
44
|
+
# Module-level logger for the HTML backend.
_log = logging.getLogger(__name__)

# Fallback pixel dimensions used when an <img> carries no usable size info.
DEFAULT_IMAGE_WIDTH = 128
DEFAULT_IMAGE_HEIGHT = 128

# Tags that initiate distinct Docling items
_BLOCK_TAGS: Final = {
    "address",
    "details",
    "figure",
    "footer",
    "img",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "ol",
    "p",
    "pre",
    "summary",
    "table",
    "ul",
}

# Block-level elements that should not appear inside <p>
_PARA_BREAKERS = {
    "address",
    "article",
    "aside",
    "blockquote",
    "div",
    "dl",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "header",
    "hr",
    "main",
    "nav",
    "ol",
    "ul",
    "li",
    "p",  # <p> inside <p> also forces closing
    "pre",
    "section",
    "table",
    "thead",
    "tbody",
    "tfoot",
    "tr",
    "td",
}

# Inline tags whose content is treated as code (monospace) text.
_CODE_TAG_SET: Final = {"code", "kbd", "samp"}

# Maps an inline formatting tag to the Formatting keyword arguments it implies.
# Code tags map to an empty dict: they toggle the separate "code" flag instead
# of contributing Formatting fields.
_FORMAT_TAG_MAP: Final = {
    "b": {"bold": True},
    "strong": {"bold": True},
    "i": {"italic": True},
    "em": {"italic": True},
    "var": {"italic": True},
    # "mark",
    # "small",
    "s": {"strikethrough": True},
    "del": {"strikethrough": True},
    "u": {"underline": True},
    "ins": {"underline": True},
    "sub": {"script": Script.SUB},
    "sup": {"script": Script.SUPER},
    **{k: {} for k in _CODE_TAG_SET},
}
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class _Context(BaseModel):
    """Mutable per-conversion parsing state, reset at the start of each convert."""

    # Whether each list group (keyed by its self-reference string) is ordered.
    list_ordered_flag_by_ref: dict[str, bool] = {}
    # Numbering start value for each ordered list group, keyed the same way.
    list_start_by_ref: dict[str, int] = {}
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class AnnotatedText(BaseModel):
    """A text fragment together with the inline annotations that apply to it."""

    # The raw text content of the fragment.
    text: str
    # Link target collected from an enclosing <a>, if any.
    hyperlink: Union[AnyUrl, Path, None] = None
    # Inline formatting (bold/italic/...) collected from enclosing tags, if any.
    formatting: Union[Formatting, None] = None
    # True when the fragment came from a code-like tag (<code>, <kbd>, <samp>).
    code: bool = False
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class AnnotatedTextList(list):
    """A list of AnnotatedText fragments with merge/split helpers."""

    def to_single_text_element(self) -> AnnotatedText:
        """Collapse all fragments into one AnnotatedText.

        Each fragment's text is stripped and joined with single spaces.
        The first non-None formatting and hyperlink win; later conflicting
        values are dropped with a warning. The code flag is sticky: once any
        fragment sets it, the merged result keeps it.
        """
        current_h = None
        current_text = ""
        current_f = None
        current_code = False
        for at in self:
            t = at.text
            h = at.hyperlink
            f = at.formatting
            c = at.code
            current_text += t.strip() + " "
            if f is not None and current_f is None:
                current_f = f
            elif f is not None and current_f is not None and f != current_f:
                _log.warning(
                    f"Clashing formatting: '{f}' and '{current_f}'! Chose '{current_f}'"
                )
            if h is not None and current_h is None:
                current_h = h
            elif h is not None and current_h is not None and h != current_h:
                _log.warning(
                    f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
                )
            # sticky: once a code fragment is seen, the merged element is code
            current_code = c if c else current_code

        return AnnotatedText(
            text=current_text.strip(),
            hyperlink=current_h,
            formatting=current_f,
            code=current_code,
        )

    def simplify_text_elements(self) -> "AnnotatedTextList":
        """Merge adjacent fragments that share hyperlink, formatting and code.

        Merged texts are joined with a single space, except when either side
        is blank (whitespace-only), in which case no separator is inserted.
        Fragments with differing annotations are kept as separate entries.
        Returns self unchanged when the list is empty.
        """
        simplified = AnnotatedTextList()
        if not self:
            return self
        text = self[0].text
        hyperlink = self[0].hyperlink
        formatting = self[0].formatting
        code = self[0].code
        last_elm = text
        for i in range(1, len(self)):
            if (
                hyperlink == self[i].hyperlink
                and formatting == self[i].formatting
                and code == self[i].code
            ):
                # same annotations: concatenate into the running fragment
                sep = " "
                if not self[i].text.strip() or not last_elm.strip():
                    sep = ""
                text += sep + self[i].text
                last_elm = self[i].text
            else:
                # annotation change: flush the running fragment
                simplified.append(
                    AnnotatedText(
                        text=text, hyperlink=hyperlink, formatting=formatting, code=code
                    )
                )
                text = self[i].text
                last_elm = text
                hyperlink = self[i].hyperlink
                formatting = self[i].formatting
                code = self[i].code
        # flush the trailing fragment (skipped when its text is empty)
        if text:
            simplified.append(
                AnnotatedText(
                    text=text, hyperlink=hyperlink, formatting=formatting, code=code
                )
            )
        return simplified

    def split_by_newline(self):
        """Split the fragments into sub-lists at newline characters.

        Returns a list of AnnotatedTextList groups. A fragment containing
        newlines is cut into pieces (annotations copied via deepcopy), and
        each piece closes the currently accumulating group.
        NOTE(review): the group is also closed after the fragment's last
        piece, so text following a multi-line fragment starts a new group —
        confirm this is the intended segmentation.
        """
        super_list = []
        active_annotated_text_list = AnnotatedTextList()
        for el in self:
            sub_texts = el.text.split("\n")
            if len(sub_texts) == 1:
                active_annotated_text_list.append(el)
            else:
                for text in sub_texts:
                    sub_el = deepcopy(el)
                    sub_el.text = text
                    active_annotated_text_list.append(sub_el)
                    super_list.append(active_annotated_text_list)
                    active_annotated_text_list = AnnotatedTextList()
        if active_annotated_text_list:
            super_list.append(active_annotated_text_list)
        return super_list
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
231
|
+
@override
|
|
232
|
+
def __init__(
|
|
233
|
+
self,
|
|
234
|
+
in_doc: InputDocument,
|
|
235
|
+
path_or_stream: Union[BytesIO, Path],
|
|
236
|
+
options: HTMLBackendOptions = HTMLBackendOptions(),
|
|
237
|
+
):
|
|
238
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
239
|
+
self.options: HTMLBackendOptions
|
|
240
|
+
self.soup: Optional[BeautifulSoup] = None
|
|
241
|
+
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
|
242
|
+
self.base_path: Optional[str] = str(options.source_uri)
|
|
243
|
+
|
|
244
|
+
# Initialize the parents for the hierarchy
|
|
245
|
+
self.max_levels = 10
|
|
246
|
+
self.level = 0
|
|
247
|
+
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
|
248
|
+
self.ctx = _Context()
|
|
249
|
+
for i in range(self.max_levels):
|
|
250
|
+
self.parents[i] = None
|
|
251
|
+
self.hyperlink: Union[AnyUrl, Path, None] = None
|
|
252
|
+
self.format_tags: list[str] = []
|
|
253
|
+
|
|
254
|
+
try:
|
|
255
|
+
raw = (
|
|
256
|
+
path_or_stream.getvalue()
|
|
257
|
+
if isinstance(path_or_stream, BytesIO)
|
|
258
|
+
else Path(path_or_stream).read_bytes()
|
|
259
|
+
)
|
|
260
|
+
self.soup = BeautifulSoup(raw, "html.parser")
|
|
261
|
+
except Exception as e:
|
|
262
|
+
raise RuntimeError(
|
|
263
|
+
"Could not initialize HTML backend for file with "
|
|
264
|
+
f"hash {self.document_hash}."
|
|
265
|
+
) from e
|
|
266
|
+
|
|
267
|
+
@override
|
|
268
|
+
def is_valid(self) -> bool:
|
|
269
|
+
return self.soup is not None
|
|
270
|
+
|
|
271
|
+
@classmethod
|
|
272
|
+
@override
|
|
273
|
+
def supports_pagination(cls) -> bool:
|
|
274
|
+
return False
|
|
275
|
+
|
|
276
|
+
@override
|
|
277
|
+
def unload(self):
|
|
278
|
+
if isinstance(self.path_or_stream, BytesIO):
|
|
279
|
+
self.path_or_stream.close()
|
|
280
|
+
self.path_or_stream = None
|
|
281
|
+
|
|
282
|
+
@classmethod
|
|
283
|
+
@override
|
|
284
|
+
def supported_formats(cls) -> set[InputFormat]:
|
|
285
|
+
return {InputFormat.HTML}
|
|
286
|
+
|
|
287
|
+
    @override
    def convert(self) -> DoclingDocument:
        """Convert the parsed HTML tree into a DoclingDocument.

        The soup is mutated in place before walking: script/style/hidden tags
        are removed, invalid <p> nesting is repaired, and <br> is normalized
        to a newline text node.

        Returns:
            The populated DoclingDocument.

        Raises:
            RuntimeError: If the backend holds no parsed soup.
        """
        _log.debug("Starting HTML conversion...")
        if not self.is_valid():
            raise RuntimeError("Invalid HTML document.")

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/html",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        assert self.soup is not None
        # set the title as furniture, since it is part of the document metadata
        title = self.soup.title
        if title and self.options.add_title:
            title_text = title.get_text(separator=" ", strip=True)
            title_clean = HTMLDocumentBackend._clean_unicode(title_text)
            doc.add_title(
                text=title_clean,
                orig=title_text,
                content_layer=ContentLayer.FURNITURE,
            )
        # remove script and style tags
        for tag in self.soup(["script", "noscript", "style"]):
            tag.decompose()
        # remove any hidden tag
        for tag in self.soup(hidden=True):
            tag.decompose()
        # fix flow content that is not permitted inside <p>
        HTMLDocumentBackend._fix_invalid_paragraph_structure(self.soup)

        # prefer <body>; fall back to the whole soup when the page has none
        content = self.soup.body or self.soup
        # normalize <br> tags
        for br in content("br"):
            br.replace_with(NavigableString("\n"))
        # set default content layer

        # Furniture before the first heading rule, except for headers in tables
        header = None
        # Find all headers first
        all_headers = content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
        # Keep only those that do NOT have a <table> in a parent chain
        clean_headers = [h for h in all_headers if not h.find_parent("table")]
        # Pick the first header from the remaining
        if len(clean_headers):
            header = clean_headers[0]
        # Set starting content layer: everything before the first (non-table)
        # heading is treated as furniture when infer_furniture is enabled.
        self.content_layer = (
            ContentLayer.BODY
            if (not self.options.infer_furniture) or (header is None)
            else ContentLayer.FURNITURE
        )
        # reset context
        self.ctx = _Context()
        self._walk(content, doc)
        return doc
|
|
345
|
+
|
|
346
|
+
    @staticmethod
    def _fix_invalid_paragraph_structure(soup: BeautifulSoup) -> None:
        """Rewrite <p> elements that contain block-level breakers.

        This function emulates browser logic when other block-level elements
        are found inside a <p> element.
        When a <p> is open and a block-level breaker (e.g., h1-h6, div, table)
        appears, automatically close the <p>, emit it, then emit the breaker,
        and if needed open a new <p> for trailing text.

        Args:
            soup: The HTML document. The DOM may be rewritten.
        """

        # Opens a fresh <p> accumulator if none is currently open.
        def _start_para():
            nonlocal current_p
            if current_p is None:
                current_p = soup.new_tag("p")
                new_nodes.append(current_p)

        # Drops the open <p> if it accumulated no visible text.
        def _flush_para_if_empty():
            nonlocal current_p
            if current_p is not None and not current_p.get_text(strip=True):
                # remove empty paragraph placeholder
                if current_p in new_nodes:
                    new_nodes.remove(current_p)
                current_p = None

        # CSS :has() selects only <p> elements that actually contain a breaker.
        paragraphs = soup.select(f"p:has({','.join(tag for tag in _PARA_BREAKERS)})")

        for p in paragraphs:
            parent = p.parent
            if parent is None:
                continue

            # Replacement sequence for this <p>, built node by node.
            new_nodes = []
            current_p = None

            # list() snapshots contents: nodes are extracted while iterating.
            for node in list(p.contents):
                if isinstance(node, NavigableString):
                    text = str(node)
                    node.extract()
                    if text.strip():
                        _start_para()
                        if current_p is not None:
                            current_p.append(NavigableString(text))
                    # skip whitespace-only text
                    continue

                if isinstance(node, Tag):
                    node.extract()

                    if node.name in _PARA_BREAKERS:
                        # breaker closes the open paragraph and is emitted as-is
                        _flush_para_if_empty()
                        new_nodes.append(node)
                        continue
                    else:
                        # inline content continues (or reopens) the paragraph
                        _start_para()
                        if current_p is not None:
                            current_p.append(node)
                        continue

            _flush_para_if_empty()

            siblings = list(parent.children)
            try:
                idx = siblings.index(p)
            except ValueError:
                # p might have been removed
                continue

            p.extract()
            # insert in reverse so the sequence ends up in original order
            for n in reversed(new_nodes):
                parent.insert(idx, n)
|
|
420
|
+
|
|
421
|
+
@staticmethod
|
|
422
|
+
def _is_remote_url(value: str) -> bool:
|
|
423
|
+
parsed = urlparse(value)
|
|
424
|
+
return parsed.scheme in {"http", "https", "ftp", "s3", "gs"}
|
|
425
|
+
|
|
426
|
+
def _resolve_relative_path(self, loc: str) -> str:
|
|
427
|
+
abs_loc = loc
|
|
428
|
+
|
|
429
|
+
if self.base_path:
|
|
430
|
+
if loc.startswith("//"):
|
|
431
|
+
# Protocol-relative URL - default to https
|
|
432
|
+
abs_loc = "https:" + loc
|
|
433
|
+
elif not loc.startswith(("http://", "https://", "data:", "file://")):
|
|
434
|
+
if HTMLDocumentBackend._is_remote_url(self.base_path): # remote fetch
|
|
435
|
+
abs_loc = urljoin(self.base_path, loc)
|
|
436
|
+
elif self.base_path: # local fetch
|
|
437
|
+
# For local files, resolve relative to the HTML file location
|
|
438
|
+
abs_loc = str(Path(self.base_path).parent / loc)
|
|
439
|
+
|
|
440
|
+
_log.debug(f"Resolved location {loc} to {abs_loc}")
|
|
441
|
+
return abs_loc
|
|
442
|
+
|
|
443
|
+
    @staticmethod
    def group_cell_elements(
        group_name: str,
        doc: DoclingDocument,
        provs_in_cell: list[RefItem],
        docling_table: TableItem,
    ) -> RefItem:
        """Wrap a cell's item refs in a new group parented to the table.

        Each item is re-parented: it is appended to the new group's children,
        removed from its previous parent's child list, and its parent ref is
        pointed at the group.

        Args:
            group_name: Name for the new group node.
            doc: The document whose item graph is mutated.
            provs_in_cell: Refs of the items produced while walking the cell.
            docling_table: The table item that becomes the group's parent.

        Returns:
            The ref of the newly created group (used by RichTableCell).
        """
        group_element = doc.add_group(
            label=GroupLabel.UNSPECIFIED,
            name=group_name,
            parent=docling_table,
        )
        for prov in provs_in_cell:
            group_element.children.append(prov)
            pr_item = prov.resolve(doc)
            item_parent = pr_item.parent.resolve(doc)
            # detach from the old parent so the item is not referenced twice
            if pr_item.get_ref() in item_parent.children:
                item_parent.children.remove(pr_item.get_ref())
            pr_item.parent = group_element.get_ref()
        ref_for_rich_cell = group_element.get_ref()
        return ref_for_rich_cell
|
|
464
|
+
|
|
465
|
+
@staticmethod
|
|
466
|
+
def process_rich_table_cells(
|
|
467
|
+
provs_in_cell: list[RefItem],
|
|
468
|
+
group_name: str,
|
|
469
|
+
doc: DoclingDocument,
|
|
470
|
+
docling_table: TableItem,
|
|
471
|
+
) -> tuple[bool, Union[RefItem, None]]:
|
|
472
|
+
rich_table_cell = False
|
|
473
|
+
ref_for_rich_cell = None
|
|
474
|
+
if len(provs_in_cell) >= 1:
|
|
475
|
+
# Cell rich cell has multiple elements, we need to group them
|
|
476
|
+
rich_table_cell = True
|
|
477
|
+
ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
|
|
478
|
+
group_name, doc, provs_in_cell, docling_table
|
|
479
|
+
)
|
|
480
|
+
|
|
481
|
+
return rich_table_cell, ref_for_rich_cell
|
|
482
|
+
|
|
483
|
+
def _is_rich_table_cell(self, table_cell: Tag) -> bool:
|
|
484
|
+
"""Determine whether an table cell should be parsed as a Docling RichTableCell.
|
|
485
|
+
|
|
486
|
+
A table cell can hold rich content and be parsed with a Docling RichTableCell.
|
|
487
|
+
However, this requires walking through the content elements and creating
|
|
488
|
+
Docling node items. If the cell holds only plain text, the parsing is simpler
|
|
489
|
+
and using a TableCell is prefered.
|
|
490
|
+
|
|
491
|
+
Args:
|
|
492
|
+
table_cell: The HTML tag representing a table cell.
|
|
493
|
+
|
|
494
|
+
Returns:
|
|
495
|
+
Whether the cell should be parsed as RichTableCell.
|
|
496
|
+
"""
|
|
497
|
+
is_rich: bool = True
|
|
498
|
+
|
|
499
|
+
children = table_cell.find_all(recursive=True) # all descendants of type Tag
|
|
500
|
+
if not children:
|
|
501
|
+
content = [
|
|
502
|
+
item
|
|
503
|
+
for item in table_cell.contents
|
|
504
|
+
if isinstance(item, NavigableString)
|
|
505
|
+
]
|
|
506
|
+
is_rich = len(content) > 1
|
|
507
|
+
else:
|
|
508
|
+
annotations = self._extract_text_and_hyperlink_recursively(
|
|
509
|
+
table_cell, find_parent_annotation=True
|
|
510
|
+
)
|
|
511
|
+
if not annotations:
|
|
512
|
+
is_rich = bool(item for item in children if item.name == "img")
|
|
513
|
+
elif len(annotations) == 1:
|
|
514
|
+
anno: AnnotatedText = annotations[0]
|
|
515
|
+
is_rich = bool(anno.formatting) or bool(anno.hyperlink) or anno.code
|
|
516
|
+
|
|
517
|
+
return is_rich
|
|
518
|
+
|
|
519
|
+
    def parse_table_data(
        self,
        element: Tag,
        doc: DoclingDocument,
        docling_table: TableItem,
        num_rows: int,
        num_cols: int,
    ) -> Optional[TableData]:
        """Parse an HTML <table> element and populate the Docling table item.

        Walks the table's rows and cells, tracking row/column spans with an
        occupancy grid, and adds a TableCell (plain text) or RichTableCell
        (formatted/linked/nested content) per cell via doc.add_table_cell.

        Args:
            element: The <table> tag (thead/tbody wrappers are unwrapped).
            doc: Document receiving the cells (mutated).
            docling_table: The table item the cells are attached to.
            num_rows: Pre-computed total row count of the table.
            num_cols: Pre-computed total column count of the table.

        Returns:
            A TableData with the given dimensions.
            NOTE(review): the returned TableData's table_cells stays empty —
            cells are attached through doc.add_table_cell side effects only;
            confirm callers do not read the return value's cells.
        """
        # flatten thead/tbody so all <tr> become direct children
        for t in cast(list[Tag], element.find_all(["thead", "tbody"], recursive=False)):
            t.unwrap()

        _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
        # occupancy grid: marks positions already claimed by spanning cells
        grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

        # Iterate over the rows in the table
        start_row_span = 0
        row_idx = -1

        # We don't want this recursive to support nested tables
        for row in element("tr", recursive=False):
            if not isinstance(row, Tag):
                continue
            # For each row, find all the column cells (both <td> and <th>)
            # We don't want this recursive to support nested tables
            cells = row(["td", "th"], recursive=False)
            # Check if cell is in a column header or row header
            col_header = True
            row_header = True
            for html_cell in cells:
                if isinstance(html_cell, Tag):
                    _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                    if html_cell.name == "td":
                        # any <td> means this row is neither kind of header
                        col_header = False
                        row_header = False
                    elif row_span == 1:
                        row_header = False
            if not row_header:
                row_idx += 1
                start_row_span = 0
            else:
                # header rows with spanning <th> cells share a logical row
                start_row_span += 1

            # Extract the text content of each cell
            col_idx = 0
            for html_cell in cells:
                if not isinstance(html_cell, Tag):
                    continue

                # extract inline formulas
                for formula in html_cell("inline-formula"):
                    math_parts = formula.text.split("$$")
                    if len(math_parts) == 3:
                        math_formula = f"$${math_parts[1]}$$"
                        formula.replace_with(NavigableString(math_formula))

                provs_in_cell: list[RefItem] = []
                rich_table_cell = self._is_rich_table_cell(html_cell)
                if rich_table_cell:
                    # Parse table cell sub-tree for Rich Cells content:
                    with self._use_table_cell_context():
                        provs_in_cell = self._walk(html_cell, doc)

                    group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
                    # may downgrade to a plain cell when the walk produced nothing
                    rich_table_cell, ref_for_rich_cell = (
                        HTMLDocumentBackend.process_rich_table_cells(
                            provs_in_cell, group_name, doc, docling_table
                        )
                    )

                # Extracting text
                text = HTMLDocumentBackend._clean_unicode(
                    self.get_text(html_cell).strip()
                )
                col_span, row_span = self._get_cell_spans(html_cell)
                if row_header:
                    row_span -= 1
                # skip grid positions claimed by earlier spanning cells
                while (
                    col_idx < num_cols
                    and grid[row_idx + start_row_span][col_idx] is not None
                ):
                    col_idx += 1
                # claim this cell's spanned region in the grid
                for r in range(start_row_span, start_row_span + row_span):
                    for c in range(col_span):
                        if row_idx + r < num_rows and col_idx + c < num_cols:
                            grid[row_idx + r][col_idx + c] = text

                if rich_table_cell:
                    rich_cell = RichTableCell(
                        text=text,
                        row_span=row_span,
                        col_span=col_span,
                        start_row_offset_idx=start_row_span + row_idx,
                        end_row_offset_idx=start_row_span + row_idx + row_span,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + col_span,
                        column_header=col_header,
                        row_header=((not col_header) and html_cell.name == "th"),
                        ref=ref_for_rich_cell,  # points to an artificial group around children
                    )
                    doc.add_table_cell(table_item=docling_table, cell=rich_cell)
                else:
                    simple_cell = TableCell(
                        text=text,
                        row_span=row_span,
                        col_span=col_span,
                        start_row_offset_idx=start_row_span + row_idx,
                        end_row_offset_idx=start_row_span + row_idx + row_span,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + col_span,
                        column_header=col_header,
                        row_header=((not col_header) and html_cell.name == "th"),
                    )
                    doc.add_table_cell(table_item=docling_table, cell=simple_cell)
        return data
|
|
634
|
+
|
|
635
|
+
    def _walk(self, element: Tag, doc: DoclingDocument) -> list[RefItem]:
        """Parse an XML tag by recursively walking its content.

        While walking, the method buffers inline text across tags like <b> or <span>,
        emitting text nodes only at block boundaries.

        Args:
            element: The XML tag to parse.
            doc: The Docling document to be updated with the parsed content.

        Returns:
            References to all items added to the document during the walk.
        """
        added_refs: list[RefItem] = []
        # Accumulates inline annotated text until a block boundary forces a flush.
        buffer: AnnotatedTextList = AnnotatedTextList()

        def _flush_buffer() -> None:
            # Turn the buffered inline runs into text/code items on the document.
            if not buffer:
                return
            annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
            parts = annotated_text_list.split_by_newline()
            buffer.clear()

            # Nothing but whitespace/empty segments: emit nothing.
            if not "".join([el.text for el in annotated_text_list]):
                return

            for annotated_text_list in parts:
                # Multiple annotated runs on one line are grouped inline.
                with self._use_inline_group(annotated_text_list, doc):
                    for annotated_text in annotated_text_list:
                        if annotated_text.text.strip():
                            seg_clean = HTMLDocumentBackend._clean_unicode(
                                annotated_text.text.strip()
                            )
                            if annotated_text.code:
                                docling_code2 = doc.add_code(
                                    parent=self.parents[self.level],
                                    text=seg_clean,
                                    content_layer=self.content_layer,
                                    formatting=annotated_text.formatting,
                                    hyperlink=annotated_text.hyperlink,
                                )
                                added_refs.append(docling_code2.get_ref())
                            else:
                                docling_text2 = doc.add_text(
                                    parent=self.parents[self.level],
                                    label=DocItemLabel.TEXT,
                                    text=seg_clean,
                                    content_layer=self.content_layer,
                                    formatting=annotated_text.formatting,
                                    hyperlink=annotated_text.hyperlink,
                                )
                                added_refs.append(docling_text2.get_ref())

        for node in element.contents:
            if isinstance(node, Tag):
                name = node.name.lower()
                if name == "img":
                    # Images are block-like: flush pending inline text first.
                    _flush_buffer()
                    im_ref3 = self._emit_image(node, doc)
                    if im_ref3:
                        added_refs.append(im_ref3)
                elif name in _FORMAT_TAG_MAP:
                    _flush_buffer()
                    with self._use_format([name]):
                        wk = self._walk(node, doc)
                        added_refs.extend(wk)
                elif name == "a":
                    # NOTE(review): unlike the other branches, anchors do not
                    # flush the buffer before recursing — presumably so link
                    # text stays in the surrounding inline flow; confirm.
                    with self._use_hyperlink(node):
                        wk2 = self._walk(node, doc)
                        added_refs.extend(wk2)
                elif name in _BLOCK_TAGS:
                    _flush_buffer()
                    blk = self._handle_block(node, doc)
                    added_refs.extend(blk)
                elif node.find(_BLOCK_TAGS):
                    # Inline wrapper that contains block tags: descend into it.
                    _flush_buffer()
                    wk3 = self._walk(node, doc)
                    added_refs.extend(wk3)
                else:
                    # Pure inline content: buffer it for a later flush.
                    buffer.extend(
                        self._extract_text_and_hyperlink_recursively(
                            node, find_parent_annotation=True, keep_newlines=True
                        )
                    )
            elif isinstance(node, NavigableString) and not isinstance(
                node, PreformattedString
            ):
                # A string that is only newlines acts as a soft block boundary.
                if str(node).strip("\n\r") == "":
                    _flush_buffer()
                else:
                    buffer.extend(
                        self._extract_text_and_hyperlink_recursively(
                            node, find_parent_annotation=True, keep_newlines=True
                        )
                    )

        _flush_buffer()
        return added_refs
|
|
730
|
+
|
|
731
|
+
@staticmethod
|
|
732
|
+
def _collect_parent_format_tags(item: PageElement) -> list[str]:
|
|
733
|
+
tags = []
|
|
734
|
+
for format_tag in _FORMAT_TAG_MAP:
|
|
735
|
+
this_parent = item.parent
|
|
736
|
+
while this_parent is not None:
|
|
737
|
+
if this_parent.name == format_tag:
|
|
738
|
+
tags.append(format_tag)
|
|
739
|
+
break
|
|
740
|
+
this_parent = this_parent.parent
|
|
741
|
+
return tags
|
|
742
|
+
|
|
743
|
+
@property
|
|
744
|
+
def _formatting(self):
|
|
745
|
+
kwargs = {}
|
|
746
|
+
for t in self.format_tags:
|
|
747
|
+
kwargs.update(_FORMAT_TAG_MAP[t])
|
|
748
|
+
if not kwargs:
|
|
749
|
+
return None
|
|
750
|
+
return Formatting(**kwargs)
|
|
751
|
+
|
|
752
|
+
    def _extract_text_and_hyperlink_recursively(
        self,
        item: PageElement,
        ignore_list=False,
        find_parent_annotation=False,
        keep_newlines=False,
    ) -> AnnotatedTextList:
        """Collect annotated text runs (text + formatting + hyperlink) under *item*.

        Args:
            item: The element or string node to extract from.
            ignore_list: If True, do not descend into <ul>/<ol> children.
            find_parent_annotation: If True, first inherit any <a> or formatting
                tag already present among *item*'s DOM ancestors.
            keep_newlines: If True, whitespace-only strings containing newlines
                are preserved as "\n" runs instead of being dropped.

        Returns:
            The collected annotated text runs, possibly empty.
        """
        result: AnnotatedTextList = AnnotatedTextList()

        # If find_parent_annotation, make sure that we keep track of
        # any a- or formatting-tag that has been present in the
        # DOM-parents already.
        if find_parent_annotation:
            format_tags = self._collect_parent_format_tags(item)
            this_parent = item.parent
            while this_parent is not None:
                if this_parent.name == "a" and this_parent.get("href"):
                    # Re-enter with the inherited annotations active.
                    # NOTE(review): this recursion does not forward
                    # keep_newlines — confirm whether that is intentional.
                    with self._use_format(format_tags):
                        with self._use_hyperlink(this_parent):
                            return self._extract_text_and_hyperlink_recursively(
                                item, ignore_list
                            )
                this_parent = this_parent.parent

        # Pre-formatted markup strings (comments, doctype, ...) carry no text.
        if isinstance(item, PreformattedString):
            return AnnotatedTextList()

        if isinstance(item, NavigableString):
            text = item.strip()
            # Active <code>-like tag in the stack marks the run as code.
            code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
            if text:
                return AnnotatedTextList(
                    [
                        AnnotatedText(
                            text=text,
                            hyperlink=self.hyperlink,
                            formatting=self._formatting,
                            code=code,
                        )
                    ]
                )
            if keep_newlines and item.strip("\n\r") == "":
                # Whitespace that contains a newline becomes an explicit break.
                return AnnotatedTextList(
                    [
                        AnnotatedText(
                            text="\n",
                            hyperlink=self.hyperlink,
                            formatting=self._formatting,
                            code=code,
                        )
                    ]
                )
            return AnnotatedTextList()

        tag = cast(Tag, item)
        if not ignore_list or (tag.name not in ["ul", "ol"]):
            for child in tag:
                if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
                    # Push the child's formatting for the duration of the descent.
                    with self._use_format([child.name]):
                        result.extend(
                            self._extract_text_and_hyperlink_recursively(
                                child, ignore_list, keep_newlines=keep_newlines
                            )
                        )
                elif isinstance(child, Tag) and child.name == "a":
                    with self._use_hyperlink(child):
                        result.extend(
                            self._extract_text_and_hyperlink_recursively(
                                child, ignore_list, keep_newlines=keep_newlines
                            )
                        )
                else:
                    # Recursively get the child's text content
                    result.extend(
                        self._extract_text_and_hyperlink_recursively(
                            child, ignore_list, keep_newlines=keep_newlines
                        )
                    )
        return result
|
|
831
|
+
|
|
832
|
+
    @contextmanager
    def _use_hyperlink(self, tag: Tag):
        """Temporarily set ``self.hyperlink`` from an anchor tag's href.

        If *tag* has no ``href`` the context is a no-op. Otherwise the href is
        resolved relative to the document and parsed as ``AnyUrl`` when
        possible, falling back to ``Path`` for relative links; the previous
        hyperlink is restored on exit.
        """
        old_hyperlink: Union[AnyUrl, Path, None] = None
        new_hyperlink: Union[AnyUrl, Path, None] = None
        this_href = tag.get("href")
        if this_href is None:
            yield None
        else:
            if isinstance(this_href, str) and this_href:
                old_hyperlink = self.hyperlink
                this_href = self._resolve_relative_path(this_href)
                # ugly fix for relative links since pydantic does not support them.
                try:
                    new_hyperlink = AnyUrl(this_href)
                except ValidationError:
                    new_hyperlink = Path(this_href)
                self.hyperlink = new_hyperlink
            try:
                yield None
            finally:
                # Restore only if we actually installed a new hyperlink;
                # empty/list-valued hrefs leave self.hyperlink untouched.
                if new_hyperlink:
                    self.hyperlink = old_hyperlink
|
|
854
|
+
|
|
855
|
+
@contextmanager
|
|
856
|
+
def _use_format(self, tags: list[str]):
|
|
857
|
+
if not tags:
|
|
858
|
+
yield None
|
|
859
|
+
else:
|
|
860
|
+
self.format_tags.extend(tags)
|
|
861
|
+
try:
|
|
862
|
+
yield None
|
|
863
|
+
finally:
|
|
864
|
+
self.format_tags = self.format_tags[: -len(tags)]
|
|
865
|
+
|
|
866
|
+
    @contextmanager
    def _use_inline_group(
        self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
    ):
        """Create an inline group for annotated texts.

        Checks if annotated_text_list has more than one item and if so creates an inline
        group in which the text elements can then be generated. While the context manager
        is active the inline group is set as the current parent.

        Args:
            annotated_text_list (AnnotatedTextList): Annotated text
            doc (DoclingDocument): Currently used document
        """
        if len(annotated_text_list) > 1:
            # Several runs with differing annotations: wrap them so they render
            # as one inline sequence.
            inline_fmt = doc.add_group(
                label=GroupLabel.INLINE,
                parent=self.parents[self.level],
                content_layer=self.content_layer,
            )
            self.parents[self.level + 1] = inline_fmt
            self.level += 1
            try:
                yield None
            finally:
                # The inline group stops being the current parent on exit.
                self.parents[self.level] = None
                self.level -= 1
        else:
            # Zero or one run needs no grouping.
            yield None
|
|
895
|
+
|
|
896
|
+
    @contextmanager
    def _use_details(self, tag: Tag, doc: DoclingDocument):
        """Create a group with the content of a details tag.

        While the context manager is active, the hierarchy level is set one
        level higher as the current parent.

        Args:
            tag: The details tag.
            doc: Currently used document.
        """
        self.parents[self.level + 1] = doc.add_group(
            name=tag.name,
            label=GroupLabel.SECTION,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )
        self.level += 1
        try:
            yield None
        finally:
            # NOTE(review): this clears the slot ABOVE the group (the group
            # itself remains at parents[self.level] until overwritten), unlike
            # _use_inline_group which clears parents[self.level] — confirm the
            # asymmetry is intentional.
            self.parents[self.level + 1] = None
            self.level -= 1
|
|
919
|
+
|
|
920
|
+
    @contextmanager
    def _use_footer(self, tag: Tag, doc: DoclingDocument):
        """Create a group with a footer.

        Create a group with the content of a footer tag. While the context manager
        is active, the hierarchy level is set one level higher as the current
        parent and the content layer is switched to FURNITURE so footer content
        is not treated as document body.

        Args:
            tag: The footer tag.
            doc: Currently used document.
        """
        current_layer = self.content_layer
        # Footers are furniture, not body content.
        self.content_layer = ContentLayer.FURNITURE
        self.parents[self.level + 1] = doc.add_group(
            name=tag.name,
            label=GroupLabel.SECTION,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )
        self.level += 1
        try:
            yield None
        finally:
            self.parents[self.level + 1] = None
            self.level -= 1
            # Restore whatever layer was active before the footer.
            self.content_layer = current_layer
|
|
946
|
+
|
|
947
|
+
@contextmanager
|
|
948
|
+
def _use_table_cell_context(self):
|
|
949
|
+
"""Preserve the hierarchy level and parents during table cell processing.
|
|
950
|
+
|
|
951
|
+
While the context manager is active, the hierarchy level and parents can be modified.
|
|
952
|
+
When exiting, the original level and parents are restored.
|
|
953
|
+
"""
|
|
954
|
+
original_level = self.level
|
|
955
|
+
original_parents = self.parents.copy()
|
|
956
|
+
try:
|
|
957
|
+
yield
|
|
958
|
+
finally:
|
|
959
|
+
self.level = original_level
|
|
960
|
+
self.parents = original_parents
|
|
961
|
+
|
|
962
|
+
    def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
        """Add an <h1>..<h6> tag as a title or heading item.

        An <h1> resets the parent hierarchy and becomes the document title;
        other levels open/close section groups so the heading nests at the
        correct depth.

        Args:
            tag: The heading tag.
            doc: The document being built.

        Returns:
            References to the items added (heading/title plus any images).
        """
        added_ref = []
        tag_name = tag.name.lower()
        # set default content layer to BODY as soon as we encounter a heading
        self.content_layer = ContentLayer.BODY
        # "h3" -> 3; valid for h1..h6 only.
        level = int(tag_name[1])
        annotated_text_list = self._extract_text_and_hyperlink_recursively(
            tag, find_parent_annotation=True
        )
        annotated_text = annotated_text_list.to_single_text_element()
        text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
        # the first level is for the title item
        if level == 1:
            # A new title invalidates the whole previous hierarchy.
            for key in self.parents.keys():
                self.parents[key] = None
            self.level = 0
            self.parents[self.level + 1] = doc.add_title(
                text_clean,
                content_layer=self.content_layer,
                formatting=annotated_text.formatting,
                hyperlink=annotated_text.hyperlink,
            )
            p1 = self.parents[self.level + 1]
            if p1 is not None:
                added_ref = [p1.get_ref()]
        # the other levels need to be lowered by 1 if a title was set
        else:
            level -= 1
            if level > self.level:
                # add invisible group
                for i in range(self.level, level):
                    _log.debug(f"Adding invisible group to level {i}")
                    self.parents[i + 1] = doc.add_group(
                        name=f"header-{i + 1}",
                        label=GroupLabel.SECTION,
                        parent=self.parents[i],
                        content_layer=self.content_layer,
                    )
                self.level = level
            elif level < self.level:
                # remove the tail
                for key in self.parents.keys():
                    if key > level + 1:
                        _log.debug(f"Remove the tail of level {key}")
                        self.parents[key] = None
                self.level = level
            self.parents[self.level + 1] = doc.add_heading(
                parent=self.parents[self.level],
                text=text_clean,
                orig=annotated_text.text,
                level=self.level,
                content_layer=self.content_layer,
                formatting=annotated_text.formatting,
                hyperlink=annotated_text.hyperlink,
            )
            p2 = self.parents[self.level + 1]
            if p2 is not None:
                added_ref = [p2.get_ref()]
        # The new title/heading becomes the current parent.
        self.level += 1
        # Images nested inside the heading are emitted after the heading item.
        for img_tag in tag("img"):
            if isinstance(img_tag, Tag):
                im_ref = self._emit_image(img_tag, doc)
                if im_ref:
                    added_ref.append(im_ref)
        return added_ref
|
|
1027
|
+
|
|
1028
|
+
    def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
        """Add a <ul>/<ol> tag as a list group with its items.

        Handles ordered-list ``start`` offsets, multi-run items (wrapped in an
        inline group), nested sub-lists, and images inside items.

        Args:
            tag: The list tag.
            doc: The document being built.

        Returns:
            Reference to the created list group.
        """
        tag_name = tag.name.lower()
        start: Optional[int] = None
        name: str = ""
        is_ordered = tag_name == "ol"
        if is_ordered:
            start_attr = tag.get("start")
            if isinstance(start_attr, str) and start_attr.isnumeric():
                start = int(start_attr)
            name = "ordered list" + (f" start {start}" if start is not None else "")
        else:
            name = "list"
        # Create the list container
        list_group = doc.add_list_group(
            name=name,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )
        self.parents[self.level + 1] = list_group
        self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
        if is_ordered and start is not None:
            self.ctx.list_start_by_ref[list_group.self_ref] = start
        self.level += 1

        # For each top-level <li> in this list
        for li in tag.find_all({"li", "ul", "ol"}, recursive=False):
            if not isinstance(li, Tag):
                continue

            # sub-list items should be indented under main list items, but temporarily
            # addressing invalid HTML (docling-core/issues/357)
            if li.name in {"ul", "ol"}:
                self._handle_block(li, doc)

            else:
                # 1) determine the marker
                if is_ordered and start is not None:
                    # Marker derives from how many items the group already holds.
                    marker = f"{start + len(list_group.children)}."
                else:
                    marker = ""

                # 2) extract only the "direct" text from this <li>
                parts = self._extract_text_and_hyperlink_recursively(
                    li, ignore_list=True, find_parent_annotation=True
                )
                min_parts = parts.simplify_text_elements()
                li_text = re.sub(
                    r"\s+|\n+", " ", "".join([el.text for el in min_parts])
                ).strip()

                # 3) add the list item
                if li_text:
                    if len(min_parts) > 1:
                        # create an empty list element in order to hook the inline group onto that one
                        self.parents[self.level + 1] = doc.add_list_item(
                            text="",
                            enumerated=is_ordered,
                            marker=marker,
                            parent=list_group,
                            content_layer=self.content_layer,
                        )
                        self.level += 1
                        with self._use_inline_group(min_parts, doc):
                            for annotated_text in min_parts:
                                li_text = re.sub(
                                    r"\s+|\n+", " ", annotated_text.text
                                ).strip()
                                li_clean = HTMLDocumentBackend._clean_unicode(li_text)
                                if annotated_text.code:
                                    doc.add_code(
                                        parent=self.parents[self.level],
                                        text=li_clean,
                                        content_layer=self.content_layer,
                                        formatting=annotated_text.formatting,
                                        hyperlink=annotated_text.hyperlink,
                                    )
                                else:
                                    doc.add_text(
                                        parent=self.parents[self.level],
                                        label=DocItemLabel.TEXT,
                                        text=li_clean,
                                        content_layer=self.content_layer,
                                        formatting=annotated_text.formatting,
                                        hyperlink=annotated_text.hyperlink,
                                    )

                        # 4) recurse into any nested lists, attaching them to this <li> item
                        for sublist in li({"ul", "ol"}, recursive=False):
                            if isinstance(sublist, Tag):
                                self._handle_block(sublist, doc)

                        # now the list element with inline group is not a parent anymore
                        self.parents[self.level] = None
                        self.level -= 1
                    else:
                        # Single annotated run: the list item carries the text itself.
                        annotated_text = min_parts[0]
                        li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
                        li_clean = HTMLDocumentBackend._clean_unicode(li_text)
                        self.parents[self.level + 1] = doc.add_list_item(
                            text=li_clean,
                            enumerated=is_ordered,
                            marker=marker,
                            orig=li_text,
                            parent=list_group,
                            content_layer=self.content_layer,
                            formatting=annotated_text.formatting,
                            hyperlink=annotated_text.hyperlink,
                        )

                        # 4) recurse into any nested lists, attaching them to this <li> item
                        for sublist in li({"ul", "ol"}, recursive=False):
                            if isinstance(sublist, Tag):
                                self.level += 1
                                self._handle_block(sublist, doc)
                                self.parents[self.level + 1] = None
                                self.level -= 1
                else:
                    # Item without direct text: only its nested lists matter.
                    for sublist in li({"ul", "ol"}, recursive=False):
                        if isinstance(sublist, Tag):
                            self._handle_block(sublist, doc)

                # 5) extract any images under this <li>
                for img_tag in li("img"):
                    if isinstance(img_tag, Tag):
                        self._emit_image(img_tag, doc)

        self.parents[self.level + 1] = None
        self.level -= 1
        return list_group.get_ref()
|
|
1157
|
+
|
|
1158
|
+
@staticmethod
|
|
1159
|
+
def get_html_table_row_col(tag: Tag) -> tuple[int, int]:
|
|
1160
|
+
for t in cast(list[Tag], tag.find_all(["thead", "tbody"], recursive=False)):
|
|
1161
|
+
t.unwrap()
|
|
1162
|
+
# Find the number of rows and columns (taking into account spans)
|
|
1163
|
+
num_rows: int = 0
|
|
1164
|
+
num_cols: int = 0
|
|
1165
|
+
for row in tag("tr", recursive=False):
|
|
1166
|
+
col_count = 0
|
|
1167
|
+
is_row_header = True
|
|
1168
|
+
if not isinstance(row, Tag):
|
|
1169
|
+
continue
|
|
1170
|
+
for cell in row(["td", "th"], recursive=False):
|
|
1171
|
+
if not isinstance(row, Tag):
|
|
1172
|
+
continue
|
|
1173
|
+
cell_tag = cast(Tag, cell)
|
|
1174
|
+
col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
|
|
1175
|
+
col_count += col_span
|
|
1176
|
+
if cell_tag.name == "td" or row_span == 1:
|
|
1177
|
+
is_row_header = False
|
|
1178
|
+
num_cols = max(num_cols, col_count)
|
|
1179
|
+
if not is_row_header:
|
|
1180
|
+
num_rows += 1
|
|
1181
|
+
return num_rows, num_cols
|
|
1182
|
+
|
|
1183
|
+
def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
|
|
1184
|
+
added_refs = []
|
|
1185
|
+
tag_name = tag.name.lower()
|
|
1186
|
+
|
|
1187
|
+
if tag_name == "figure":
|
|
1188
|
+
img_tag = tag.find("img")
|
|
1189
|
+
if isinstance(img_tag, Tag):
|
|
1190
|
+
im_ref = self._emit_image(img_tag, doc)
|
|
1191
|
+
if im_ref is not None:
|
|
1192
|
+
added_refs.append(im_ref)
|
|
1193
|
+
|
|
1194
|
+
elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
|
1195
|
+
heading_refs = self._handle_heading(tag, doc)
|
|
1196
|
+
added_refs.extend(heading_refs)
|
|
1197
|
+
|
|
1198
|
+
elif tag_name in {"ul", "ol"}:
|
|
1199
|
+
list_ref = self._handle_list(tag, doc)
|
|
1200
|
+
added_refs.append(list_ref)
|
|
1201
|
+
|
|
1202
|
+
elif tag_name in {"p", "address", "summary"}:
|
|
1203
|
+
text_list = self._extract_text_and_hyperlink_recursively(
|
|
1204
|
+
tag, find_parent_annotation=True
|
|
1205
|
+
)
|
|
1206
|
+
annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
|
|
1207
|
+
for part in annotated_texts.split_by_newline():
|
|
1208
|
+
with self._use_inline_group(part, doc):
|
|
1209
|
+
for annotated_text in part:
|
|
1210
|
+
if seg := annotated_text.text.strip():
|
|
1211
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
|
1212
|
+
if annotated_text.code:
|
|
1213
|
+
docling_code = doc.add_code(
|
|
1214
|
+
parent=self.parents[self.level],
|
|
1215
|
+
text=seg_clean,
|
|
1216
|
+
content_layer=self.content_layer,
|
|
1217
|
+
formatting=annotated_text.formatting,
|
|
1218
|
+
hyperlink=annotated_text.hyperlink,
|
|
1219
|
+
)
|
|
1220
|
+
added_refs.append(docling_code.get_ref())
|
|
1221
|
+
else:
|
|
1222
|
+
docling_text = doc.add_text(
|
|
1223
|
+
parent=self.parents[self.level],
|
|
1224
|
+
label=DocItemLabel.TEXT,
|
|
1225
|
+
text=seg_clean,
|
|
1226
|
+
content_layer=self.content_layer,
|
|
1227
|
+
formatting=annotated_text.formatting,
|
|
1228
|
+
hyperlink=annotated_text.hyperlink,
|
|
1229
|
+
)
|
|
1230
|
+
added_refs.append(docling_text.get_ref())
|
|
1231
|
+
|
|
1232
|
+
for img_tag in tag("img"):
|
|
1233
|
+
if isinstance(img_tag, Tag):
|
|
1234
|
+
self._emit_image(img_tag, doc)
|
|
1235
|
+
|
|
1236
|
+
elif tag_name == "table":
|
|
1237
|
+
num_rows, num_cols = self.get_html_table_row_col(tag)
|
|
1238
|
+
data_e = TableData(num_rows=num_rows, num_cols=num_cols)
|
|
1239
|
+
docling_table = doc.add_table(
|
|
1240
|
+
data=data_e,
|
|
1241
|
+
parent=self.parents[self.level],
|
|
1242
|
+
content_layer=self.content_layer,
|
|
1243
|
+
)
|
|
1244
|
+
added_refs.append(docling_table.get_ref())
|
|
1245
|
+
self.parse_table_data(tag, doc, docling_table, num_rows, num_cols)
|
|
1246
|
+
|
|
1247
|
+
for img_tag in tag("img"):
|
|
1248
|
+
if isinstance(img_tag, Tag):
|
|
1249
|
+
im_ref2 = self._emit_image(tag, doc)
|
|
1250
|
+
if im_ref2 is not None:
|
|
1251
|
+
added_refs.append(im_ref2)
|
|
1252
|
+
|
|
1253
|
+
elif tag_name in {"pre"}:
|
|
1254
|
+
# handle monospace code snippets (pre).
|
|
1255
|
+
text_list = self._extract_text_and_hyperlink_recursively(
|
|
1256
|
+
tag, find_parent_annotation=True, keep_newlines=True
|
|
1257
|
+
)
|
|
1258
|
+
annotated_texts = text_list.simplify_text_elements()
|
|
1259
|
+
with self._use_inline_group(annotated_texts, doc):
|
|
1260
|
+
for annotated_text in annotated_texts:
|
|
1261
|
+
text_clean = HTMLDocumentBackend._clean_unicode(
|
|
1262
|
+
annotated_text.text.strip()
|
|
1263
|
+
)
|
|
1264
|
+
docling_code2 = doc.add_code(
|
|
1265
|
+
parent=self.parents[self.level],
|
|
1266
|
+
text=text_clean,
|
|
1267
|
+
content_layer=self.content_layer,
|
|
1268
|
+
formatting=annotated_text.formatting,
|
|
1269
|
+
hyperlink=annotated_text.hyperlink,
|
|
1270
|
+
)
|
|
1271
|
+
added_refs.append(docling_code2.get_ref())
|
|
1272
|
+
|
|
1273
|
+
elif tag_name == "footer":
|
|
1274
|
+
with self._use_footer(tag, doc):
|
|
1275
|
+
self._walk(tag, doc)
|
|
1276
|
+
|
|
1277
|
+
elif tag_name == "details":
|
|
1278
|
+
with self._use_details(tag, doc):
|
|
1279
|
+
self._walk(tag, doc)
|
|
1280
|
+
return added_refs
|
|
1281
|
+
|
|
1282
|
+
    def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]:
        """Add an <img> tag as a picture item, with caption when available.

        Caption priority: a wrapping <a> link (alt text + href), then a
        <figcaption> of the enclosing <figure>, then the alt attribute.
        If image fetching is disabled or there is no src, a placeholder
        picture is added instead.

        Args:
            img_tag: The <img> tag.
            doc: The document being built.

        Returns:
            Reference to the added picture item.
        """
        figure = img_tag.find_parent("figure")
        caption: AnnotatedTextList = AnnotatedTextList()

        parent = self.parents[self.level]

        # check if the figure has a link - this is HACK:
        def get_img_hyperlink(img_tag):
            # Nearest ancestor <a href=...> wins.
            this_parent = img_tag.parent
            while this_parent is not None:
                if this_parent.name == "a" and this_parent.get("href"):
                    return this_parent.get("href")
                this_parent = this_parent.parent
            return None

        if img_hyperlink := get_img_hyperlink(img_tag):
            img_text = img_tag.get("alt") or ""
            caption.append(AnnotatedText(text=img_text, hyperlink=img_hyperlink))

        if isinstance(figure, Tag):
            caption_tag = figure.find("figcaption", recursive=False)
            if isinstance(caption_tag, Tag):
                # A figcaption replaces any link-derived caption.
                caption = self._extract_text_and_hyperlink_recursively(
                    caption_tag, find_parent_annotation=True
                )
            if not caption and img_tag.get("alt"):
                caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])

        caption_anno_text = caption.to_single_text_element()

        caption_item: Optional[TextItem] = None
        if caption_anno_text.text:
            text_clean = HTMLDocumentBackend._clean_unicode(
                caption_anno_text.text.strip()
            )
            caption_item = doc.add_text(
                label=DocItemLabel.CAPTION,
                text=text_clean,
                orig=caption_anno_text.text,
                content_layer=self.content_layer,
                formatting=caption_anno_text.formatting,
                hyperlink=caption_anno_text.hyperlink,
            )

        src_loc: str = self._get_attr_as_string(img_tag, "src")
        if not cast(HTMLBackendOptions, self.options).fetch_images or not src_loc:
            # Do not fetch the image, just add a placeholder
            placeholder: PictureItem = doc.add_picture(
                caption=caption_item,
                parent=parent,
                content_layer=self.content_layer,
            )
            return placeholder.get_ref()

        src_loc = self._resolve_relative_path(src_loc)
        # May be None if fetching/decoding fails; add_picture tolerates that.
        img_ref = self._create_image_ref(src_loc)

        docling_pic = doc.add_picture(
            image=img_ref,
            caption=caption_item,
            parent=parent,
            content_layer=self.content_layer,
        )
        return docling_pic.get_ref()
|
|
1346
|
+
|
|
1347
|
+
    def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
        """Fetch and decode the image at *src_url* into an ImageRef.

        Args:
            src_url: Remote URL, data: URI, or local path of the image.

        Returns:
            The ImageRef, or None when loading/decoding fails (a warning is
            emitted instead of raising).
        """
        try:
            img_data = self._load_image_data(src_url)
            if img_data:
                img = Image.open(BytesIO(img_data))
                # PIL reports dpi as a tuple; default to 72 when absent.
                return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
        except (
            requests.HTTPError,
            ValidationError,
            UnidentifiedImageError,
            OperationNotAllowed,
            TypeError,
            ValueError,
        ) as e:
            # Best-effort: a broken image must not fail the whole conversion.
            warnings.warn(f"Could not process an image from {src_url}: {e}")

        return None
|
|
1364
|
+
|
|
1365
|
+
    def _load_image_data(self, src_loc: str) -> Optional[bytes]:
        """Load raw image bytes from a URL, data: URI, or local file.

        Args:
            src_loc: The image source location.

        Returns:
            The image bytes, or None for skipped sources (SVG files).

        Raises:
            OperationNotAllowed: When remote/local fetching is disabled in the
                backend options.
            ValueError: When a local file does not exist or is unreadable.
        """
        # SVG is not raster-decodable here; skip it silently.
        if src_loc.lower().endswith(".svg"):
            _log.debug(f"Skipping SVG file: {src_loc}")
            return None

        if HTMLDocumentBackend._is_remote_url(src_loc):
            if not self.options.enable_remote_fetch:
                raise OperationNotAllowed(
                    "Fetching remote resources is only allowed when set explicitly. "
                    "Set options.enable_remote_fetch=True."
                )
            response = requests.get(src_loc, stream=True)
            response.raise_for_status()
            return response.content
        elif src_loc.startswith("data:"):
            # Inline base64 image: strip the data-URI header and decode.
            data = re.sub(r"^data:image/.+;base64,", "", src_loc)
            return base64.b64decode(data)

        if src_loc.startswith("file://"):
            src_loc = src_loc[7:]

        if not self.options.enable_local_fetch:
            raise OperationNotAllowed(
                "Fetching local resources is only allowed when set explicitly. "
                "Set options.enable_local_fetch=True."
            )
        # add check that file exists and can read
        if os.path.isfile(src_loc) and os.access(src_loc, os.R_OK):
            with open(src_loc, "rb") as f:
                return f.read()
        else:
            raise ValueError("File does not exist or it is not readable.")
|
|
1397
|
+
|
|
1398
|
+
@staticmethod
|
|
1399
|
+
def get_text(item: PageElement) -> str:
|
|
1400
|
+
"""Concatenate all child strings of a PageElement.
|
|
1401
|
+
|
|
1402
|
+
This method is equivalent to `PageElement.get_text()` but also considers
|
|
1403
|
+
certain tags. When called on a <p> or <li> tags, it returns the text with a
|
|
1404
|
+
trailing space, otherwise the text is concatenated without separators.
|
|
1405
|
+
"""
|
|
1406
|
+
|
|
1407
|
+
def _extract_text_recursively(item: PageElement) -> list[str]:
|
|
1408
|
+
"""Recursively extract text from all child nodes."""
|
|
1409
|
+
result: list[str] = []
|
|
1410
|
+
|
|
1411
|
+
if isinstance(item, NavigableString):
|
|
1412
|
+
result = [item]
|
|
1413
|
+
elif isinstance(item, Tag):
|
|
1414
|
+
tag = cast(Tag, item)
|
|
1415
|
+
parts: list[str] = []
|
|
1416
|
+
for child in tag:
|
|
1417
|
+
parts.extend(_extract_text_recursively(child))
|
|
1418
|
+
result.append(
|
|
1419
|
+
"".join(parts) + " " if tag.name in {"p", "li"} else "".join(parts)
|
|
1420
|
+
)
|
|
1421
|
+
|
|
1422
|
+
return result
|
|
1423
|
+
|
|
1424
|
+
parts: list[str] = _extract_text_recursively(item)
|
|
1425
|
+
|
|
1426
|
+
return "".join(parts)
|
|
1427
|
+
|
|
1428
|
+
@staticmethod
|
|
1429
|
+
def _clean_unicode(text: str) -> str:
|
|
1430
|
+
"""Replace typical Unicode characters in HTML for text processing.
|
|
1431
|
+
|
|
1432
|
+
Several Unicode characters (e.g., non-printable or formatting) are typically
|
|
1433
|
+
found in HTML but are worth replacing to sanitize text and ensure consistency
|
|
1434
|
+
in text processing tasks.
|
|
1435
|
+
|
|
1436
|
+
Args:
|
|
1437
|
+
text: The original text.
|
|
1438
|
+
|
|
1439
|
+
Returns:
|
|
1440
|
+
The sanitized text without typical Unicode characters.
|
|
1441
|
+
"""
|
|
1442
|
+
replacements = {
|
|
1443
|
+
"\u00a0": " ", # non-breaking space
|
|
1444
|
+
"\u200b": "", # zero-width space
|
|
1445
|
+
"\u200c": "", # zero-width non-joiner
|
|
1446
|
+
"\u200d": "", # zero-width joiner
|
|
1447
|
+
"\u2010": "-", # hyphen
|
|
1448
|
+
"\u2011": "-", # non-breaking hyphen
|
|
1449
|
+
"\u2012": "-", # dash
|
|
1450
|
+
"\u2013": "-", # dash
|
|
1451
|
+
"\u2014": "-", # dash
|
|
1452
|
+
"\u2015": "-", # horizontal bar
|
|
1453
|
+
"\u2018": "'", # left single quotation mark
|
|
1454
|
+
"\u2019": "'", # right single quotation mark
|
|
1455
|
+
"\u201c": '"', # left double quotation mark
|
|
1456
|
+
"\u201d": '"', # right double quotation mark
|
|
1457
|
+
"\u2026": "...", # ellipsis
|
|
1458
|
+
"\u00ad": "", # soft hyphen
|
|
1459
|
+
"\ufeff": "", # zero width non-break space
|
|
1460
|
+
"\u202f": " ", # narrow non-break space
|
|
1461
|
+
"\u2060": "", # word joiner
|
|
1462
|
+
}
|
|
1463
|
+
for raw, clean in replacements.items():
|
|
1464
|
+
text = text.replace(raw, clean)
|
|
1465
|
+
|
|
1466
|
+
return text
|
|
1467
|
+
|
|
1468
|
+
@staticmethod
|
|
1469
|
+
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
|
|
1470
|
+
"""Extract colspan and rowspan values from a table cell tag.
|
|
1471
|
+
|
|
1472
|
+
This function retrieves the 'colspan' and 'rowspan' attributes from a given
|
|
1473
|
+
table cell tag.
|
|
1474
|
+
If the attribute does not exist or it is not numeric, it defaults to 1.
|
|
1475
|
+
"""
|
|
1476
|
+
raw_spans: tuple[str, str] = (
|
|
1477
|
+
str(cell.get("colspan", "1")),
|
|
1478
|
+
str(cell.get("rowspan", "1")),
|
|
1479
|
+
)
|
|
1480
|
+
|
|
1481
|
+
def _extract_num(s: str) -> int:
|
|
1482
|
+
if s and s[0].isnumeric():
|
|
1483
|
+
match = re.search(r"\d+", s)
|
|
1484
|
+
if match:
|
|
1485
|
+
return int(match.group())
|
|
1486
|
+
return 1
|
|
1487
|
+
|
|
1488
|
+
int_spans: tuple[int, int] = (
|
|
1489
|
+
_extract_num(raw_spans[0]),
|
|
1490
|
+
_extract_num(raw_spans[1]),
|
|
1491
|
+
)
|
|
1492
|
+
|
|
1493
|
+
return int_spans
|
|
1494
|
+
|
|
1495
|
+
@staticmethod
|
|
1496
|
+
def _get_attr_as_string(tag: Tag, attr: str, default: str = "") -> str:
|
|
1497
|
+
"""Get attribute value as string, handling list values."""
|
|
1498
|
+
value = tag.get(attr)
|
|
1499
|
+
if not value:
|
|
1500
|
+
return default
|
|
1501
|
+
|
|
1502
|
+
return value[0] if isinstance(value, list) else value
|