docling 2.44.0__py3-none-any.whl → 2.46.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_v4_backend.py +61 -27
- docling/backend/html_backend.py +356 -80
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/pdf_backend.py +3 -3
- docling/cli/main.py +10 -0
- docling/datamodel/base_models.py +3 -0
- docling/datamodel/document.py +26 -0
- docling/datamodel/pipeline_options.py +1 -3
- docling/datamodel/pipeline_options_vlm_model.py +8 -2
- docling/document_converter.py +4 -0
- docling/models/api_vlm_model.py +2 -5
- docling/models/code_formula_model.py +87 -76
- docling/models/tesseract_ocr_cli_model.py +4 -2
- docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
- docling/models/vlm_models_inline/mlx_model.py +2 -4
- docling/pipeline/base_pipeline.py +14 -5
- docling/pipeline/threaded_standard_pdf_pipeline.py +6 -4
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/METADATA +2 -2
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/RECORD +23 -22
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/WHEEL +0 -0
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/entry_points.txt +0 -0
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/top_level.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -1,8 +1,11 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
|
+
from contextlib import contextmanager
|
4
|
+
from copy import deepcopy
|
3
5
|
from io import BytesIO
|
4
6
|
from pathlib import Path
|
5
7
|
from typing import Final, Optional, Union, cast
|
8
|
+
from urllib.parse import urljoin
|
6
9
|
|
7
10
|
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
8
11
|
from bs4.element import PreformattedString
|
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
|
|
18
21
|
TextItem,
|
19
22
|
)
|
20
23
|
from docling_core.types.doc.document import ContentLayer
|
21
|
-
from pydantic import BaseModel
|
24
|
+
from pydantic import AnyUrl, BaseModel, ValidationError
|
22
25
|
from typing_extensions import override
|
23
26
|
|
24
27
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
@@ -35,6 +38,7 @@ _BLOCK_TAGS: Final = {
|
|
35
38
|
"address",
|
36
39
|
"details",
|
37
40
|
"figure",
|
41
|
+
"footer",
|
38
42
|
"h1",
|
39
43
|
"h2",
|
40
44
|
"h3",
|
@@ -56,12 +60,76 @@ class _Context(BaseModel):
|
|
56
60
|
list_start_by_ref: dict[str, int] = {}
|
57
61
|
|
58
62
|
|
63
|
+
class AnnotatedText(BaseModel):
|
64
|
+
text: str
|
65
|
+
hyperlink: Union[AnyUrl, Path, None] = None
|
66
|
+
|
67
|
+
|
68
|
+
class AnnotatedTextList(list):
|
69
|
+
def to_single_text_element(self) -> AnnotatedText:
|
70
|
+
current_h = None
|
71
|
+
current_text = ""
|
72
|
+
for at in self:
|
73
|
+
t = at.text
|
74
|
+
h = at.hyperlink
|
75
|
+
current_text += t.strip() + " "
|
76
|
+
if h is not None and current_h is None:
|
77
|
+
current_h = h
|
78
|
+
elif h is not None and current_h is not None and h != current_h:
|
79
|
+
_log.warning(
|
80
|
+
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
|
81
|
+
)
|
82
|
+
return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
|
83
|
+
|
84
|
+
def simplify_text_elements(self) -> "AnnotatedTextList":
|
85
|
+
simplified = AnnotatedTextList()
|
86
|
+
if not self:
|
87
|
+
return self
|
88
|
+
text = self[0].text
|
89
|
+
hyperlink = self[0].hyperlink
|
90
|
+
last_elm = text
|
91
|
+
for i in range(1, len(self)):
|
92
|
+
if hyperlink == self[i].hyperlink:
|
93
|
+
sep = " "
|
94
|
+
if not self[i].text.strip() or not last_elm.strip():
|
95
|
+
sep = ""
|
96
|
+
text += sep + self[i].text
|
97
|
+
last_elm = self[i].text
|
98
|
+
else:
|
99
|
+
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
|
100
|
+
text = self[i].text
|
101
|
+
last_elm = text
|
102
|
+
hyperlink = self[i].hyperlink
|
103
|
+
if text:
|
104
|
+
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
|
105
|
+
return simplified
|
106
|
+
|
107
|
+
def split_by_newline(self):
|
108
|
+
super_list = []
|
109
|
+
active_annotated_text_list = AnnotatedTextList()
|
110
|
+
for el in self:
|
111
|
+
sub_texts = el.text.split("\n")
|
112
|
+
if len(sub_texts) == 1:
|
113
|
+
active_annotated_text_list.append(el)
|
114
|
+
else:
|
115
|
+
for text in sub_texts:
|
116
|
+
sub_el = deepcopy(el)
|
117
|
+
sub_el.text = text
|
118
|
+
active_annotated_text_list.append(sub_el)
|
119
|
+
super_list.append(active_annotated_text_list)
|
120
|
+
active_annotated_text_list = AnnotatedTextList()
|
121
|
+
if active_annotated_text_list:
|
122
|
+
super_list.append(active_annotated_text_list)
|
123
|
+
return super_list
|
124
|
+
|
125
|
+
|
59
126
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
60
127
|
@override
|
61
128
|
def __init__(
|
62
129
|
self,
|
63
130
|
in_doc: InputDocument,
|
64
131
|
path_or_stream: Union[BytesIO, Path],
|
132
|
+
original_url: Optional[AnyUrl] = None,
|
65
133
|
):
|
66
134
|
super().__init__(in_doc, path_or_stream)
|
67
135
|
self.soup: Optional[Tag] = None
|
@@ -74,6 +142,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
74
142
|
self.ctx = _Context()
|
75
143
|
for i in range(self.max_levels):
|
76
144
|
self.parents[i] = None
|
145
|
+
self.hyperlink = None
|
146
|
+
self.original_url = original_url
|
77
147
|
|
78
148
|
try:
|
79
149
|
raw = (
|
@@ -160,26 +230,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
160
230
|
element: The XML tag to parse.
|
161
231
|
doc: The Docling document to be updated with the parsed content.
|
162
232
|
"""
|
163
|
-
buffer:
|
233
|
+
buffer: AnnotatedTextList = AnnotatedTextList()
|
164
234
|
|
165
235
|
def flush_buffer():
|
166
236
|
if not buffer:
|
167
237
|
return
|
168
|
-
|
238
|
+
annotated_text_list = buffer.simplify_text_elements()
|
239
|
+
parts = annotated_text_list.split_by_newline()
|
169
240
|
buffer.clear()
|
170
|
-
|
241
|
+
|
242
|
+
if not "".join([el.text for el in annotated_text_list]):
|
171
243
|
return
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
244
|
+
|
245
|
+
for annotated_text_list in parts:
|
246
|
+
with self.use_inline_group(annotated_text_list, doc):
|
247
|
+
for annotated_text in annotated_text_list:
|
248
|
+
if annotated_text.text.strip():
|
249
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(
|
250
|
+
annotated_text.text.strip()
|
251
|
+
)
|
252
|
+
doc.add_text(
|
253
|
+
parent=self.parents[self.level],
|
254
|
+
label=DocItemLabel.TEXT,
|
255
|
+
text=seg_clean,
|
256
|
+
content_layer=self.content_layer,
|
257
|
+
hyperlink=annotated_text.hyperlink,
|
258
|
+
)
|
183
259
|
|
184
260
|
for node in element.contents:
|
185
261
|
if isinstance(node, Tag):
|
@@ -187,6 +263,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
187
263
|
if name == "img":
|
188
264
|
flush_buffer()
|
189
265
|
self._emit_image(node, doc)
|
266
|
+
elif name == "a":
|
267
|
+
with self.use_hyperlink(node):
|
268
|
+
self._walk(node, doc)
|
190
269
|
elif name in _BLOCK_TAGS:
|
191
270
|
flush_buffer()
|
192
271
|
self._handle_block(node, doc)
|
@@ -194,28 +273,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
194
273
|
flush_buffer()
|
195
274
|
self._walk(node, doc)
|
196
275
|
else:
|
197
|
-
buffer.
|
276
|
+
buffer.extend(
|
277
|
+
self._extract_text_and_hyperlink_recursively(
|
278
|
+
node, find_parent_annotation=True, keep_newlines=True
|
279
|
+
)
|
280
|
+
)
|
198
281
|
elif isinstance(node, NavigableString) and not isinstance(
|
199
282
|
node, PreformattedString
|
200
283
|
):
|
201
|
-
|
284
|
+
if str(node).strip("\n\r") == "":
|
285
|
+
flush_buffer()
|
286
|
+
else:
|
287
|
+
buffer.extend(
|
288
|
+
self._extract_text_and_hyperlink_recursively(
|
289
|
+
node, find_parent_annotation=True, keep_newlines=True
|
290
|
+
)
|
291
|
+
)
|
202
292
|
|
203
293
|
flush_buffer()
|
204
294
|
|
295
|
+
def _extract_text_and_hyperlink_recursively(
|
296
|
+
self,
|
297
|
+
item: PageElement,
|
298
|
+
ignore_list=False,
|
299
|
+
find_parent_annotation=False,
|
300
|
+
keep_newlines=False,
|
301
|
+
) -> AnnotatedTextList:
|
302
|
+
result: AnnotatedTextList = AnnotatedTextList()
|
303
|
+
|
304
|
+
# If find_parent_annotation, make sure that we keep track of
|
305
|
+
# any a-tag that has been present in the DOM-parents already.
|
306
|
+
if find_parent_annotation:
|
307
|
+
this_parent = item.parent
|
308
|
+
while this_parent is not None:
|
309
|
+
if this_parent.name == "a" and this_parent.get("href"):
|
310
|
+
with self.use_hyperlink(this_parent):
|
311
|
+
return self._extract_text_and_hyperlink_recursively(
|
312
|
+
item, ignore_list
|
313
|
+
)
|
314
|
+
this_parent = this_parent.parent
|
315
|
+
|
316
|
+
if isinstance(item, PreformattedString):
|
317
|
+
return AnnotatedTextList()
|
318
|
+
|
319
|
+
if isinstance(item, NavigableString):
|
320
|
+
text = item.strip()
|
321
|
+
if text:
|
322
|
+
return AnnotatedTextList(
|
323
|
+
[AnnotatedText(text=text, hyperlink=self.hyperlink)]
|
324
|
+
)
|
325
|
+
if keep_newlines and item.strip("\n\r") == "":
|
326
|
+
return AnnotatedTextList(
|
327
|
+
[AnnotatedText(text="\n", hyperlink=self.hyperlink)]
|
328
|
+
)
|
329
|
+
return AnnotatedTextList()
|
330
|
+
|
331
|
+
tag = cast(Tag, item)
|
332
|
+
if not ignore_list or (tag.name not in ["ul", "ol"]):
|
333
|
+
for child in tag:
|
334
|
+
if isinstance(child, Tag) and child.name == "a":
|
335
|
+
with self.use_hyperlink(child):
|
336
|
+
result.extend(
|
337
|
+
self._extract_text_and_hyperlink_recursively(
|
338
|
+
child, ignore_list, keep_newlines=keep_newlines
|
339
|
+
)
|
340
|
+
)
|
341
|
+
else:
|
342
|
+
# Recursively get the child's text content
|
343
|
+
result.extend(
|
344
|
+
self._extract_text_and_hyperlink_recursively(
|
345
|
+
child, ignore_list, keep_newlines=keep_newlines
|
346
|
+
)
|
347
|
+
)
|
348
|
+
return result
|
349
|
+
|
350
|
+
@contextmanager
|
351
|
+
def use_hyperlink(self, tag):
|
352
|
+
this_href = tag.get("href")
|
353
|
+
if this_href is None:
|
354
|
+
yield None
|
355
|
+
else:
|
356
|
+
if this_href:
|
357
|
+
old_hyperlink = self.hyperlink
|
358
|
+
if self.original_url is not None:
|
359
|
+
this_href = urljoin(self.original_url, this_href)
|
360
|
+
# ugly fix for relative links since pydantic does not support them.
|
361
|
+
try:
|
362
|
+
AnyUrl(this_href)
|
363
|
+
except ValidationError:
|
364
|
+
this_href = Path(this_href)
|
365
|
+
self.hyperlink = this_href
|
366
|
+
try:
|
367
|
+
yield None
|
368
|
+
finally:
|
369
|
+
if this_href:
|
370
|
+
self.hyperlink = old_hyperlink
|
371
|
+
|
372
|
+
@contextmanager
|
373
|
+
def use_inline_group(
|
374
|
+
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
|
375
|
+
):
|
376
|
+
"""Create an inline group for annotated texts.
|
377
|
+
|
378
|
+
Checks if annotated_text_list has more than one item and if so creates an inline
|
379
|
+
group in which the text elements can then be generated. While the context manager
|
380
|
+
is active the inline group is set as the current parent.
|
381
|
+
|
382
|
+
Args:
|
383
|
+
annotated_text_list (AnnotatedTextList): Annotated text
|
384
|
+
doc (DoclingDocument): Currently used document
|
385
|
+
|
386
|
+
Yields:
|
387
|
+
None: _description_
|
388
|
+
"""
|
389
|
+
if len(annotated_text_list) > 1:
|
390
|
+
inline_fmt = doc.add_group(
|
391
|
+
label=GroupLabel.INLINE,
|
392
|
+
parent=self.parents[self.level],
|
393
|
+
content_layer=self.content_layer,
|
394
|
+
)
|
395
|
+
self.parents[self.level + 1] = inline_fmt
|
396
|
+
self.level += 1
|
397
|
+
try:
|
398
|
+
yield None
|
399
|
+
finally:
|
400
|
+
self.parents[self.level] = None
|
401
|
+
self.level -= 1
|
402
|
+
else:
|
403
|
+
yield None
|
404
|
+
|
205
405
|
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
206
406
|
tag_name = tag.name.lower()
|
207
407
|
# set default content layer to BODY as soon as we encounter a heading
|
208
408
|
self.content_layer = ContentLayer.BODY
|
209
409
|
level = int(tag_name[1])
|
210
|
-
|
211
|
-
|
410
|
+
annotated_text_list = self._extract_text_and_hyperlink_recursively(
|
411
|
+
tag, find_parent_annotation=True
|
412
|
+
)
|
413
|
+
annotated_text = annotated_text_list.to_single_text_element()
|
414
|
+
text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
|
212
415
|
# the first level is for the title item
|
213
416
|
if level == 1:
|
214
417
|
for key in self.parents.keys():
|
215
418
|
self.parents[key] = None
|
216
419
|
self.level = 0
|
217
420
|
self.parents[self.level + 1] = doc.add_title(
|
218
|
-
|
421
|
+
text_clean,
|
422
|
+
content_layer=self.content_layer,
|
423
|
+
hyperlink=annotated_text.hyperlink,
|
219
424
|
)
|
220
425
|
# the other levels need to be lowered by 1 if a title was set
|
221
426
|
else:
|
@@ -241,9 +446,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
241
446
|
self.parents[self.level + 1] = doc.add_heading(
|
242
447
|
parent=self.parents[self.level],
|
243
448
|
text=text_clean,
|
244
|
-
orig=text,
|
449
|
+
orig=annotated_text.text,
|
245
450
|
level=self.level,
|
246
451
|
content_layer=self.content_layer,
|
452
|
+
hyperlink=annotated_text.hyperlink,
|
247
453
|
)
|
248
454
|
self.level += 1
|
249
455
|
for img_tag in tag("img"):
|
@@ -292,37 +498,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
292
498
|
marker = ""
|
293
499
|
|
294
500
|
# 2) extract only the "direct" text from this <li>
|
295
|
-
parts
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
text_part = HTMLDocumentBackend.get_text(child)
|
303
|
-
if text_part:
|
304
|
-
parts.append(text_part)
|
305
|
-
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
|
306
|
-
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
501
|
+
parts = self._extract_text_and_hyperlink_recursively(
|
502
|
+
li, ignore_list=True, find_parent_annotation=True
|
503
|
+
)
|
504
|
+
min_parts = parts.simplify_text_elements()
|
505
|
+
li_text = re.sub(
|
506
|
+
r"\s+|\n+", " ", "".join([el.text for el in min_parts])
|
507
|
+
).strip()
|
307
508
|
|
308
509
|
# 3) add the list item
|
309
510
|
if li_text:
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
511
|
+
if len(min_parts) > 1:
|
512
|
+
# create an empty list element in order to hook the inline group onto that one
|
513
|
+
self.parents[self.level + 1] = doc.add_list_item(
|
514
|
+
text="",
|
515
|
+
enumerated=is_ordered,
|
516
|
+
marker=marker,
|
517
|
+
parent=list_group,
|
518
|
+
content_layer=self.content_layer,
|
519
|
+
)
|
520
|
+
self.level += 1
|
521
|
+
with self.use_inline_group(min_parts, doc):
|
522
|
+
for annotated_text in min_parts:
|
523
|
+
li_text = re.sub(
|
524
|
+
r"\s+|\n+", " ", annotated_text.text
|
525
|
+
).strip()
|
526
|
+
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
527
|
+
doc.add_text(
|
528
|
+
parent=self.parents[self.level],
|
529
|
+
label=DocItemLabel.TEXT,
|
530
|
+
text=li_clean,
|
531
|
+
content_layer=self.content_layer,
|
532
|
+
hyperlink=annotated_text.hyperlink,
|
533
|
+
)
|
534
|
+
|
535
|
+
# 4) recurse into any nested lists, attaching them to this <li> item
|
536
|
+
for sublist in li({"ul", "ol"}, recursive=False):
|
537
|
+
if isinstance(sublist, Tag):
|
538
|
+
self._handle_block(sublist, doc)
|
539
|
+
|
540
|
+
# now the list element with inline group is not a parent anymore
|
541
|
+
self.parents[self.level] = None
|
542
|
+
self.level -= 1
|
543
|
+
else:
|
544
|
+
annotated_text = min_parts[0]
|
545
|
+
li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
|
546
|
+
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
547
|
+
self.parents[self.level + 1] = doc.add_list_item(
|
548
|
+
text=li_clean,
|
549
|
+
enumerated=is_ordered,
|
550
|
+
marker=marker,
|
551
|
+
orig=li_text,
|
552
|
+
parent=list_group,
|
553
|
+
content_layer=self.content_layer,
|
554
|
+
hyperlink=annotated_text.hyperlink,
|
555
|
+
)
|
556
|
+
|
557
|
+
# 4) recurse into any nested lists, attaching them to this <li> item
|
558
|
+
for sublist in li({"ul", "ol"}, recursive=False):
|
559
|
+
if isinstance(sublist, Tag):
|
560
|
+
self.level += 1
|
561
|
+
self._handle_block(sublist, doc)
|
562
|
+
self.parents[self.level + 1] = None
|
563
|
+
self.level -= 1
|
326
564
|
else:
|
327
565
|
for sublist in li({"ul", "ol"}, recursive=False):
|
328
566
|
if isinstance(sublist, Tag):
|
@@ -351,17 +589,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
351
589
|
self._handle_list(tag, doc)
|
352
590
|
|
353
591
|
elif tag_name in {"p", "address", "summary"}:
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
592
|
+
text_list = self._extract_text_and_hyperlink_recursively(
|
593
|
+
tag, find_parent_annotation=True
|
594
|
+
)
|
595
|
+
annotated_texts = text_list.simplify_text_elements()
|
596
|
+
for part in annotated_texts.split_by_newline():
|
597
|
+
with self.use_inline_group(part, doc):
|
598
|
+
for annotated_text in part:
|
599
|
+
if seg := annotated_text.text.strip():
|
600
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
601
|
+
doc.add_text(
|
602
|
+
parent=self.parents[self.level],
|
603
|
+
label=DocItemLabel.TEXT,
|
604
|
+
text=seg_clean,
|
605
|
+
content_layer=self.content_layer,
|
606
|
+
hyperlink=annotated_text.hyperlink,
|
607
|
+
)
|
608
|
+
|
365
609
|
for img_tag in tag("img"):
|
366
610
|
if isinstance(img_tag, Tag):
|
367
611
|
self._emit_image(img_tag, doc)
|
@@ -380,20 +624,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
380
624
|
|
381
625
|
elif tag_name in {"pre", "code"}:
|
382
626
|
# handle monospace code snippets (pre).
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
627
|
+
text_list = self._extract_text_and_hyperlink_recursively(
|
628
|
+
tag, find_parent_annotation=True
|
629
|
+
)
|
630
|
+
annotated_texts = text_list.simplify_text_elements()
|
631
|
+
with self.use_inline_group(annotated_texts, doc):
|
632
|
+
for annotated_text in annotated_texts:
|
633
|
+
text_clean = HTMLDocumentBackend._clean_unicode(
|
634
|
+
annotated_text.text.strip()
|
635
|
+
)
|
636
|
+
doc.add_code(
|
637
|
+
parent=self.parents[self.level],
|
638
|
+
text=text_clean,
|
639
|
+
content_layer=self.content_layer,
|
640
|
+
hyperlink=annotated_text.hyperlink,
|
641
|
+
)
|
392
642
|
|
393
|
-
elif tag_name
|
394
|
-
|
643
|
+
elif tag_name in {"details", "footer"}:
|
644
|
+
if tag_name == "footer":
|
645
|
+
current_layer = self.content_layer
|
646
|
+
self.content_layer = ContentLayer.FURNITURE
|
395
647
|
self.parents[self.level + 1] = doc.add_group(
|
396
|
-
name=
|
648
|
+
name=tag_name,
|
397
649
|
label=GroupLabel.SECTION,
|
398
650
|
parent=self.parents[self.level],
|
399
651
|
content_layer=self.content_layer,
|
@@ -402,25 +654,49 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
402
654
|
self._walk(tag, doc)
|
403
655
|
self.parents[self.level + 1] = None
|
404
656
|
self.level -= 1
|
657
|
+
if tag_name == "footer":
|
658
|
+
self.content_layer = current_layer
|
405
659
|
|
406
660
|
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
407
661
|
figure = img_tag.find_parent("figure")
|
408
|
-
caption:
|
662
|
+
caption: AnnotatedTextList = AnnotatedTextList()
|
663
|
+
|
664
|
+
# check if the figure has a link - this is HACK:
|
665
|
+
def get_img_hyperlink(img_tag):
|
666
|
+
this_parent = img_tag.parent
|
667
|
+
while this_parent is not None:
|
668
|
+
if this_parent.name == "a" and this_parent.get("href"):
|
669
|
+
return this_parent.get("href")
|
670
|
+
this_parent = this_parent.parent
|
671
|
+
return None
|
672
|
+
|
673
|
+
if img_hyperlink := get_img_hyperlink(img_tag):
|
674
|
+
caption.append(
|
675
|
+
AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
|
676
|
+
)
|
677
|
+
|
409
678
|
if isinstance(figure, Tag):
|
410
679
|
caption_tag = figure.find("figcaption", recursive=False)
|
411
680
|
if isinstance(caption_tag, Tag):
|
412
|
-
caption =
|
413
|
-
|
414
|
-
|
681
|
+
caption = self._extract_text_and_hyperlink_recursively(
|
682
|
+
caption_tag, find_parent_annotation=True
|
683
|
+
)
|
684
|
+
if not caption and img_tag.get("alt"):
|
685
|
+
caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
|
686
|
+
|
687
|
+
caption_anno_text = caption.to_single_text_element()
|
415
688
|
|
416
689
|
caption_item: Optional[TextItem] = None
|
417
|
-
if
|
418
|
-
|
690
|
+
if caption_anno_text.text:
|
691
|
+
text_clean = HTMLDocumentBackend._clean_unicode(
|
692
|
+
caption_anno_text.text.strip()
|
693
|
+
)
|
419
694
|
caption_item = doc.add_text(
|
420
695
|
label=DocItemLabel.CAPTION,
|
421
|
-
text=
|
422
|
-
orig=
|
696
|
+
text=text_clean,
|
697
|
+
orig=caption_anno_text.text,
|
423
698
|
content_layer=self.content_layer,
|
699
|
+
hyperlink=caption_anno_text.hyperlink,
|
424
700
|
)
|
425
701
|
|
426
702
|
doc.add_picture(
|