docling 2.43.0__py3-none-any.whl → 2.45.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +406 -69
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/pdf_backend.py +3 -3
- docling/cli/main.py +16 -0
- docling/datamodel/base_models.py +3 -0
- docling/datamodel/document.py +26 -0
- docling/datamodel/pipeline_options_vlm_model.py +8 -2
- docling/document_converter.py +34 -0
- docling/models/api_vlm_model.py +2 -5
- docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
- docling/models/vlm_models_inline/mlx_model.py +4 -6
- docling/pipeline/base_pipeline.py +7 -4
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/METADATA +2 -2
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/RECORD +18 -17
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/WHEEL +0 -0
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/entry_points.txt +0 -0
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/top_level.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -1,8 +1,11 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
|
+
from contextlib import contextmanager
|
4
|
+
from copy import deepcopy
|
3
5
|
from io import BytesIO
|
4
6
|
from pathlib import Path
|
5
7
|
from typing import Final, Optional, Union, cast
|
8
|
+
from urllib.parse import urljoin
|
6
9
|
|
7
10
|
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
8
11
|
from bs4.element import PreformattedString
|
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
|
|
18
21
|
TextItem,
|
19
22
|
)
|
20
23
|
from docling_core.types.doc.document import ContentLayer
|
21
|
-
from pydantic import BaseModel
|
24
|
+
from pydantic import AnyUrl, BaseModel, ValidationError
|
22
25
|
from typing_extensions import override
|
23
26
|
|
24
27
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
@@ -56,12 +59,76 @@ class _Context(BaseModel):
|
|
56
59
|
list_start_by_ref: dict[str, int] = {}
|
57
60
|
|
58
61
|
|
62
|
+
class AnnotatedText(BaseModel):
|
63
|
+
text: str
|
64
|
+
hyperlink: Union[AnyUrl, Path, None] = None
|
65
|
+
|
66
|
+
|
67
|
+
class AnnotatedTextList(list):
|
68
|
+
def to_single_text_element(self) -> AnnotatedText:
|
69
|
+
current_h = None
|
70
|
+
current_text = ""
|
71
|
+
for at in self:
|
72
|
+
t = at.text
|
73
|
+
h = at.hyperlink
|
74
|
+
current_text += t.strip() + " "
|
75
|
+
if h is not None and current_h is None:
|
76
|
+
current_h = h
|
77
|
+
elif h is not None and current_h is not None and h != current_h:
|
78
|
+
_log.warning(
|
79
|
+
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
|
80
|
+
)
|
81
|
+
return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
|
82
|
+
|
83
|
+
def simplify_text_elements(self) -> "AnnotatedTextList":
|
84
|
+
simplified = AnnotatedTextList()
|
85
|
+
if not self:
|
86
|
+
return self
|
87
|
+
text = self[0].text
|
88
|
+
hyperlink = self[0].hyperlink
|
89
|
+
last_elm = text
|
90
|
+
for i in range(1, len(self)):
|
91
|
+
if hyperlink == self[i].hyperlink:
|
92
|
+
sep = " "
|
93
|
+
if not self[i].text.strip() or not last_elm.strip():
|
94
|
+
sep = ""
|
95
|
+
text += sep + self[i].text
|
96
|
+
last_elm = self[i].text
|
97
|
+
else:
|
98
|
+
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
|
99
|
+
text = self[i].text
|
100
|
+
last_elm = text
|
101
|
+
hyperlink = self[i].hyperlink
|
102
|
+
if text:
|
103
|
+
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
|
104
|
+
return simplified
|
105
|
+
|
106
|
+
def split_by_newline(self):
|
107
|
+
super_list = []
|
108
|
+
active_annotated_text_list = AnnotatedTextList()
|
109
|
+
for el in self:
|
110
|
+
sub_texts = el.text.split("\n")
|
111
|
+
if len(sub_texts) == 1:
|
112
|
+
active_annotated_text_list.append(el)
|
113
|
+
else:
|
114
|
+
for text in sub_texts:
|
115
|
+
sub_el = deepcopy(el)
|
116
|
+
sub_el.text = text
|
117
|
+
active_annotated_text_list.append(sub_el)
|
118
|
+
super_list.append(active_annotated_text_list)
|
119
|
+
active_annotated_text_list = AnnotatedTextList()
|
120
|
+
if active_annotated_text_list:
|
121
|
+
super_list.append(active_annotated_text_list)
|
122
|
+
return super_list
|
123
|
+
|
124
|
+
|
59
125
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
60
126
|
@override
|
61
127
|
def __init__(
|
62
128
|
self,
|
63
129
|
in_doc: InputDocument,
|
64
130
|
path_or_stream: Union[BytesIO, Path],
|
131
|
+
original_url: Optional[AnyUrl] = None,
|
65
132
|
):
|
66
133
|
super().__init__(in_doc, path_or_stream)
|
67
134
|
self.soup: Optional[Tag] = None
|
@@ -74,6 +141,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
74
141
|
self.ctx = _Context()
|
75
142
|
for i in range(self.max_levels):
|
76
143
|
self.parents[i] = None
|
144
|
+
self.hyperlink = None
|
145
|
+
self.original_url = original_url
|
77
146
|
|
78
147
|
try:
|
79
148
|
raw = (
|
@@ -125,8 +194,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
125
194
|
# set the title as furniture, since it is part of the document metadata
|
126
195
|
title = self.soup.title
|
127
196
|
if title:
|
197
|
+
title_text = title.get_text(separator=" ", strip=True)
|
198
|
+
title_clean = HTMLDocumentBackend._clean_unicode(title_text)
|
128
199
|
doc.add_title(
|
129
|
-
text=
|
200
|
+
text=title_clean,
|
201
|
+
orig=title_text,
|
130
202
|
content_layer=ContentLayer.FURNITURE,
|
131
203
|
)
|
132
204
|
# remove scripts/styles
|
@@ -157,24 +229,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
157
229
|
element: The XML tag to parse.
|
158
230
|
doc: The Docling document to be updated with the parsed content.
|
159
231
|
"""
|
160
|
-
buffer:
|
232
|
+
buffer: AnnotatedTextList = AnnotatedTextList()
|
161
233
|
|
162
234
|
def flush_buffer():
|
163
235
|
if not buffer:
|
164
236
|
return
|
165
|
-
|
237
|
+
annotated_text_list = buffer.simplify_text_elements()
|
238
|
+
parts = annotated_text_list.split_by_newline()
|
166
239
|
buffer.clear()
|
167
|
-
|
240
|
+
|
241
|
+
if not "".join([el.text for el in annotated_text_list]):
|
168
242
|
return
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
243
|
+
|
244
|
+
for annotated_text_list in parts:
|
245
|
+
with self.use_inline_group(annotated_text_list, doc):
|
246
|
+
for annotated_text in annotated_text_list:
|
247
|
+
if annotated_text.text.strip():
|
248
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(
|
249
|
+
annotated_text.text.strip()
|
250
|
+
)
|
251
|
+
doc.add_text(
|
252
|
+
parent=self.parents[self.level],
|
253
|
+
label=DocItemLabel.TEXT,
|
254
|
+
text=seg_clean,
|
255
|
+
content_layer=self.content_layer,
|
256
|
+
hyperlink=annotated_text.hyperlink,
|
257
|
+
)
|
178
258
|
|
179
259
|
for node in element.contents:
|
180
260
|
if isinstance(node, Tag):
|
@@ -182,6 +262,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
182
262
|
if name == "img":
|
183
263
|
flush_buffer()
|
184
264
|
self._emit_image(node, doc)
|
265
|
+
elif name == "a":
|
266
|
+
with self.use_hyperlink(node):
|
267
|
+
self._walk(node, doc)
|
185
268
|
elif name in _BLOCK_TAGS:
|
186
269
|
flush_buffer()
|
187
270
|
self._handle_block(node, doc)
|
@@ -189,27 +272,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
189
272
|
flush_buffer()
|
190
273
|
self._walk(node, doc)
|
191
274
|
else:
|
192
|
-
buffer.
|
275
|
+
buffer.extend(
|
276
|
+
self._extract_text_and_hyperlink_recursively(
|
277
|
+
node, find_parent_annotation=True, keep_newlines=True
|
278
|
+
)
|
279
|
+
)
|
193
280
|
elif isinstance(node, NavigableString) and not isinstance(
|
194
281
|
node, PreformattedString
|
195
282
|
):
|
196
|
-
|
283
|
+
if str(node).strip("\n\r") == "":
|
284
|
+
flush_buffer()
|
285
|
+
else:
|
286
|
+
buffer.extend(
|
287
|
+
self._extract_text_and_hyperlink_recursively(
|
288
|
+
node, find_parent_annotation=True, keep_newlines=True
|
289
|
+
)
|
290
|
+
)
|
197
291
|
|
198
292
|
flush_buffer()
|
199
293
|
|
294
|
+
def _extract_text_and_hyperlink_recursively(
|
295
|
+
self,
|
296
|
+
item: PageElement,
|
297
|
+
ignore_list=False,
|
298
|
+
find_parent_annotation=False,
|
299
|
+
keep_newlines=False,
|
300
|
+
) -> AnnotatedTextList:
|
301
|
+
result: AnnotatedTextList = AnnotatedTextList()
|
302
|
+
|
303
|
+
# If find_parent_annotation, make sure that we keep track of
|
304
|
+
# any a-tag that has been present in the DOM-parents already.
|
305
|
+
if find_parent_annotation:
|
306
|
+
this_parent = item.parent
|
307
|
+
while this_parent is not None:
|
308
|
+
if this_parent.name == "a" and this_parent.get("href"):
|
309
|
+
with self.use_hyperlink(this_parent):
|
310
|
+
return self._extract_text_and_hyperlink_recursively(
|
311
|
+
item, ignore_list
|
312
|
+
)
|
313
|
+
this_parent = this_parent.parent
|
314
|
+
|
315
|
+
if isinstance(item, PreformattedString):
|
316
|
+
return AnnotatedTextList()
|
317
|
+
|
318
|
+
if isinstance(item, NavigableString):
|
319
|
+
text = item.strip()
|
320
|
+
if text:
|
321
|
+
return AnnotatedTextList(
|
322
|
+
[AnnotatedText(text=text, hyperlink=self.hyperlink)]
|
323
|
+
)
|
324
|
+
if keep_newlines and item.strip("\n\r") == "":
|
325
|
+
return AnnotatedTextList(
|
326
|
+
[AnnotatedText(text="\n", hyperlink=self.hyperlink)]
|
327
|
+
)
|
328
|
+
return AnnotatedTextList()
|
329
|
+
|
330
|
+
tag = cast(Tag, item)
|
331
|
+
if not ignore_list or (tag.name not in ["ul", "ol"]):
|
332
|
+
for child in tag:
|
333
|
+
if isinstance(child, Tag) and child.name == "a":
|
334
|
+
with self.use_hyperlink(child):
|
335
|
+
result.extend(
|
336
|
+
self._extract_text_and_hyperlink_recursively(
|
337
|
+
child, ignore_list, keep_newlines=keep_newlines
|
338
|
+
)
|
339
|
+
)
|
340
|
+
else:
|
341
|
+
# Recursively get the child's text content
|
342
|
+
result.extend(
|
343
|
+
self._extract_text_and_hyperlink_recursively(
|
344
|
+
child, ignore_list, keep_newlines=keep_newlines
|
345
|
+
)
|
346
|
+
)
|
347
|
+
return result
|
348
|
+
|
349
|
+
@contextmanager
|
350
|
+
def use_hyperlink(self, tag):
|
351
|
+
this_href = tag.get("href")
|
352
|
+
if this_href is None:
|
353
|
+
yield None
|
354
|
+
else:
|
355
|
+
if this_href:
|
356
|
+
old_hyperlink = self.hyperlink
|
357
|
+
if self.original_url is not None:
|
358
|
+
this_href = urljoin(self.original_url, this_href)
|
359
|
+
# ugly fix for relative links since pydantic does not support them.
|
360
|
+
try:
|
361
|
+
AnyUrl(this_href)
|
362
|
+
except ValidationError:
|
363
|
+
this_href = Path(this_href)
|
364
|
+
self.hyperlink = this_href
|
365
|
+
try:
|
366
|
+
yield None
|
367
|
+
finally:
|
368
|
+
if this_href:
|
369
|
+
self.hyperlink = old_hyperlink
|
370
|
+
|
371
|
+
@contextmanager
|
372
|
+
def use_inline_group(
|
373
|
+
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
|
374
|
+
):
|
375
|
+
"""Create an inline group for annotated texts.
|
376
|
+
|
377
|
+
Checks if annotated_text_list has more than one item and if so creates an inline
|
378
|
+
group in which the text elements can then be generated. While the context manager
|
379
|
+
is active the inline group is set as the current parent.
|
380
|
+
|
381
|
+
Args:
|
382
|
+
annotated_text_list (AnnotatedTextList): Annotated text
|
383
|
+
doc (DoclingDocument): Currently used document
|
384
|
+
|
385
|
+
Yields:
|
386
|
+
None: _description_
|
387
|
+
"""
|
388
|
+
if len(annotated_text_list) > 1:
|
389
|
+
inline_fmt = doc.add_group(
|
390
|
+
label=GroupLabel.INLINE,
|
391
|
+
parent=self.parents[self.level],
|
392
|
+
content_layer=self.content_layer,
|
393
|
+
)
|
394
|
+
self.parents[self.level + 1] = inline_fmt
|
395
|
+
self.level += 1
|
396
|
+
try:
|
397
|
+
yield None
|
398
|
+
finally:
|
399
|
+
self.parents[self.level] = None
|
400
|
+
self.level -= 1
|
401
|
+
else:
|
402
|
+
yield None
|
403
|
+
|
200
404
|
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
201
405
|
tag_name = tag.name.lower()
|
202
406
|
# set default content layer to BODY as soon as we encounter a heading
|
203
407
|
self.content_layer = ContentLayer.BODY
|
204
408
|
level = int(tag_name[1])
|
205
|
-
|
409
|
+
annotated_text_list = self._extract_text_and_hyperlink_recursively(
|
410
|
+
tag, find_parent_annotation=True
|
411
|
+
)
|
412
|
+
annotated_text = annotated_text_list.to_single_text_element()
|
413
|
+
text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
|
206
414
|
# the first level is for the title item
|
207
415
|
if level == 1:
|
208
416
|
for key in self.parents.keys():
|
209
417
|
self.parents[key] = None
|
210
418
|
self.level = 0
|
211
419
|
self.parents[self.level + 1] = doc.add_title(
|
212
|
-
|
420
|
+
text_clean,
|
421
|
+
content_layer=self.content_layer,
|
422
|
+
hyperlink=annotated_text.hyperlink,
|
213
423
|
)
|
214
424
|
# the other levels need to be lowered by 1 if a title was set
|
215
425
|
else:
|
@@ -234,9 +444,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
234
444
|
self.level = level
|
235
445
|
self.parents[self.level + 1] = doc.add_heading(
|
236
446
|
parent=self.parents[self.level],
|
237
|
-
text=
|
447
|
+
text=text_clean,
|
448
|
+
orig=annotated_text.text,
|
238
449
|
level=self.level,
|
239
450
|
content_layer=self.content_layer,
|
451
|
+
hyperlink=annotated_text.hyperlink,
|
240
452
|
)
|
241
453
|
self.level += 1
|
242
454
|
for img_tag in tag("img"):
|
@@ -285,35 +497,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
285
497
|
marker = ""
|
286
498
|
|
287
499
|
# 2) extract only the "direct" text from this <li>
|
288
|
-
parts
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
text_part = HTMLDocumentBackend.get_text(child)
|
296
|
-
if text_part:
|
297
|
-
parts.append(text_part)
|
298
|
-
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
|
500
|
+
parts = self._extract_text_and_hyperlink_recursively(
|
501
|
+
li, ignore_list=True, find_parent_annotation=True
|
502
|
+
)
|
503
|
+
min_parts = parts.simplify_text_elements()
|
504
|
+
li_text = re.sub(
|
505
|
+
r"\s+|\n+", " ", "".join([el.text for el in min_parts])
|
506
|
+
).strip()
|
299
507
|
|
300
508
|
# 3) add the list item
|
301
509
|
if li_text:
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
510
|
+
if len(min_parts) > 1:
|
511
|
+
# create an empty list element in order to hook the inline group onto that one
|
512
|
+
self.parents[self.level + 1] = doc.add_list_item(
|
513
|
+
text="",
|
514
|
+
enumerated=is_ordered,
|
515
|
+
marker=marker,
|
516
|
+
parent=list_group,
|
517
|
+
content_layer=self.content_layer,
|
518
|
+
)
|
519
|
+
self.level += 1
|
520
|
+
with self.use_inline_group(min_parts, doc):
|
521
|
+
for annotated_text in min_parts:
|
522
|
+
li_text = re.sub(
|
523
|
+
r"\s+|\n+", " ", annotated_text.text
|
524
|
+
).strip()
|
525
|
+
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
526
|
+
doc.add_text(
|
527
|
+
parent=self.parents[self.level],
|
528
|
+
label=DocItemLabel.TEXT,
|
529
|
+
text=li_clean,
|
530
|
+
content_layer=self.content_layer,
|
531
|
+
hyperlink=annotated_text.hyperlink,
|
532
|
+
)
|
533
|
+
|
534
|
+
# 4) recurse into any nested lists, attaching them to this <li> item
|
535
|
+
for sublist in li({"ul", "ol"}, recursive=False):
|
536
|
+
if isinstance(sublist, Tag):
|
537
|
+
self._handle_block(sublist, doc)
|
538
|
+
|
539
|
+
# now the list element with inline group is not a parent anymore
|
540
|
+
self.parents[self.level] = None
|
541
|
+
self.level -= 1
|
542
|
+
else:
|
543
|
+
annotated_text = min_parts[0]
|
544
|
+
li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
|
545
|
+
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
546
|
+
self.parents[self.level + 1] = doc.add_list_item(
|
547
|
+
text=li_clean,
|
548
|
+
enumerated=is_ordered,
|
549
|
+
marker=marker,
|
550
|
+
orig=li_text,
|
551
|
+
parent=list_group,
|
552
|
+
content_layer=self.content_layer,
|
553
|
+
hyperlink=annotated_text.hyperlink,
|
554
|
+
)
|
555
|
+
|
556
|
+
# 4) recurse into any nested lists, attaching them to this <li> item
|
557
|
+
for sublist in li({"ul", "ol"}, recursive=False):
|
558
|
+
if isinstance(sublist, Tag):
|
559
|
+
self.level += 1
|
560
|
+
self._handle_block(sublist, doc)
|
561
|
+
self.parents[self.level + 1] = None
|
562
|
+
self.level -= 1
|
317
563
|
else:
|
318
564
|
for sublist in li({"ul", "ol"}, recursive=False):
|
319
565
|
if isinstance(sublist, Tag):
|
@@ -342,15 +588,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
342
588
|
self._handle_list(tag, doc)
|
343
589
|
|
344
590
|
elif tag_name in {"p", "address", "summary"}:
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
591
|
+
text_list = self._extract_text_and_hyperlink_recursively(
|
592
|
+
tag, find_parent_annotation=True
|
593
|
+
)
|
594
|
+
annotated_texts = text_list.simplify_text_elements()
|
595
|
+
for part in annotated_texts.split_by_newline():
|
596
|
+
with self.use_inline_group(part, doc):
|
597
|
+
for annotated_text in part:
|
598
|
+
if seg := annotated_text.text.strip():
|
599
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
600
|
+
doc.add_text(
|
601
|
+
parent=self.parents[self.level],
|
602
|
+
label=DocItemLabel.TEXT,
|
603
|
+
text=seg_clean,
|
604
|
+
content_layer=self.content_layer,
|
605
|
+
hyperlink=annotated_text.hyperlink,
|
606
|
+
)
|
607
|
+
|
354
608
|
for img_tag in tag("img"):
|
355
609
|
if isinstance(img_tag, Tag):
|
356
610
|
self._emit_image(img_tag, doc)
|
@@ -369,13 +623,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
369
623
|
|
370
624
|
elif tag_name in {"pre", "code"}:
|
371
625
|
# handle monospace code snippets (pre).
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
626
|
+
text_list = self._extract_text_and_hyperlink_recursively(
|
627
|
+
tag, find_parent_annotation=True
|
628
|
+
)
|
629
|
+
annotated_texts = text_list.simplify_text_elements()
|
630
|
+
with self.use_inline_group(annotated_texts, doc):
|
631
|
+
for annotated_text in annotated_texts:
|
632
|
+
text_clean = HTMLDocumentBackend._clean_unicode(
|
633
|
+
annotated_text.text.strip()
|
634
|
+
)
|
635
|
+
doc.add_code(
|
636
|
+
parent=self.parents[self.level],
|
637
|
+
text=text_clean,
|
638
|
+
content_layer=self.content_layer,
|
639
|
+
hyperlink=annotated_text.hyperlink,
|
640
|
+
)
|
379
641
|
|
380
642
|
elif tag_name == "details":
|
381
643
|
# handle details and its content.
|
@@ -392,18 +654,45 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
392
654
|
|
393
655
|
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
394
656
|
figure = img_tag.find_parent("figure")
|
395
|
-
caption:
|
657
|
+
caption: AnnotatedTextList = AnnotatedTextList()
|
658
|
+
|
659
|
+
# check if the figure has a link - this is HACK:
|
660
|
+
def get_img_hyperlink(img_tag):
|
661
|
+
this_parent = img_tag.parent
|
662
|
+
while this_parent is not None:
|
663
|
+
if this_parent.name == "a" and this_parent.get("href"):
|
664
|
+
return this_parent.get("href")
|
665
|
+
this_parent = this_parent.parent
|
666
|
+
return None
|
667
|
+
|
668
|
+
if img_hyperlink := get_img_hyperlink(img_tag):
|
669
|
+
caption.append(
|
670
|
+
AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
|
671
|
+
)
|
672
|
+
|
396
673
|
if isinstance(figure, Tag):
|
397
674
|
caption_tag = figure.find("figcaption", recursive=False)
|
398
675
|
if isinstance(caption_tag, Tag):
|
399
|
-
caption =
|
400
|
-
|
401
|
-
|
676
|
+
caption = self._extract_text_and_hyperlink_recursively(
|
677
|
+
caption_tag, find_parent_annotation=True
|
678
|
+
)
|
679
|
+
if not caption and img_tag.get("alt"):
|
680
|
+
caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
|
681
|
+
|
682
|
+
caption_anno_text = caption.to_single_text_element()
|
402
683
|
|
403
684
|
caption_item: Optional[TextItem] = None
|
404
|
-
if
|
685
|
+
if caption_anno_text.text:
|
686
|
+
text_clean = HTMLDocumentBackend._clean_unicode(
|
687
|
+
caption_anno_text.text.strip()
|
688
|
+
)
|
689
|
+
print(caption_anno_text)
|
405
690
|
caption_item = doc.add_text(
|
406
|
-
DocItemLabel.CAPTION,
|
691
|
+
label=DocItemLabel.CAPTION,
|
692
|
+
text=text_clean,
|
693
|
+
orig=caption_anno_text.text,
|
694
|
+
content_layer=self.content_layer,
|
695
|
+
hyperlink=caption_anno_text.hyperlink,
|
407
696
|
)
|
408
697
|
|
409
698
|
doc.add_picture(
|
@@ -442,6 +731,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
442
731
|
|
443
732
|
return "".join(parts)
|
444
733
|
|
734
|
+
@staticmethod
|
735
|
+
def _clean_unicode(text: str) -> str:
|
736
|
+
"""Replace typical Unicode characters in HTML for text processing.
|
737
|
+
|
738
|
+
Several Unicode characters (e.g., non-printable or formatting) are typically
|
739
|
+
found in HTML but are worth replacing to sanitize text and ensure consistency
|
740
|
+
in text processing tasks.
|
741
|
+
|
742
|
+
Args:
|
743
|
+
text: The original text.
|
744
|
+
|
745
|
+
Returns:
|
746
|
+
The sanitized text without typical Unicode characters.
|
747
|
+
"""
|
748
|
+
replacements = {
|
749
|
+
"\u00a0": " ", # non-breaking space
|
750
|
+
"\u200b": "", # zero-width space
|
751
|
+
"\u200c": "", # zero-width non-joiner
|
752
|
+
"\u200d": "", # zero-width joiner
|
753
|
+
"\u2010": "-", # hyphen
|
754
|
+
"\u2011": "-", # non-breaking hyphen
|
755
|
+
"\u2012": "-", # dash
|
756
|
+
"\u2013": "-", # dash
|
757
|
+
"\u2014": "-", # dash
|
758
|
+
"\u2015": "-", # horizontal bar
|
759
|
+
"\u2018": "'", # left single quotation mark
|
760
|
+
"\u2019": "'", # right single quotation mark
|
761
|
+
"\u201c": '"', # left double quotation mark
|
762
|
+
"\u201d": '"', # right double quotation mark
|
763
|
+
"\u2026": "...", # ellipsis
|
764
|
+
"\u00ad": "", # soft hyphen
|
765
|
+
"\ufeff": "", # zero width non-break space
|
766
|
+
"\u202f": " ", # narrow non-break space
|
767
|
+
"\u2060": "", # word joiner
|
768
|
+
}
|
769
|
+
for raw, clean in replacements.items():
|
770
|
+
text = text.replace(raw, clean)
|
771
|
+
|
772
|
+
return text
|
773
|
+
|
445
774
|
@staticmethod
|
446
775
|
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
|
447
776
|
"""Extract colspan and rowspan values from a table cell tag.
|
@@ -454,9 +783,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
454
783
|
str(cell.get("colspan", "1")),
|
455
784
|
str(cell.get("rowspan", "1")),
|
456
785
|
)
|
786
|
+
|
787
|
+
def _extract_num(s: str) -> int:
|
788
|
+
if s and s[0].isnumeric():
|
789
|
+
match = re.search(r"\d+", s)
|
790
|
+
if match:
|
791
|
+
return int(match.group())
|
792
|
+
return 1
|
793
|
+
|
457
794
|
int_spans: tuple[int, int] = (
|
458
|
-
|
459
|
-
|
795
|
+
_extract_num(raw_spans[0]),
|
796
|
+
_extract_num(raw_spans[1]),
|
460
797
|
)
|
461
798
|
|
462
799
|
return int_spans
|