docling 2.44.0__py3-none-any.whl → 2.46.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,11 @@
1
1
  import logging
2
2
  import re
3
+ from contextlib import contextmanager
4
+ from copy import deepcopy
3
5
  from io import BytesIO
4
6
  from pathlib import Path
5
7
  from typing import Final, Optional, Union, cast
8
+ from urllib.parse import urljoin
6
9
 
7
10
  from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
8
11
  from bs4.element import PreformattedString
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
18
21
  TextItem,
19
22
  )
20
23
  from docling_core.types.doc.document import ContentLayer
21
- from pydantic import BaseModel
24
+ from pydantic import AnyUrl, BaseModel, ValidationError
22
25
  from typing_extensions import override
23
26
 
24
27
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -35,6 +38,7 @@ _BLOCK_TAGS: Final = {
35
38
  "address",
36
39
  "details",
37
40
  "figure",
41
+ "footer",
38
42
  "h1",
39
43
  "h2",
40
44
  "h3",
@@ -56,12 +60,76 @@ class _Context(BaseModel):
56
60
  list_start_by_ref: dict[str, int] = {}
57
61
 
58
62
 
63
class AnnotatedText(BaseModel):
    """A piece of text paired with an optional hyperlink target."""

    # The text content of this fragment.
    text: str
    # Link target attached to the text, either a URL or a path; defaults to None.
    hyperlink: Optional[Union[AnyUrl, Path]] = None
66
+
67
+
68
class AnnotatedTextList(list):
    """A list of AnnotatedText fragments with helpers to merge and split them."""

    def to_single_text_element(self) -> "AnnotatedText":
        """Collapse all fragments into one AnnotatedText.

        The stripped fragment texts are joined with single spaces. The first
        non-None hyperlink encountered is kept; any later, different hyperlink
        only triggers a warning and is otherwise ignored.

        Returns:
            A single AnnotatedText carrying the joined text and chosen hyperlink.
        """
        current_h = None
        pieces = []
        for fragment in self:
            pieces.append(fragment.text.strip())
            h = fragment.hyperlink
            if h is None:
                continue
            if current_h is None:
                current_h = h
            elif h != current_h:
                _log.warning(
                    f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
                )
        return AnnotatedText(text=" ".join(pieces).strip(), hyperlink=current_h)

    def simplify_text_elements(self) -> "AnnotatedTextList":
        """Merge runs of consecutive fragments that share the same hyperlink.

        Texts inside a run are joined with a space, except when either the
        incoming fragment or the one before it is whitespace-only, in which
        case they are concatenated directly.

        Returns:
            A new AnnotatedTextList with one fragment per run. An empty list
            returns itself, and a trailing run with empty text is dropped.
        """
        if not self:
            return self

        merged = AnnotatedTextList()
        fragments = iter(self)
        head = next(fragments)
        run_text, run_link = head.text, head.hyperlink
        previous_text = head.text
        for fragment in fragments:
            if run_link == fragment.hyperlink:
                both_visible = fragment.text.strip() and previous_text.strip()
                run_text += (" " if both_visible else "") + fragment.text
            else:
                # The hyperlink changed: close the current run and start a new one.
                merged.append(AnnotatedText(text=run_text, hyperlink=run_link))
                run_text, run_link = fragment.text, fragment.hyperlink
            previous_text = fragment.text

        if run_text:
            merged.append(AnnotatedText(text=run_text, hyperlink=run_link))
        return merged

    def split_by_newline(self):
        """Partition the fragments into sub-lists at newline characters.

        A fragment without a newline is appended (the same object) to the
        currently open sub-list. A fragment containing newlines is split on
        them; every resulting piece becomes a deep copy of the fragment with
        the piece as its text, and each piece closes the sub-list it is
        appended to.

        Returns:
            A plain list of AnnotatedTextList objects.
        """
        groups = []
        open_group = AnnotatedTextList()
        for fragment in self:
            if "\n" not in fragment.text:
                open_group.append(fragment)
                continue
            for piece in fragment.text.split("\n"):
                clone = deepcopy(fragment)
                clone.text = piece
                open_group.append(clone)
                groups.append(open_group)
                open_group = AnnotatedTextList()
        if open_group:
            groups.append(open_group)
        return groups
124
+
125
+
59
126
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
60
127
  @override
61
128
  def __init__(
62
129
  self,
63
130
  in_doc: InputDocument,
64
131
  path_or_stream: Union[BytesIO, Path],
132
+ original_url: Optional[AnyUrl] = None,
65
133
  ):
66
134
  super().__init__(in_doc, path_or_stream)
67
135
  self.soup: Optional[Tag] = None
@@ -74,6 +142,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
74
142
  self.ctx = _Context()
75
143
  for i in range(self.max_levels):
76
144
  self.parents[i] = None
145
+ self.hyperlink = None
146
+ self.original_url = original_url
77
147
 
78
148
  try:
79
149
  raw = (
@@ -160,26 +230,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
160
230
  element: The XML tag to parse.
161
231
  doc: The Docling document to be updated with the parsed content.
162
232
  """
163
- buffer: list[str] = []
233
+ buffer: AnnotatedTextList = AnnotatedTextList()
164
234
 
165
235
  def flush_buffer():
166
236
  if not buffer:
167
237
  return
168
- text = "".join(buffer).strip()
238
+ annotated_text_list = buffer.simplify_text_elements()
239
+ parts = annotated_text_list.split_by_newline()
169
240
  buffer.clear()
170
- if not text:
241
+
242
+ if not "".join([el.text for el in annotated_text_list]):
171
243
  return
172
- for part in text.split("\n"):
173
- seg = part.strip()
174
- seg_clean = HTMLDocumentBackend._clean_unicode(seg)
175
- if seg:
176
- doc.add_text(
177
- label=DocItemLabel.TEXT,
178
- text=seg_clean,
179
- orig=seg,
180
- parent=self.parents[self.level],
181
- content_layer=self.content_layer,
182
- )
244
+
245
+ for annotated_text_list in parts:
246
+ with self.use_inline_group(annotated_text_list, doc):
247
+ for annotated_text in annotated_text_list:
248
+ if annotated_text.text.strip():
249
+ seg_clean = HTMLDocumentBackend._clean_unicode(
250
+ annotated_text.text.strip()
251
+ )
252
+ doc.add_text(
253
+ parent=self.parents[self.level],
254
+ label=DocItemLabel.TEXT,
255
+ text=seg_clean,
256
+ content_layer=self.content_layer,
257
+ hyperlink=annotated_text.hyperlink,
258
+ )
183
259
 
184
260
  for node in element.contents:
185
261
  if isinstance(node, Tag):
@@ -187,6 +263,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
187
263
  if name == "img":
188
264
  flush_buffer()
189
265
  self._emit_image(node, doc)
266
+ elif name == "a":
267
+ with self.use_hyperlink(node):
268
+ self._walk(node, doc)
190
269
  elif name in _BLOCK_TAGS:
191
270
  flush_buffer()
192
271
  self._handle_block(node, doc)
@@ -194,28 +273,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
194
273
  flush_buffer()
195
274
  self._walk(node, doc)
196
275
  else:
197
- buffer.append(node.text)
276
+ buffer.extend(
277
+ self._extract_text_and_hyperlink_recursively(
278
+ node, find_parent_annotation=True, keep_newlines=True
279
+ )
280
+ )
198
281
  elif isinstance(node, NavigableString) and not isinstance(
199
282
  node, PreformattedString
200
283
  ):
201
- buffer.append(str(node))
284
+ if str(node).strip("\n\r") == "":
285
+ flush_buffer()
286
+ else:
287
+ buffer.extend(
288
+ self._extract_text_and_hyperlink_recursively(
289
+ node, find_parent_annotation=True, keep_newlines=True
290
+ )
291
+ )
202
292
 
203
293
  flush_buffer()
204
294
 
295
def _extract_text_and_hyperlink_recursively(
    self,
    item: PageElement,
    ignore_list=False,
    find_parent_annotation=False,
    keep_newlines=False,
) -> AnnotatedTextList:
    """Collect the text under ``item`` together with the hyperlink in effect.

    Args:
        item: The node to extract text from.
        ignore_list: If True, the children of ``ul``/``ol`` tags are skipped.
        find_parent_annotation: If True, the ancestors of ``item`` are searched
            for an ``a`` tag with a non-empty ``href``; when one is found, the
            extraction runs with that hyperlink active.
        keep_newlines: If True, a string made up solely of newline characters
            is emitted as a fragment with text ``"\n"`` instead of being dropped.

    Returns:
        The extracted fragments in document order. Preformatted strings and
        whitespace-only strings (subject to ``keep_newlines``) yield nothing.
    """
    # If find_parent_annotation, make sure that we keep track of
    # any a-tag that has been present in the DOM-parents already.
    if find_parent_annotation:
        this_parent = item.parent
        while this_parent is not None:
            if this_parent.name == "a" and this_parent.get("href"):
                with self.use_hyperlink(this_parent):
                    # Forward keep_newlines as well, so linked content is
                    # treated the same way as content outside of a link.
                    return self._extract_text_and_hyperlink_recursively(
                        item, ignore_list, keep_newlines=keep_newlines
                    )
            this_parent = this_parent.parent

    if isinstance(item, PreformattedString):
        return AnnotatedTextList()

    if isinstance(item, NavigableString):
        text = item.strip()
        if text:
            return AnnotatedTextList(
                [AnnotatedText(text=text, hyperlink=self.hyperlink)]
            )
        # Whitespace-only string: keep it as a newline marker only when it
        # consists of nothing but newline characters.
        if keep_newlines and item.strip("\n\r") == "":
            return AnnotatedTextList(
                [AnnotatedText(text="\n", hyperlink=self.hyperlink)]
            )
        return AnnotatedTextList()

    result: AnnotatedTextList = AnnotatedTextList()
    tag = cast(Tag, item)
    if ignore_list and tag.name in ["ul", "ol"]:
        return result

    for child in tag:
        if isinstance(child, Tag) and child.name == "a":
            # Text below an anchor carries that anchor's hyperlink.
            with self.use_hyperlink(child):
                result.extend(
                    self._extract_text_and_hyperlink_recursively(
                        child, ignore_list, keep_newlines=keep_newlines
                    )
                )
        else:
            # Recursively get the child's text content
            result.extend(
                self._extract_text_and_hyperlink_recursively(
                    child, ignore_list, keep_newlines=keep_newlines
                )
            )
    return result
349
+
350
@contextmanager
def use_hyperlink(self, tag):
    """Make the ``href`` of ``tag`` the active hyperlink inside the ``with`` body.

    If ``tag`` has no ``href`` or an empty one, ``self.hyperlink`` is left
    untouched. Otherwise the href is resolved against ``self.original_url``
    when that is set, stored in ``self.hyperlink``, and the previous value is
    restored on exit, also when the body raises.

    Args:
        tag: Object exposing ``get("href")``, e.g. an ``a`` tag.

    Yields:
        None
    """
    this_href = tag.get("href")
    if not this_href:
        # Nothing to activate, hence nothing to restore afterwards.
        yield None
        return

    old_hyperlink = self.hyperlink
    if self.original_url is not None:
        # urljoin raises TypeError when str and non-str arguments are mixed,
        # so the AnyUrl base (and the href) are coerced to str first.
        this_href = urljoin(str(self.original_url), str(this_href))
    # ugly fix for relative links since pydantic does not support them.
    try:
        AnyUrl(this_href)
    except ValidationError:
        this_href = Path(this_href)
    self.hyperlink = this_href
    try:
        yield None
    finally:
        self.hyperlink = old_hyperlink
371
+
372
@contextmanager
def use_inline_group(
    self, annotated_text_list: "AnnotatedTextList", doc: "DoclingDocument"
):
    """Open an inline group for the duration of the ``with`` body, if needed.

    When ``annotated_text_list`` holds more than one item, an inline group is
    added to ``doc`` under the current parent, registered as the parent of the
    next level, and the level is increased by one. On exit the registration is
    cleared and the level decreased again, also when the body raises. With
    zero or one item nothing is created or changed.

    Args:
        annotated_text_list (AnnotatedTextList): Fragments about to be emitted.
        doc (DoclingDocument): Currently used document.

    Yields:
        None
    """
    if len(annotated_text_list) <= 1:
        # A single fragment needs no grouping.
        yield None
        return

    group = doc.add_group(
        label=GroupLabel.INLINE,
        parent=self.parents[self.level],
        content_layer=self.content_layer,
    )
    child_level = self.level + 1
    self.parents[child_level] = group
    self.level = child_level
    try:
        yield None
    finally:
        self.parents[self.level] = None
        self.level -= 1
404
+
205
405
  def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
206
406
  tag_name = tag.name.lower()
207
407
  # set default content layer to BODY as soon as we encounter a heading
208
408
  self.content_layer = ContentLayer.BODY
209
409
  level = int(tag_name[1])
210
- text = tag.get_text(strip=True, separator=" ")
211
- text_clean = HTMLDocumentBackend._clean_unicode(text)
410
+ annotated_text_list = self._extract_text_and_hyperlink_recursively(
411
+ tag, find_parent_annotation=True
412
+ )
413
+ annotated_text = annotated_text_list.to_single_text_element()
414
+ text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
212
415
  # the first level is for the title item
213
416
  if level == 1:
214
417
  for key in self.parents.keys():
215
418
  self.parents[key] = None
216
419
  self.level = 0
217
420
  self.parents[self.level + 1] = doc.add_title(
218
- text=text_clean, orig=text, content_layer=self.content_layer
421
+ text_clean,
422
+ content_layer=self.content_layer,
423
+ hyperlink=annotated_text.hyperlink,
219
424
  )
220
425
  # the other levels need to be lowered by 1 if a title was set
221
426
  else:
@@ -241,9 +446,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
241
446
  self.parents[self.level + 1] = doc.add_heading(
242
447
  parent=self.parents[self.level],
243
448
  text=text_clean,
244
- orig=text,
449
+ orig=annotated_text.text,
245
450
  level=self.level,
246
451
  content_layer=self.content_layer,
452
+ hyperlink=annotated_text.hyperlink,
247
453
  )
248
454
  self.level += 1
249
455
  for img_tag in tag("img"):
@@ -292,37 +498,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
292
498
  marker = ""
293
499
 
294
500
  # 2) extract only the "direct" text from this <li>
295
- parts: list[str] = []
296
- for child in li.contents:
297
- if isinstance(child, NavigableString) and not isinstance(
298
- child, PreformattedString
299
- ):
300
- parts.append(child)
301
- elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
302
- text_part = HTMLDocumentBackend.get_text(child)
303
- if text_part:
304
- parts.append(text_part)
305
- li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
306
- li_clean = HTMLDocumentBackend._clean_unicode(li_text)
501
+ parts = self._extract_text_and_hyperlink_recursively(
502
+ li, ignore_list=True, find_parent_annotation=True
503
+ )
504
+ min_parts = parts.simplify_text_elements()
505
+ li_text = re.sub(
506
+ r"\s+|\n+", " ", "".join([el.text for el in min_parts])
507
+ ).strip()
307
508
 
308
509
  # 3) add the list item
309
510
  if li_text:
310
- self.parents[self.level + 1] = doc.add_list_item(
311
- text=li_clean,
312
- enumerated=is_ordered,
313
- marker=marker,
314
- orig=li_text,
315
- parent=list_group,
316
- content_layer=self.content_layer,
317
- )
318
-
319
- # 4) recurse into any nested lists, attaching them to this <li> item
320
- for sublist in li({"ul", "ol"}, recursive=False):
321
- if isinstance(sublist, Tag):
322
- self.level += 1
323
- self._handle_block(sublist, doc)
324
- self.parents[self.level + 1] = None
325
- self.level -= 1
511
+ if len(min_parts) > 1:
512
+ # create an empty list element in order to hook the inline group onto that one
513
+ self.parents[self.level + 1] = doc.add_list_item(
514
+ text="",
515
+ enumerated=is_ordered,
516
+ marker=marker,
517
+ parent=list_group,
518
+ content_layer=self.content_layer,
519
+ )
520
+ self.level += 1
521
+ with self.use_inline_group(min_parts, doc):
522
+ for annotated_text in min_parts:
523
+ li_text = re.sub(
524
+ r"\s+|\n+", " ", annotated_text.text
525
+ ).strip()
526
+ li_clean = HTMLDocumentBackend._clean_unicode(li_text)
527
+ doc.add_text(
528
+ parent=self.parents[self.level],
529
+ label=DocItemLabel.TEXT,
530
+ text=li_clean,
531
+ content_layer=self.content_layer,
532
+ hyperlink=annotated_text.hyperlink,
533
+ )
534
+
535
+ # 4) recurse into any nested lists, attaching them to this <li> item
536
+ for sublist in li({"ul", "ol"}, recursive=False):
537
+ if isinstance(sublist, Tag):
538
+ self._handle_block(sublist, doc)
539
+
540
+ # now the list element with inline group is not a parent anymore
541
+ self.parents[self.level] = None
542
+ self.level -= 1
543
+ else:
544
+ annotated_text = min_parts[0]
545
+ li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
546
+ li_clean = HTMLDocumentBackend._clean_unicode(li_text)
547
+ self.parents[self.level + 1] = doc.add_list_item(
548
+ text=li_clean,
549
+ enumerated=is_ordered,
550
+ marker=marker,
551
+ orig=li_text,
552
+ parent=list_group,
553
+ content_layer=self.content_layer,
554
+ hyperlink=annotated_text.hyperlink,
555
+ )
556
+
557
+ # 4) recurse into any nested lists, attaching them to this <li> item
558
+ for sublist in li({"ul", "ol"}, recursive=False):
559
+ if isinstance(sublist, Tag):
560
+ self.level += 1
561
+ self._handle_block(sublist, doc)
562
+ self.parents[self.level + 1] = None
563
+ self.level -= 1
326
564
  else:
327
565
  for sublist in li({"ul", "ol"}, recursive=False):
328
566
  if isinstance(sublist, Tag):
@@ -351,17 +589,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
351
589
  self._handle_list(tag, doc)
352
590
 
353
591
  elif tag_name in {"p", "address", "summary"}:
354
- for part in tag.text.split("\n"):
355
- seg = part.strip()
356
- seg_clean = HTMLDocumentBackend._clean_unicode(seg)
357
- if seg:
358
- doc.add_text(
359
- label=DocItemLabel.TEXT,
360
- text=seg_clean,
361
- orig=seg,
362
- parent=self.parents[self.level],
363
- content_layer=self.content_layer,
364
- )
592
+ text_list = self._extract_text_and_hyperlink_recursively(
593
+ tag, find_parent_annotation=True
594
+ )
595
+ annotated_texts = text_list.simplify_text_elements()
596
+ for part in annotated_texts.split_by_newline():
597
+ with self.use_inline_group(part, doc):
598
+ for annotated_text in part:
599
+ if seg := annotated_text.text.strip():
600
+ seg_clean = HTMLDocumentBackend._clean_unicode(seg)
601
+ doc.add_text(
602
+ parent=self.parents[self.level],
603
+ label=DocItemLabel.TEXT,
604
+ text=seg_clean,
605
+ content_layer=self.content_layer,
606
+ hyperlink=annotated_text.hyperlink,
607
+ )
608
+
365
609
  for img_tag in tag("img"):
366
610
  if isinstance(img_tag, Tag):
367
611
  self._emit_image(img_tag, doc)
@@ -380,20 +624,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
380
624
 
381
625
  elif tag_name in {"pre", "code"}:
382
626
  # handle monospace code snippets (pre).
383
- text = tag.get_text(strip=True)
384
- text_clean = HTMLDocumentBackend._clean_unicode(text)
385
- if text:
386
- doc.add_code(
387
- parent=self.parents[self.level],
388
- text=text_clean,
389
- orig=text,
390
- content_layer=self.content_layer,
391
- )
627
+ text_list = self._extract_text_and_hyperlink_recursively(
628
+ tag, find_parent_annotation=True
629
+ )
630
+ annotated_texts = text_list.simplify_text_elements()
631
+ with self.use_inline_group(annotated_texts, doc):
632
+ for annotated_text in annotated_texts:
633
+ text_clean = HTMLDocumentBackend._clean_unicode(
634
+ annotated_text.text.strip()
635
+ )
636
+ doc.add_code(
637
+ parent=self.parents[self.level],
638
+ text=text_clean,
639
+ content_layer=self.content_layer,
640
+ hyperlink=annotated_text.hyperlink,
641
+ )
392
642
 
393
- elif tag_name == "details":
394
- # handle details and its content.
643
+ elif tag_name in {"details", "footer"}:
644
+ if tag_name == "footer":
645
+ current_layer = self.content_layer
646
+ self.content_layer = ContentLayer.FURNITURE
395
647
  self.parents[self.level + 1] = doc.add_group(
396
- name="details",
648
+ name=tag_name,
397
649
  label=GroupLabel.SECTION,
398
650
  parent=self.parents[self.level],
399
651
  content_layer=self.content_layer,
@@ -402,25 +654,49 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
402
654
  self._walk(tag, doc)
403
655
  self.parents[self.level + 1] = None
404
656
  self.level -= 1
657
+ if tag_name == "footer":
658
+ self.content_layer = current_layer
405
659
 
406
660
  def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
407
661
  figure = img_tag.find_parent("figure")
408
- caption: str = ""
662
+ caption: AnnotatedTextList = AnnotatedTextList()
663
+
664
+ # check if the figure has a link - this is HACK:
665
+ def get_img_hyperlink(img_tag):
666
+ this_parent = img_tag.parent
667
+ while this_parent is not None:
668
+ if this_parent.name == "a" and this_parent.get("href"):
669
+ return this_parent.get("href")
670
+ this_parent = this_parent.parent
671
+ return None
672
+
673
+ if img_hyperlink := get_img_hyperlink(img_tag):
674
+ caption.append(
675
+ AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
676
+ )
677
+
409
678
  if isinstance(figure, Tag):
410
679
  caption_tag = figure.find("figcaption", recursive=False)
411
680
  if isinstance(caption_tag, Tag):
412
- caption = caption_tag.get_text()
413
- if not caption:
414
- caption = str(img_tag.get("alt", "")).strip()
681
+ caption = self._extract_text_and_hyperlink_recursively(
682
+ caption_tag, find_parent_annotation=True
683
+ )
684
+ if not caption and img_tag.get("alt"):
685
+ caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
686
+
687
+ caption_anno_text = caption.to_single_text_element()
415
688
 
416
689
  caption_item: Optional[TextItem] = None
417
- if caption:
418
- caption_clean = HTMLDocumentBackend._clean_unicode(caption)
690
+ if caption_anno_text.text:
691
+ text_clean = HTMLDocumentBackend._clean_unicode(
692
+ caption_anno_text.text.strip()
693
+ )
419
694
  caption_item = doc.add_text(
420
695
  label=DocItemLabel.CAPTION,
421
- text=caption_clean,
422
- orig=caption,
696
+ text=text_clean,
697
+ orig=caption_anno_text.text,
423
698
  content_layer=self.content_layer,
699
+ hyperlink=caption_anno_text.hyperlink,
424
700
  )
425
701
 
426
702
  doc.add_picture(