docling 2.44.0__tar.gz → 2.45.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. {docling-2.44.0 → docling-2.45.0}/PKG-INFO +1 -1
  2. {docling-2.44.0 → docling-2.45.0}/docling/backend/html_backend.py +349 -77
  3. docling-2.45.0/docling/backend/mets_gbs_backend.py +399 -0
  4. {docling-2.44.0 → docling-2.45.0}/docling/backend/pdf_backend.py +3 -3
  5. {docling-2.44.0 → docling-2.45.0}/docling/cli/main.py +10 -0
  6. {docling-2.44.0 → docling-2.45.0}/docling/datamodel/base_models.py +3 -0
  7. {docling-2.44.0 → docling-2.45.0}/docling/datamodel/document.py +26 -0
  8. {docling-2.44.0 → docling-2.45.0}/docling/datamodel/pipeline_options_vlm_model.py +8 -2
  9. {docling-2.44.0 → docling-2.45.0}/docling/document_converter.py +4 -0
  10. {docling-2.44.0 → docling-2.45.0}/docling/models/api_vlm_model.py +2 -5
  11. {docling-2.44.0 → docling-2.45.0}/docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
  12. {docling-2.44.0 → docling-2.45.0}/docling/models/vlm_models_inline/mlx_model.py +2 -4
  13. {docling-2.44.0 → docling-2.45.0}/docling/pipeline/base_pipeline.py +7 -4
  14. {docling-2.44.0 → docling-2.45.0}/docling.egg-info/PKG-INFO +1 -1
  15. {docling-2.44.0 → docling-2.45.0}/docling.egg-info/SOURCES.txt +2 -0
  16. {docling-2.44.0 → docling-2.45.0}/pyproject.toml +1 -1
  17. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_html.py +20 -0
  18. docling-2.45.0/tests/test_backend_mets_gbs.py +77 -0
  19. {docling-2.44.0 → docling-2.45.0}/LICENSE +0 -0
  20. {docling-2.44.0 → docling-2.45.0}/README.md +0 -0
  21. {docling-2.44.0 → docling-2.45.0}/docling/__init__.py +0 -0
  22. {docling-2.44.0 → docling-2.45.0}/docling/backend/__init__.py +0 -0
  23. {docling-2.44.0 → docling-2.45.0}/docling/backend/abstract_backend.py +0 -0
  24. {docling-2.44.0 → docling-2.45.0}/docling/backend/asciidoc_backend.py +0 -0
  25. {docling-2.44.0 → docling-2.45.0}/docling/backend/csv_backend.py +0 -0
  26. {docling-2.44.0 → docling-2.45.0}/docling/backend/docling_parse_backend.py +0 -0
  27. {docling-2.44.0 → docling-2.45.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  28. {docling-2.44.0 → docling-2.45.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  29. {docling-2.44.0 → docling-2.45.0}/docling/backend/docx/__init__.py +0 -0
  30. {docling-2.44.0 → docling-2.45.0}/docling/backend/docx/latex/__init__.py +0 -0
  31. {docling-2.44.0 → docling-2.45.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  32. {docling-2.44.0 → docling-2.45.0}/docling/backend/docx/latex/omml.py +0 -0
  33. {docling-2.44.0 → docling-2.45.0}/docling/backend/json/__init__.py +0 -0
  34. {docling-2.44.0 → docling-2.45.0}/docling/backend/json/docling_json_backend.py +0 -0
  35. {docling-2.44.0 → docling-2.45.0}/docling/backend/md_backend.py +0 -0
  36. {docling-2.44.0 → docling-2.45.0}/docling/backend/msexcel_backend.py +0 -0
  37. {docling-2.44.0 → docling-2.45.0}/docling/backend/mspowerpoint_backend.py +0 -0
  38. {docling-2.44.0 → docling-2.45.0}/docling/backend/msword_backend.py +0 -0
  39. {docling-2.44.0 → docling-2.45.0}/docling/backend/noop_backend.py +0 -0
  40. {docling-2.44.0 → docling-2.45.0}/docling/backend/pypdfium2_backend.py +0 -0
  41. {docling-2.44.0 → docling-2.45.0}/docling/backend/xml/__init__.py +0 -0
  42. {docling-2.44.0 → docling-2.45.0}/docling/backend/xml/jats_backend.py +0 -0
  43. {docling-2.44.0 → docling-2.45.0}/docling/backend/xml/uspto_backend.py +0 -0
  44. {docling-2.44.0 → docling-2.45.0}/docling/chunking/__init__.py +0 -0
  45. {docling-2.44.0 → docling-2.45.0}/docling/cli/__init__.py +0 -0
  46. {docling-2.44.0 → docling-2.45.0}/docling/cli/models.py +0 -0
  47. {docling-2.44.0 → docling-2.45.0}/docling/cli/tools.py +0 -0
  48. {docling-2.44.0 → docling-2.45.0}/docling/datamodel/__init__.py +0 -0
  49. {docling-2.44.0 → docling-2.45.0}/docling/datamodel/accelerator_options.py +0 -0
  50. {docling-2.44.0 → docling-2.45.0}/docling/datamodel/asr_model_specs.py +0 -0
  51. {docling-2.44.0 → docling-2.45.0}/docling/datamodel/layout_model_specs.py +0 -0
  52. {docling-2.44.0 → docling-2.45.0}/docling/datamodel/pipeline_options.py +0 -0
  53. {docling-2.44.0 → docling-2.45.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  54. {docling-2.44.0 → docling-2.45.0}/docling/datamodel/settings.py +0 -0
  55. {docling-2.44.0 → docling-2.45.0}/docling/datamodel/vlm_model_specs.py +0 -0
  56. {docling-2.44.0 → docling-2.45.0}/docling/exceptions.py +0 -0
  57. {docling-2.44.0 → docling-2.45.0}/docling/models/__init__.py +0 -0
  58. {docling-2.44.0 → docling-2.45.0}/docling/models/base_model.py +0 -0
  59. {docling-2.44.0 → docling-2.45.0}/docling/models/base_ocr_model.py +0 -0
  60. {docling-2.44.0 → docling-2.45.0}/docling/models/code_formula_model.py +0 -0
  61. {docling-2.44.0 → docling-2.45.0}/docling/models/document_picture_classifier.py +0 -0
  62. {docling-2.44.0 → docling-2.45.0}/docling/models/easyocr_model.py +0 -0
  63. {docling-2.44.0 → docling-2.45.0}/docling/models/factories/__init__.py +0 -0
  64. {docling-2.44.0 → docling-2.45.0}/docling/models/factories/base_factory.py +0 -0
  65. {docling-2.44.0 → docling-2.45.0}/docling/models/factories/ocr_factory.py +0 -0
  66. {docling-2.44.0 → docling-2.45.0}/docling/models/factories/picture_description_factory.py +0 -0
  67. {docling-2.44.0 → docling-2.45.0}/docling/models/layout_model.py +0 -0
  68. {docling-2.44.0 → docling-2.45.0}/docling/models/ocr_mac_model.py +0 -0
  69. {docling-2.44.0 → docling-2.45.0}/docling/models/page_assemble_model.py +0 -0
  70. {docling-2.44.0 → docling-2.45.0}/docling/models/page_preprocessing_model.py +0 -0
  71. {docling-2.44.0 → docling-2.45.0}/docling/models/picture_description_api_model.py +0 -0
  72. {docling-2.44.0 → docling-2.45.0}/docling/models/picture_description_base_model.py +0 -0
  73. {docling-2.44.0 → docling-2.45.0}/docling/models/picture_description_vlm_model.py +0 -0
  74. {docling-2.44.0 → docling-2.45.0}/docling/models/plugins/__init__.py +0 -0
  75. {docling-2.44.0 → docling-2.45.0}/docling/models/plugins/defaults.py +0 -0
  76. {docling-2.44.0 → docling-2.45.0}/docling/models/rapid_ocr_model.py +0 -0
  77. {docling-2.44.0 → docling-2.45.0}/docling/models/readingorder_model.py +0 -0
  78. {docling-2.44.0 → docling-2.45.0}/docling/models/table_structure_model.py +0 -0
  79. {docling-2.44.0 → docling-2.45.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  80. {docling-2.44.0 → docling-2.45.0}/docling/models/tesseract_ocr_model.py +0 -0
  81. {docling-2.44.0 → docling-2.45.0}/docling/models/utils/__init__.py +0 -0
  82. {docling-2.44.0 → docling-2.45.0}/docling/models/utils/hf_model_download.py +0 -0
  83. {docling-2.44.0 → docling-2.45.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  84. {docling-2.44.0 → docling-2.45.0}/docling/pipeline/__init__.py +0 -0
  85. {docling-2.44.0 → docling-2.45.0}/docling/pipeline/asr_pipeline.py +0 -0
  86. {docling-2.44.0 → docling-2.45.0}/docling/pipeline/simple_pipeline.py +0 -0
  87. {docling-2.44.0 → docling-2.45.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  88. {docling-2.44.0 → docling-2.45.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
  89. {docling-2.44.0 → docling-2.45.0}/docling/pipeline/vlm_pipeline.py +0 -0
  90. {docling-2.44.0 → docling-2.45.0}/docling/py.typed +0 -0
  91. {docling-2.44.0 → docling-2.45.0}/docling/utils/__init__.py +0 -0
  92. {docling-2.44.0 → docling-2.45.0}/docling/utils/accelerator_utils.py +0 -0
  93. {docling-2.44.0 → docling-2.45.0}/docling/utils/api_image_request.py +0 -0
  94. {docling-2.44.0 → docling-2.45.0}/docling/utils/export.py +0 -0
  95. {docling-2.44.0 → docling-2.45.0}/docling/utils/glm_utils.py +0 -0
  96. {docling-2.44.0 → docling-2.45.0}/docling/utils/layout_postprocessor.py +0 -0
  97. {docling-2.44.0 → docling-2.45.0}/docling/utils/locks.py +0 -0
  98. {docling-2.44.0 → docling-2.45.0}/docling/utils/model_downloader.py +0 -0
  99. {docling-2.44.0 → docling-2.45.0}/docling/utils/ocr_utils.py +0 -0
  100. {docling-2.44.0 → docling-2.45.0}/docling/utils/orientation.py +0 -0
  101. {docling-2.44.0 → docling-2.45.0}/docling/utils/profiling.py +0 -0
  102. {docling-2.44.0 → docling-2.45.0}/docling/utils/utils.py +0 -0
  103. {docling-2.44.0 → docling-2.45.0}/docling/utils/visualization.py +0 -0
  104. {docling-2.44.0 → docling-2.45.0}/docling.egg-info/dependency_links.txt +0 -0
  105. {docling-2.44.0 → docling-2.45.0}/docling.egg-info/entry_points.txt +0 -0
  106. {docling-2.44.0 → docling-2.45.0}/docling.egg-info/requires.txt +0 -0
  107. {docling-2.44.0 → docling-2.45.0}/docling.egg-info/top_level.txt +0 -0
  108. {docling-2.44.0 → docling-2.45.0}/setup.cfg +0 -0
  109. {docling-2.44.0 → docling-2.45.0}/tests/test_asr_pipeline.py +0 -0
  110. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_asciidoc.py +0 -0
  111. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_csv.py +0 -0
  112. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_docling_json.py +0 -0
  113. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_docling_parse.py +0 -0
  114. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_docling_parse_v2.py +0 -0
  115. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_docling_parse_v4.py +0 -0
  116. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_jats.py +0 -0
  117. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_markdown.py +0 -0
  118. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_msexcel.py +0 -0
  119. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_msword.py +0 -0
  120. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_patent_uspto.py +0 -0
  121. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_pdfium.py +0 -0
  122. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_pptx.py +0 -0
  123. {docling-2.44.0 → docling-2.45.0}/tests/test_backend_webp.py +0 -0
  124. {docling-2.44.0 → docling-2.45.0}/tests/test_cli.py +0 -0
  125. {docling-2.44.0 → docling-2.45.0}/tests/test_code_formula.py +0 -0
  126. {docling-2.44.0 → docling-2.45.0}/tests/test_data_gen_flag.py +0 -0
  127. {docling-2.44.0 → docling-2.45.0}/tests/test_document_picture_classifier.py +0 -0
  128. {docling-2.44.0 → docling-2.45.0}/tests/test_e2e_conversion.py +0 -0
  129. {docling-2.44.0 → docling-2.45.0}/tests/test_e2e_ocr_conversion.py +0 -0
  130. {docling-2.44.0 → docling-2.45.0}/tests/test_input_doc.py +0 -0
  131. {docling-2.44.0 → docling-2.45.0}/tests/test_interfaces.py +0 -0
  132. {docling-2.44.0 → docling-2.45.0}/tests/test_invalid_input.py +0 -0
  133. {docling-2.44.0 → docling-2.45.0}/tests/test_legacy_format_transform.py +0 -0
  134. {docling-2.44.0 → docling-2.45.0}/tests/test_ocr_utils.py +0 -0
  135. {docling-2.44.0 → docling-2.45.0}/tests/test_options.py +0 -0
  136. {docling-2.44.0 → docling-2.45.0}/tests/test_settings_load.py +0 -0
  137. {docling-2.44.0 → docling-2.45.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.44.0
3
+ Version: 2.45.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -1,8 +1,11 @@
1
1
  import logging
2
2
  import re
3
+ from contextlib import contextmanager
4
+ from copy import deepcopy
3
5
  from io import BytesIO
4
6
  from pathlib import Path
5
7
  from typing import Final, Optional, Union, cast
8
+ from urllib.parse import urljoin
6
9
 
7
10
  from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
8
11
  from bs4.element import PreformattedString
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
18
21
  TextItem,
19
22
  )
20
23
  from docling_core.types.doc.document import ContentLayer
21
- from pydantic import BaseModel
24
+ from pydantic import AnyUrl, BaseModel, ValidationError
22
25
  from typing_extensions import override
23
26
 
24
27
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -56,12 +59,76 @@ class _Context(BaseModel):
56
59
  list_start_by_ref: dict[str, int] = {}
57
60
 
58
61
 
62
+ class AnnotatedText(BaseModel):
63
+ text: str
64
+ hyperlink: Union[AnyUrl, Path, None] = None
65
+
66
+
67
+ class AnnotatedTextList(list):
68
+ def to_single_text_element(self) -> AnnotatedText:
69
+ current_h = None
70
+ current_text = ""
71
+ for at in self:
72
+ t = at.text
73
+ h = at.hyperlink
74
+ current_text += t.strip() + " "
75
+ if h is not None and current_h is None:
76
+ current_h = h
77
+ elif h is not None and current_h is not None and h != current_h:
78
+ _log.warning(
79
+ f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
80
+ )
81
+ return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
82
+
83
+ def simplify_text_elements(self) -> "AnnotatedTextList":
84
+ simplified = AnnotatedTextList()
85
+ if not self:
86
+ return self
87
+ text = self[0].text
88
+ hyperlink = self[0].hyperlink
89
+ last_elm = text
90
+ for i in range(1, len(self)):
91
+ if hyperlink == self[i].hyperlink:
92
+ sep = " "
93
+ if not self[i].text.strip() or not last_elm.strip():
94
+ sep = ""
95
+ text += sep + self[i].text
96
+ last_elm = self[i].text
97
+ else:
98
+ simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
99
+ text = self[i].text
100
+ last_elm = text
101
+ hyperlink = self[i].hyperlink
102
+ if text:
103
+ simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
104
+ return simplified
105
+
106
+ def split_by_newline(self):
107
+ super_list = []
108
+ active_annotated_text_list = AnnotatedTextList()
109
+ for el in self:
110
+ sub_texts = el.text.split("\n")
111
+ if len(sub_texts) == 1:
112
+ active_annotated_text_list.append(el)
113
+ else:
114
+ for text in sub_texts:
115
+ sub_el = deepcopy(el)
116
+ sub_el.text = text
117
+ active_annotated_text_list.append(sub_el)
118
+ super_list.append(active_annotated_text_list)
119
+ active_annotated_text_list = AnnotatedTextList()
120
+ if active_annotated_text_list:
121
+ super_list.append(active_annotated_text_list)
122
+ return super_list
123
+
124
+
59
125
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
60
126
  @override
61
127
  def __init__(
62
128
  self,
63
129
  in_doc: InputDocument,
64
130
  path_or_stream: Union[BytesIO, Path],
131
+ original_url: Optional[AnyUrl] = None,
65
132
  ):
66
133
  super().__init__(in_doc, path_or_stream)
67
134
  self.soup: Optional[Tag] = None
@@ -74,6 +141,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
74
141
  self.ctx = _Context()
75
142
  for i in range(self.max_levels):
76
143
  self.parents[i] = None
144
+ self.hyperlink = None
145
+ self.original_url = original_url
77
146
 
78
147
  try:
79
148
  raw = (
@@ -160,26 +229,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
160
229
  element: The XML tag to parse.
161
230
  doc: The Docling document to be updated with the parsed content.
162
231
  """
163
- buffer: list[str] = []
232
+ buffer: AnnotatedTextList = AnnotatedTextList()
164
233
 
165
234
  def flush_buffer():
166
235
  if not buffer:
167
236
  return
168
- text = "".join(buffer).strip()
237
+ annotated_text_list = buffer.simplify_text_elements()
238
+ parts = annotated_text_list.split_by_newline()
169
239
  buffer.clear()
170
- if not text:
240
+
241
+ if not "".join([el.text for el in annotated_text_list]):
171
242
  return
172
- for part in text.split("\n"):
173
- seg = part.strip()
174
- seg_clean = HTMLDocumentBackend._clean_unicode(seg)
175
- if seg:
176
- doc.add_text(
177
- label=DocItemLabel.TEXT,
178
- text=seg_clean,
179
- orig=seg,
180
- parent=self.parents[self.level],
181
- content_layer=self.content_layer,
182
- )
243
+
244
+ for annotated_text_list in parts:
245
+ with self.use_inline_group(annotated_text_list, doc):
246
+ for annotated_text in annotated_text_list:
247
+ if annotated_text.text.strip():
248
+ seg_clean = HTMLDocumentBackend._clean_unicode(
249
+ annotated_text.text.strip()
250
+ )
251
+ doc.add_text(
252
+ parent=self.parents[self.level],
253
+ label=DocItemLabel.TEXT,
254
+ text=seg_clean,
255
+ content_layer=self.content_layer,
256
+ hyperlink=annotated_text.hyperlink,
257
+ )
183
258
 
184
259
  for node in element.contents:
185
260
  if isinstance(node, Tag):
@@ -187,6 +262,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
187
262
  if name == "img":
188
263
  flush_buffer()
189
264
  self._emit_image(node, doc)
265
+ elif name == "a":
266
+ with self.use_hyperlink(node):
267
+ self._walk(node, doc)
190
268
  elif name in _BLOCK_TAGS:
191
269
  flush_buffer()
192
270
  self._handle_block(node, doc)
@@ -194,28 +272,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
194
272
  flush_buffer()
195
273
  self._walk(node, doc)
196
274
  else:
197
- buffer.append(node.text)
275
+ buffer.extend(
276
+ self._extract_text_and_hyperlink_recursively(
277
+ node, find_parent_annotation=True, keep_newlines=True
278
+ )
279
+ )
198
280
  elif isinstance(node, NavigableString) and not isinstance(
199
281
  node, PreformattedString
200
282
  ):
201
- buffer.append(str(node))
283
+ if str(node).strip("\n\r") == "":
284
+ flush_buffer()
285
+ else:
286
+ buffer.extend(
287
+ self._extract_text_and_hyperlink_recursively(
288
+ node, find_parent_annotation=True, keep_newlines=True
289
+ )
290
+ )
202
291
 
203
292
  flush_buffer()
204
293
 
294
+ def _extract_text_and_hyperlink_recursively(
295
+ self,
296
+ item: PageElement,
297
+ ignore_list=False,
298
+ find_parent_annotation=False,
299
+ keep_newlines=False,
300
+ ) -> AnnotatedTextList:
301
+ result: AnnotatedTextList = AnnotatedTextList()
302
+
303
+ # If find_parent_annotation, make sure that we keep track of
304
+ # any a-tag that has been present in the DOM-parents already.
305
+ if find_parent_annotation:
306
+ this_parent = item.parent
307
+ while this_parent is not None:
308
+ if this_parent.name == "a" and this_parent.get("href"):
309
+ with self.use_hyperlink(this_parent):
310
+ return self._extract_text_and_hyperlink_recursively(
311
+ item, ignore_list
312
+ )
313
+ this_parent = this_parent.parent
314
+
315
+ if isinstance(item, PreformattedString):
316
+ return AnnotatedTextList()
317
+
318
+ if isinstance(item, NavigableString):
319
+ text = item.strip()
320
+ if text:
321
+ return AnnotatedTextList(
322
+ [AnnotatedText(text=text, hyperlink=self.hyperlink)]
323
+ )
324
+ if keep_newlines and item.strip("\n\r") == "":
325
+ return AnnotatedTextList(
326
+ [AnnotatedText(text="\n", hyperlink=self.hyperlink)]
327
+ )
328
+ return AnnotatedTextList()
329
+
330
+ tag = cast(Tag, item)
331
+ if not ignore_list or (tag.name not in ["ul", "ol"]):
332
+ for child in tag:
333
+ if isinstance(child, Tag) and child.name == "a":
334
+ with self.use_hyperlink(child):
335
+ result.extend(
336
+ self._extract_text_and_hyperlink_recursively(
337
+ child, ignore_list, keep_newlines=keep_newlines
338
+ )
339
+ )
340
+ else:
341
+ # Recursively get the child's text content
342
+ result.extend(
343
+ self._extract_text_and_hyperlink_recursively(
344
+ child, ignore_list, keep_newlines=keep_newlines
345
+ )
346
+ )
347
+ return result
348
+
349
+ @contextmanager
350
+ def use_hyperlink(self, tag):
351
+ this_href = tag.get("href")
352
+ if this_href is None:
353
+ yield None
354
+ else:
355
+ if this_href:
356
+ old_hyperlink = self.hyperlink
357
+ if self.original_url is not None:
358
+ this_href = urljoin(self.original_url, this_href)
359
+ # ugly fix for relative links since pydantic does not support them.
360
+ try:
361
+ AnyUrl(this_href)
362
+ except ValidationError:
363
+ this_href = Path(this_href)
364
+ self.hyperlink = this_href
365
+ try:
366
+ yield None
367
+ finally:
368
+ if this_href:
369
+ self.hyperlink = old_hyperlink
370
+
371
+ @contextmanager
372
+ def use_inline_group(
373
+ self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
374
+ ):
375
+ """Create an inline group for annotated texts.
376
+
377
+ Checks if annotated_text_list has more than one item and if so creates an inline
378
+ group in which the text elements can then be generated. While the context manager
379
+ is active the inline group is set as the current parent.
380
+
381
+ Args:
382
+ annotated_text_list (AnnotatedTextList): Annotated text
383
+ doc (DoclingDocument): Currently used document
384
+
385
+ Yields:
386
+ None: _description_
387
+ """
388
+ if len(annotated_text_list) > 1:
389
+ inline_fmt = doc.add_group(
390
+ label=GroupLabel.INLINE,
391
+ parent=self.parents[self.level],
392
+ content_layer=self.content_layer,
393
+ )
394
+ self.parents[self.level + 1] = inline_fmt
395
+ self.level += 1
396
+ try:
397
+ yield None
398
+ finally:
399
+ self.parents[self.level] = None
400
+ self.level -= 1
401
+ else:
402
+ yield None
403
+
205
404
  def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
206
405
  tag_name = tag.name.lower()
207
406
  # set default content layer to BODY as soon as we encounter a heading
208
407
  self.content_layer = ContentLayer.BODY
209
408
  level = int(tag_name[1])
210
- text = tag.get_text(strip=True, separator=" ")
211
- text_clean = HTMLDocumentBackend._clean_unicode(text)
409
+ annotated_text_list = self._extract_text_and_hyperlink_recursively(
410
+ tag, find_parent_annotation=True
411
+ )
412
+ annotated_text = annotated_text_list.to_single_text_element()
413
+ text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
212
414
  # the first level is for the title item
213
415
  if level == 1:
214
416
  for key in self.parents.keys():
215
417
  self.parents[key] = None
216
418
  self.level = 0
217
419
  self.parents[self.level + 1] = doc.add_title(
218
- text=text_clean, orig=text, content_layer=self.content_layer
420
+ text_clean,
421
+ content_layer=self.content_layer,
422
+ hyperlink=annotated_text.hyperlink,
219
423
  )
220
424
  # the other levels need to be lowered by 1 if a title was set
221
425
  else:
@@ -241,9 +445,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
241
445
  self.parents[self.level + 1] = doc.add_heading(
242
446
  parent=self.parents[self.level],
243
447
  text=text_clean,
244
- orig=text,
448
+ orig=annotated_text.text,
245
449
  level=self.level,
246
450
  content_layer=self.content_layer,
451
+ hyperlink=annotated_text.hyperlink,
247
452
  )
248
453
  self.level += 1
249
454
  for img_tag in tag("img"):
@@ -292,37 +497,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
292
497
  marker = ""
293
498
 
294
499
  # 2) extract only the "direct" text from this <li>
295
- parts: list[str] = []
296
- for child in li.contents:
297
- if isinstance(child, NavigableString) and not isinstance(
298
- child, PreformattedString
299
- ):
300
- parts.append(child)
301
- elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
302
- text_part = HTMLDocumentBackend.get_text(child)
303
- if text_part:
304
- parts.append(text_part)
305
- li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
306
- li_clean = HTMLDocumentBackend._clean_unicode(li_text)
500
+ parts = self._extract_text_and_hyperlink_recursively(
501
+ li, ignore_list=True, find_parent_annotation=True
502
+ )
503
+ min_parts = parts.simplify_text_elements()
504
+ li_text = re.sub(
505
+ r"\s+|\n+", " ", "".join([el.text for el in min_parts])
506
+ ).strip()
307
507
 
308
508
  # 3) add the list item
309
509
  if li_text:
310
- self.parents[self.level + 1] = doc.add_list_item(
311
- text=li_clean,
312
- enumerated=is_ordered,
313
- marker=marker,
314
- orig=li_text,
315
- parent=list_group,
316
- content_layer=self.content_layer,
317
- )
318
-
319
- # 4) recurse into any nested lists, attaching them to this <li> item
320
- for sublist in li({"ul", "ol"}, recursive=False):
321
- if isinstance(sublist, Tag):
322
- self.level += 1
323
- self._handle_block(sublist, doc)
324
- self.parents[self.level + 1] = None
325
- self.level -= 1
510
+ if len(min_parts) > 1:
511
+ # create an empty list element in order to hook the inline group onto that one
512
+ self.parents[self.level + 1] = doc.add_list_item(
513
+ text="",
514
+ enumerated=is_ordered,
515
+ marker=marker,
516
+ parent=list_group,
517
+ content_layer=self.content_layer,
518
+ )
519
+ self.level += 1
520
+ with self.use_inline_group(min_parts, doc):
521
+ for annotated_text in min_parts:
522
+ li_text = re.sub(
523
+ r"\s+|\n+", " ", annotated_text.text
524
+ ).strip()
525
+ li_clean = HTMLDocumentBackend._clean_unicode(li_text)
526
+ doc.add_text(
527
+ parent=self.parents[self.level],
528
+ label=DocItemLabel.TEXT,
529
+ text=li_clean,
530
+ content_layer=self.content_layer,
531
+ hyperlink=annotated_text.hyperlink,
532
+ )
533
+
534
+ # 4) recurse into any nested lists, attaching them to this <li> item
535
+ for sublist in li({"ul", "ol"}, recursive=False):
536
+ if isinstance(sublist, Tag):
537
+ self._handle_block(sublist, doc)
538
+
539
+ # now the list element with inline group is not a parent anymore
540
+ self.parents[self.level] = None
541
+ self.level -= 1
542
+ else:
543
+ annotated_text = min_parts[0]
544
+ li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
545
+ li_clean = HTMLDocumentBackend._clean_unicode(li_text)
546
+ self.parents[self.level + 1] = doc.add_list_item(
547
+ text=li_clean,
548
+ enumerated=is_ordered,
549
+ marker=marker,
550
+ orig=li_text,
551
+ parent=list_group,
552
+ content_layer=self.content_layer,
553
+ hyperlink=annotated_text.hyperlink,
554
+ )
555
+
556
+ # 4) recurse into any nested lists, attaching them to this <li> item
557
+ for sublist in li({"ul", "ol"}, recursive=False):
558
+ if isinstance(sublist, Tag):
559
+ self.level += 1
560
+ self._handle_block(sublist, doc)
561
+ self.parents[self.level + 1] = None
562
+ self.level -= 1
326
563
  else:
327
564
  for sublist in li({"ul", "ol"}, recursive=False):
328
565
  if isinstance(sublist, Tag):
@@ -351,17 +588,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
351
588
  self._handle_list(tag, doc)
352
589
 
353
590
  elif tag_name in {"p", "address", "summary"}:
354
- for part in tag.text.split("\n"):
355
- seg = part.strip()
356
- seg_clean = HTMLDocumentBackend._clean_unicode(seg)
357
- if seg:
358
- doc.add_text(
359
- label=DocItemLabel.TEXT,
360
- text=seg_clean,
361
- orig=seg,
362
- parent=self.parents[self.level],
363
- content_layer=self.content_layer,
364
- )
591
+ text_list = self._extract_text_and_hyperlink_recursively(
592
+ tag, find_parent_annotation=True
593
+ )
594
+ annotated_texts = text_list.simplify_text_elements()
595
+ for part in annotated_texts.split_by_newline():
596
+ with self.use_inline_group(part, doc):
597
+ for annotated_text in part:
598
+ if seg := annotated_text.text.strip():
599
+ seg_clean = HTMLDocumentBackend._clean_unicode(seg)
600
+ doc.add_text(
601
+ parent=self.parents[self.level],
602
+ label=DocItemLabel.TEXT,
603
+ text=seg_clean,
604
+ content_layer=self.content_layer,
605
+ hyperlink=annotated_text.hyperlink,
606
+ )
607
+
365
608
  for img_tag in tag("img"):
366
609
  if isinstance(img_tag, Tag):
367
610
  self._emit_image(img_tag, doc)
@@ -380,15 +623,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
380
623
 
381
624
  elif tag_name in {"pre", "code"}:
382
625
  # handle monospace code snippets (pre).
383
- text = tag.get_text(strip=True)
384
- text_clean = HTMLDocumentBackend._clean_unicode(text)
385
- if text:
386
- doc.add_code(
387
- parent=self.parents[self.level],
388
- text=text_clean,
389
- orig=text,
390
- content_layer=self.content_layer,
391
- )
626
+ text_list = self._extract_text_and_hyperlink_recursively(
627
+ tag, find_parent_annotation=True
628
+ )
629
+ annotated_texts = text_list.simplify_text_elements()
630
+ with self.use_inline_group(annotated_texts, doc):
631
+ for annotated_text in annotated_texts:
632
+ text_clean = HTMLDocumentBackend._clean_unicode(
633
+ annotated_text.text.strip()
634
+ )
635
+ doc.add_code(
636
+ parent=self.parents[self.level],
637
+ text=text_clean,
638
+ content_layer=self.content_layer,
639
+ hyperlink=annotated_text.hyperlink,
640
+ )
392
641
 
393
642
  elif tag_name == "details":
394
643
  # handle details and its content.
@@ -405,22 +654,45 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
405
654
 
406
655
  def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
407
656
  figure = img_tag.find_parent("figure")
408
- caption: str = ""
657
+ caption: AnnotatedTextList = AnnotatedTextList()
658
+
659
+ # check if the figure has a link - this is HACK:
660
+ def get_img_hyperlink(img_tag):
661
+ this_parent = img_tag.parent
662
+ while this_parent is not None:
663
+ if this_parent.name == "a" and this_parent.get("href"):
664
+ return this_parent.get("href")
665
+ this_parent = this_parent.parent
666
+ return None
667
+
668
+ if img_hyperlink := get_img_hyperlink(img_tag):
669
+ caption.append(
670
+ AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
671
+ )
672
+
409
673
  if isinstance(figure, Tag):
410
674
  caption_tag = figure.find("figcaption", recursive=False)
411
675
  if isinstance(caption_tag, Tag):
412
- caption = caption_tag.get_text()
413
- if not caption:
414
- caption = str(img_tag.get("alt", "")).strip()
676
+ caption = self._extract_text_and_hyperlink_recursively(
677
+ caption_tag, find_parent_annotation=True
678
+ )
679
+ if not caption and img_tag.get("alt"):
680
+ caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
681
+
682
+ caption_anno_text = caption.to_single_text_element()
415
683
 
416
684
  caption_item: Optional[TextItem] = None
417
- if caption:
418
- caption_clean = HTMLDocumentBackend._clean_unicode(caption)
685
+ if caption_anno_text.text:
686
+ text_clean = HTMLDocumentBackend._clean_unicode(
687
+ caption_anno_text.text.strip()
688
+ )
689
+ print(caption_anno_text)
419
690
  caption_item = doc.add_text(
420
691
  label=DocItemLabel.CAPTION,
421
- text=caption_clean,
422
- orig=caption,
692
+ text=text_clean,
693
+ orig=caption_anno_text.text,
423
694
  content_layer=self.content_layer,
695
+ hyperlink=caption_anno_text.hyperlink,
424
696
  )
425
697
 
426
698
  doc.add_picture(