docling 2.43.0__tar.gz → 2.45.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. {docling-2.43.0 → docling-2.45.0}/PKG-INFO +2 -2
  2. {docling-2.43.0 → docling-2.45.0}/docling/backend/html_backend.py +406 -69
  3. docling-2.45.0/docling/backend/mets_gbs_backend.py +399 -0
  4. {docling-2.43.0 → docling-2.45.0}/docling/backend/pdf_backend.py +3 -3
  5. {docling-2.43.0 → docling-2.45.0}/docling/cli/main.py +16 -0
  6. {docling-2.43.0 → docling-2.45.0}/docling/datamodel/base_models.py +3 -0
  7. {docling-2.43.0 → docling-2.45.0}/docling/datamodel/document.py +26 -0
  8. {docling-2.43.0 → docling-2.45.0}/docling/datamodel/pipeline_options_vlm_model.py +8 -2
  9. {docling-2.43.0 → docling-2.45.0}/docling/document_converter.py +34 -0
  10. {docling-2.43.0 → docling-2.45.0}/docling/models/api_vlm_model.py +2 -5
  11. {docling-2.43.0 → docling-2.45.0}/docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
  12. {docling-2.43.0 → docling-2.45.0}/docling/models/vlm_models_inline/mlx_model.py +4 -6
  13. {docling-2.43.0 → docling-2.45.0}/docling/pipeline/base_pipeline.py +7 -4
  14. {docling-2.43.0 → docling-2.45.0}/docling.egg-info/PKG-INFO +2 -2
  15. {docling-2.43.0 → docling-2.45.0}/docling.egg-info/SOURCES.txt +2 -0
  16. {docling-2.43.0 → docling-2.45.0}/docling.egg-info/requires.txt +1 -1
  17. {docling-2.43.0 → docling-2.45.0}/pyproject.toml +2 -2
  18. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_html.py +36 -0
  19. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_markdown.py +52 -1
  20. docling-2.45.0/tests/test_backend_mets_gbs.py +77 -0
  21. {docling-2.43.0 → docling-2.45.0}/LICENSE +0 -0
  22. {docling-2.43.0 → docling-2.45.0}/README.md +0 -0
  23. {docling-2.43.0 → docling-2.45.0}/docling/__init__.py +0 -0
  24. {docling-2.43.0 → docling-2.45.0}/docling/backend/__init__.py +0 -0
  25. {docling-2.43.0 → docling-2.45.0}/docling/backend/abstract_backend.py +0 -0
  26. {docling-2.43.0 → docling-2.45.0}/docling/backend/asciidoc_backend.py +0 -0
  27. {docling-2.43.0 → docling-2.45.0}/docling/backend/csv_backend.py +0 -0
  28. {docling-2.43.0 → docling-2.45.0}/docling/backend/docling_parse_backend.py +0 -0
  29. {docling-2.43.0 → docling-2.45.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  30. {docling-2.43.0 → docling-2.45.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  31. {docling-2.43.0 → docling-2.45.0}/docling/backend/docx/__init__.py +0 -0
  32. {docling-2.43.0 → docling-2.45.0}/docling/backend/docx/latex/__init__.py +0 -0
  33. {docling-2.43.0 → docling-2.45.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  34. {docling-2.43.0 → docling-2.45.0}/docling/backend/docx/latex/omml.py +0 -0
  35. {docling-2.43.0 → docling-2.45.0}/docling/backend/json/__init__.py +0 -0
  36. {docling-2.43.0 → docling-2.45.0}/docling/backend/json/docling_json_backend.py +0 -0
  37. {docling-2.43.0 → docling-2.45.0}/docling/backend/md_backend.py +0 -0
  38. {docling-2.43.0 → docling-2.45.0}/docling/backend/msexcel_backend.py +0 -0
  39. {docling-2.43.0 → docling-2.45.0}/docling/backend/mspowerpoint_backend.py +0 -0
  40. {docling-2.43.0 → docling-2.45.0}/docling/backend/msword_backend.py +0 -0
  41. {docling-2.43.0 → docling-2.45.0}/docling/backend/noop_backend.py +0 -0
  42. {docling-2.43.0 → docling-2.45.0}/docling/backend/pypdfium2_backend.py +0 -0
  43. {docling-2.43.0 → docling-2.45.0}/docling/backend/xml/__init__.py +0 -0
  44. {docling-2.43.0 → docling-2.45.0}/docling/backend/xml/jats_backend.py +0 -0
  45. {docling-2.43.0 → docling-2.45.0}/docling/backend/xml/uspto_backend.py +0 -0
  46. {docling-2.43.0 → docling-2.45.0}/docling/chunking/__init__.py +0 -0
  47. {docling-2.43.0 → docling-2.45.0}/docling/cli/__init__.py +0 -0
  48. {docling-2.43.0 → docling-2.45.0}/docling/cli/models.py +0 -0
  49. {docling-2.43.0 → docling-2.45.0}/docling/cli/tools.py +0 -0
  50. {docling-2.43.0 → docling-2.45.0}/docling/datamodel/__init__.py +0 -0
  51. {docling-2.43.0 → docling-2.45.0}/docling/datamodel/accelerator_options.py +0 -0
  52. {docling-2.43.0 → docling-2.45.0}/docling/datamodel/asr_model_specs.py +0 -0
  53. {docling-2.43.0 → docling-2.45.0}/docling/datamodel/layout_model_specs.py +0 -0
  54. {docling-2.43.0 → docling-2.45.0}/docling/datamodel/pipeline_options.py +0 -0
  55. {docling-2.43.0 → docling-2.45.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  56. {docling-2.43.0 → docling-2.45.0}/docling/datamodel/settings.py +0 -0
  57. {docling-2.43.0 → docling-2.45.0}/docling/datamodel/vlm_model_specs.py +0 -0
  58. {docling-2.43.0 → docling-2.45.0}/docling/exceptions.py +0 -0
  59. {docling-2.43.0 → docling-2.45.0}/docling/models/__init__.py +0 -0
  60. {docling-2.43.0 → docling-2.45.0}/docling/models/base_model.py +0 -0
  61. {docling-2.43.0 → docling-2.45.0}/docling/models/base_ocr_model.py +0 -0
  62. {docling-2.43.0 → docling-2.45.0}/docling/models/code_formula_model.py +0 -0
  63. {docling-2.43.0 → docling-2.45.0}/docling/models/document_picture_classifier.py +0 -0
  64. {docling-2.43.0 → docling-2.45.0}/docling/models/easyocr_model.py +0 -0
  65. {docling-2.43.0 → docling-2.45.0}/docling/models/factories/__init__.py +0 -0
  66. {docling-2.43.0 → docling-2.45.0}/docling/models/factories/base_factory.py +0 -0
  67. {docling-2.43.0 → docling-2.45.0}/docling/models/factories/ocr_factory.py +0 -0
  68. {docling-2.43.0 → docling-2.45.0}/docling/models/factories/picture_description_factory.py +0 -0
  69. {docling-2.43.0 → docling-2.45.0}/docling/models/layout_model.py +0 -0
  70. {docling-2.43.0 → docling-2.45.0}/docling/models/ocr_mac_model.py +0 -0
  71. {docling-2.43.0 → docling-2.45.0}/docling/models/page_assemble_model.py +0 -0
  72. {docling-2.43.0 → docling-2.45.0}/docling/models/page_preprocessing_model.py +0 -0
  73. {docling-2.43.0 → docling-2.45.0}/docling/models/picture_description_api_model.py +0 -0
  74. {docling-2.43.0 → docling-2.45.0}/docling/models/picture_description_base_model.py +0 -0
  75. {docling-2.43.0 → docling-2.45.0}/docling/models/picture_description_vlm_model.py +0 -0
  76. {docling-2.43.0 → docling-2.45.0}/docling/models/plugins/__init__.py +0 -0
  77. {docling-2.43.0 → docling-2.45.0}/docling/models/plugins/defaults.py +0 -0
  78. {docling-2.43.0 → docling-2.45.0}/docling/models/rapid_ocr_model.py +0 -0
  79. {docling-2.43.0 → docling-2.45.0}/docling/models/readingorder_model.py +0 -0
  80. {docling-2.43.0 → docling-2.45.0}/docling/models/table_structure_model.py +0 -0
  81. {docling-2.43.0 → docling-2.45.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  82. {docling-2.43.0 → docling-2.45.0}/docling/models/tesseract_ocr_model.py +0 -0
  83. {docling-2.43.0 → docling-2.45.0}/docling/models/utils/__init__.py +0 -0
  84. {docling-2.43.0 → docling-2.45.0}/docling/models/utils/hf_model_download.py +0 -0
  85. {docling-2.43.0 → docling-2.45.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  86. {docling-2.43.0 → docling-2.45.0}/docling/pipeline/__init__.py +0 -0
  87. {docling-2.43.0 → docling-2.45.0}/docling/pipeline/asr_pipeline.py +0 -0
  88. {docling-2.43.0 → docling-2.45.0}/docling/pipeline/simple_pipeline.py +0 -0
  89. {docling-2.43.0 → docling-2.45.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  90. {docling-2.43.0 → docling-2.45.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
  91. {docling-2.43.0 → docling-2.45.0}/docling/pipeline/vlm_pipeline.py +0 -0
  92. {docling-2.43.0 → docling-2.45.0}/docling/py.typed +0 -0
  93. {docling-2.43.0 → docling-2.45.0}/docling/utils/__init__.py +0 -0
  94. {docling-2.43.0 → docling-2.45.0}/docling/utils/accelerator_utils.py +0 -0
  95. {docling-2.43.0 → docling-2.45.0}/docling/utils/api_image_request.py +0 -0
  96. {docling-2.43.0 → docling-2.45.0}/docling/utils/export.py +0 -0
  97. {docling-2.43.0 → docling-2.45.0}/docling/utils/glm_utils.py +0 -0
  98. {docling-2.43.0 → docling-2.45.0}/docling/utils/layout_postprocessor.py +0 -0
  99. {docling-2.43.0 → docling-2.45.0}/docling/utils/locks.py +0 -0
  100. {docling-2.43.0 → docling-2.45.0}/docling/utils/model_downloader.py +0 -0
  101. {docling-2.43.0 → docling-2.45.0}/docling/utils/ocr_utils.py +0 -0
  102. {docling-2.43.0 → docling-2.45.0}/docling/utils/orientation.py +0 -0
  103. {docling-2.43.0 → docling-2.45.0}/docling/utils/profiling.py +0 -0
  104. {docling-2.43.0 → docling-2.45.0}/docling/utils/utils.py +0 -0
  105. {docling-2.43.0 → docling-2.45.0}/docling/utils/visualization.py +0 -0
  106. {docling-2.43.0 → docling-2.45.0}/docling.egg-info/dependency_links.txt +0 -0
  107. {docling-2.43.0 → docling-2.45.0}/docling.egg-info/entry_points.txt +0 -0
  108. {docling-2.43.0 → docling-2.45.0}/docling.egg-info/top_level.txt +0 -0
  109. {docling-2.43.0 → docling-2.45.0}/setup.cfg +0 -0
  110. {docling-2.43.0 → docling-2.45.0}/tests/test_asr_pipeline.py +0 -0
  111. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_asciidoc.py +0 -0
  112. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_csv.py +0 -0
  113. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_docling_json.py +0 -0
  114. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_docling_parse.py +0 -0
  115. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_docling_parse_v2.py +0 -0
  116. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_docling_parse_v4.py +0 -0
  117. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_jats.py +0 -0
  118. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_msexcel.py +0 -0
  119. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_msword.py +0 -0
  120. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_patent_uspto.py +0 -0
  121. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_pdfium.py +0 -0
  122. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_pptx.py +0 -0
  123. {docling-2.43.0 → docling-2.45.0}/tests/test_backend_webp.py +0 -0
  124. {docling-2.43.0 → docling-2.45.0}/tests/test_cli.py +0 -0
  125. {docling-2.43.0 → docling-2.45.0}/tests/test_code_formula.py +0 -0
  126. {docling-2.43.0 → docling-2.45.0}/tests/test_data_gen_flag.py +0 -0
  127. {docling-2.43.0 → docling-2.45.0}/tests/test_document_picture_classifier.py +0 -0
  128. {docling-2.43.0 → docling-2.45.0}/tests/test_e2e_conversion.py +0 -0
  129. {docling-2.43.0 → docling-2.45.0}/tests/test_e2e_ocr_conversion.py +0 -0
  130. {docling-2.43.0 → docling-2.45.0}/tests/test_input_doc.py +0 -0
  131. {docling-2.43.0 → docling-2.45.0}/tests/test_interfaces.py +0 -0
  132. {docling-2.43.0 → docling-2.45.0}/tests/test_invalid_input.py +0 -0
  133. {docling-2.43.0 → docling-2.45.0}/tests/test_legacy_format_transform.py +0 -0
  134. {docling-2.43.0 → docling-2.45.0}/tests/test_ocr_utils.py +0 -0
  135. {docling-2.43.0 → docling-2.45.0}/tests/test_options.py +0 -0
  136. {docling-2.43.0 → docling-2.45.0}/tests/test_settings_load.py +0 -0
  137. {docling-2.43.0 → docling-2.45.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.43.0
3
+ Version: 2.45.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -58,7 +58,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
58
58
  Provides-Extra: vlm
59
59
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
60
60
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
61
- Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
61
+ Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
62
62
  Provides-Extra: rapidocr
63
63
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
64
64
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
@@ -1,8 +1,11 @@
1
1
  import logging
2
2
  import re
3
+ from contextlib import contextmanager
4
+ from copy import deepcopy
3
5
  from io import BytesIO
4
6
  from pathlib import Path
5
7
  from typing import Final, Optional, Union, cast
8
+ from urllib.parse import urljoin
6
9
 
7
10
  from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
8
11
  from bs4.element import PreformattedString
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
18
21
  TextItem,
19
22
  )
20
23
  from docling_core.types.doc.document import ContentLayer
21
- from pydantic import BaseModel
24
+ from pydantic import AnyUrl, BaseModel, ValidationError
22
25
  from typing_extensions import override
23
26
 
24
27
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -56,12 +59,76 @@ class _Context(BaseModel):
56
59
  list_start_by_ref: dict[str, int] = {}
57
60
 
58
61
 
62
+ class AnnotatedText(BaseModel):
63
+ text: str
64
+ hyperlink: Union[AnyUrl, Path, None] = None
65
+
66
+
67
+ class AnnotatedTextList(list):
68
+ def to_single_text_element(self) -> AnnotatedText:
69
+ current_h = None
70
+ current_text = ""
71
+ for at in self:
72
+ t = at.text
73
+ h = at.hyperlink
74
+ current_text += t.strip() + " "
75
+ if h is not None and current_h is None:
76
+ current_h = h
77
+ elif h is not None and current_h is not None and h != current_h:
78
+ _log.warning(
79
+ f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
80
+ )
81
+ return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
82
+
83
+ def simplify_text_elements(self) -> "AnnotatedTextList":
84
+ simplified = AnnotatedTextList()
85
+ if not self:
86
+ return self
87
+ text = self[0].text
88
+ hyperlink = self[0].hyperlink
89
+ last_elm = text
90
+ for i in range(1, len(self)):
91
+ if hyperlink == self[i].hyperlink:
92
+ sep = " "
93
+ if not self[i].text.strip() or not last_elm.strip():
94
+ sep = ""
95
+ text += sep + self[i].text
96
+ last_elm = self[i].text
97
+ else:
98
+ simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
99
+ text = self[i].text
100
+ last_elm = text
101
+ hyperlink = self[i].hyperlink
102
+ if text:
103
+ simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
104
+ return simplified
105
+
106
+ def split_by_newline(self):
107
+ super_list = []
108
+ active_annotated_text_list = AnnotatedTextList()
109
+ for el in self:
110
+ sub_texts = el.text.split("\n")
111
+ if len(sub_texts) == 1:
112
+ active_annotated_text_list.append(el)
113
+ else:
114
+ for text in sub_texts:
115
+ sub_el = deepcopy(el)
116
+ sub_el.text = text
117
+ active_annotated_text_list.append(sub_el)
118
+ super_list.append(active_annotated_text_list)
119
+ active_annotated_text_list = AnnotatedTextList()
120
+ if active_annotated_text_list:
121
+ super_list.append(active_annotated_text_list)
122
+ return super_list
123
+
124
+
59
125
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
60
126
  @override
61
127
  def __init__(
62
128
  self,
63
129
  in_doc: InputDocument,
64
130
  path_or_stream: Union[BytesIO, Path],
131
+ original_url: Optional[AnyUrl] = None,
65
132
  ):
66
133
  super().__init__(in_doc, path_or_stream)
67
134
  self.soup: Optional[Tag] = None
@@ -74,6 +141,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
74
141
  self.ctx = _Context()
75
142
  for i in range(self.max_levels):
76
143
  self.parents[i] = None
144
+ self.hyperlink = None
145
+ self.original_url = original_url
77
146
 
78
147
  try:
79
148
  raw = (
@@ -125,8 +194,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
125
194
  # set the title as furniture, since it is part of the document metadata
126
195
  title = self.soup.title
127
196
  if title:
197
+ title_text = title.get_text(separator=" ", strip=True)
198
+ title_clean = HTMLDocumentBackend._clean_unicode(title_text)
128
199
  doc.add_title(
129
- text=title.get_text(separator=" ", strip=True),
200
+ text=title_clean,
201
+ orig=title_text,
130
202
  content_layer=ContentLayer.FURNITURE,
131
203
  )
132
204
  # remove scripts/styles
@@ -157,24 +229,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
157
229
  element: The XML tag to parse.
158
230
  doc: The Docling document to be updated with the parsed content.
159
231
  """
160
- buffer: list[str] = []
232
+ buffer: AnnotatedTextList = AnnotatedTextList()
161
233
 
162
234
  def flush_buffer():
163
235
  if not buffer:
164
236
  return
165
- text = "".join(buffer).strip()
237
+ annotated_text_list = buffer.simplify_text_elements()
238
+ parts = annotated_text_list.split_by_newline()
166
239
  buffer.clear()
167
- if not text:
240
+
241
+ if not "".join([el.text for el in annotated_text_list]):
168
242
  return
169
- for part in text.split("\n"):
170
- seg = part.strip()
171
- if seg:
172
- doc.add_text(
173
- DocItemLabel.TEXT,
174
- seg,
175
- parent=self.parents[self.level],
176
- content_layer=self.content_layer,
177
- )
243
+
244
+ for annotated_text_list in parts:
245
+ with self.use_inline_group(annotated_text_list, doc):
246
+ for annotated_text in annotated_text_list:
247
+ if annotated_text.text.strip():
248
+ seg_clean = HTMLDocumentBackend._clean_unicode(
249
+ annotated_text.text.strip()
250
+ )
251
+ doc.add_text(
252
+ parent=self.parents[self.level],
253
+ label=DocItemLabel.TEXT,
254
+ text=seg_clean,
255
+ content_layer=self.content_layer,
256
+ hyperlink=annotated_text.hyperlink,
257
+ )
178
258
 
179
259
  for node in element.contents:
180
260
  if isinstance(node, Tag):
@@ -182,6 +262,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
182
262
  if name == "img":
183
263
  flush_buffer()
184
264
  self._emit_image(node, doc)
265
+ elif name == "a":
266
+ with self.use_hyperlink(node):
267
+ self._walk(node, doc)
185
268
  elif name in _BLOCK_TAGS:
186
269
  flush_buffer()
187
270
  self._handle_block(node, doc)
@@ -189,27 +272,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
189
272
  flush_buffer()
190
273
  self._walk(node, doc)
191
274
  else:
192
- buffer.append(node.text)
275
+ buffer.extend(
276
+ self._extract_text_and_hyperlink_recursively(
277
+ node, find_parent_annotation=True, keep_newlines=True
278
+ )
279
+ )
193
280
  elif isinstance(node, NavigableString) and not isinstance(
194
281
  node, PreformattedString
195
282
  ):
196
- buffer.append(str(node))
283
+ if str(node).strip("\n\r") == "":
284
+ flush_buffer()
285
+ else:
286
+ buffer.extend(
287
+ self._extract_text_and_hyperlink_recursively(
288
+ node, find_parent_annotation=True, keep_newlines=True
289
+ )
290
+ )
197
291
 
198
292
  flush_buffer()
199
293
 
294
+ def _extract_text_and_hyperlink_recursively(
295
+ self,
296
+ item: PageElement,
297
+ ignore_list=False,
298
+ find_parent_annotation=False,
299
+ keep_newlines=False,
300
+ ) -> AnnotatedTextList:
301
+ result: AnnotatedTextList = AnnotatedTextList()
302
+
303
+ # If find_parent_annotation, make sure that we keep track of
304
+ # any a-tag that has been present in the DOM-parents already.
305
+ if find_parent_annotation:
306
+ this_parent = item.parent
307
+ while this_parent is not None:
308
+ if this_parent.name == "a" and this_parent.get("href"):
309
+ with self.use_hyperlink(this_parent):
310
+ return self._extract_text_and_hyperlink_recursively(
311
+ item, ignore_list
312
+ )
313
+ this_parent = this_parent.parent
314
+
315
+ if isinstance(item, PreformattedString):
316
+ return AnnotatedTextList()
317
+
318
+ if isinstance(item, NavigableString):
319
+ text = item.strip()
320
+ if text:
321
+ return AnnotatedTextList(
322
+ [AnnotatedText(text=text, hyperlink=self.hyperlink)]
323
+ )
324
+ if keep_newlines and item.strip("\n\r") == "":
325
+ return AnnotatedTextList(
326
+ [AnnotatedText(text="\n", hyperlink=self.hyperlink)]
327
+ )
328
+ return AnnotatedTextList()
329
+
330
+ tag = cast(Tag, item)
331
+ if not ignore_list or (tag.name not in ["ul", "ol"]):
332
+ for child in tag:
333
+ if isinstance(child, Tag) and child.name == "a":
334
+ with self.use_hyperlink(child):
335
+ result.extend(
336
+ self._extract_text_and_hyperlink_recursively(
337
+ child, ignore_list, keep_newlines=keep_newlines
338
+ )
339
+ )
340
+ else:
341
+ # Recursively get the child's text content
342
+ result.extend(
343
+ self._extract_text_and_hyperlink_recursively(
344
+ child, ignore_list, keep_newlines=keep_newlines
345
+ )
346
+ )
347
+ return result
348
+
349
+ @contextmanager
350
+ def use_hyperlink(self, tag):
351
+ this_href = tag.get("href")
352
+ if this_href is None:
353
+ yield None
354
+ else:
355
+ if this_href:
356
+ old_hyperlink = self.hyperlink
357
+ if self.original_url is not None:
358
+ this_href = urljoin(self.original_url, this_href)
359
+ # ugly fix for relative links since pydantic does not support them.
360
+ try:
361
+ AnyUrl(this_href)
362
+ except ValidationError:
363
+ this_href = Path(this_href)
364
+ self.hyperlink = this_href
365
+ try:
366
+ yield None
367
+ finally:
368
+ if this_href:
369
+ self.hyperlink = old_hyperlink
370
+
371
+ @contextmanager
372
+ def use_inline_group(
373
+ self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
374
+ ):
375
+ """Create an inline group for annotated texts.
376
+
377
+ Checks if annotated_text_list has more than one item and if so creates an inline
378
+ group in which the text elements can then be generated. While the context manager
379
+ is active the inline group is set as the current parent.
380
+
381
+ Args:
382
+ annotated_text_list (AnnotatedTextList): Annotated text
383
+ doc (DoclingDocument): Currently used document
384
+
385
+ Yields:
386
+ None: _description_
387
+ """
388
+ if len(annotated_text_list) > 1:
389
+ inline_fmt = doc.add_group(
390
+ label=GroupLabel.INLINE,
391
+ parent=self.parents[self.level],
392
+ content_layer=self.content_layer,
393
+ )
394
+ self.parents[self.level + 1] = inline_fmt
395
+ self.level += 1
396
+ try:
397
+ yield None
398
+ finally:
399
+ self.parents[self.level] = None
400
+ self.level -= 1
401
+ else:
402
+ yield None
403
+
200
404
  def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
201
405
  tag_name = tag.name.lower()
202
406
  # set default content layer to BODY as soon as we encounter a heading
203
407
  self.content_layer = ContentLayer.BODY
204
408
  level = int(tag_name[1])
205
- text = tag.get_text(strip=True, separator=" ")
409
+ annotated_text_list = self._extract_text_and_hyperlink_recursively(
410
+ tag, find_parent_annotation=True
411
+ )
412
+ annotated_text = annotated_text_list.to_single_text_element()
413
+ text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
206
414
  # the first level is for the title item
207
415
  if level == 1:
208
416
  for key in self.parents.keys():
209
417
  self.parents[key] = None
210
418
  self.level = 0
211
419
  self.parents[self.level + 1] = doc.add_title(
212
- text, content_layer=self.content_layer
420
+ text_clean,
421
+ content_layer=self.content_layer,
422
+ hyperlink=annotated_text.hyperlink,
213
423
  )
214
424
  # the other levels need to be lowered by 1 if a title was set
215
425
  else:
@@ -234,9 +444,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
234
444
  self.level = level
235
445
  self.parents[self.level + 1] = doc.add_heading(
236
446
  parent=self.parents[self.level],
237
- text=text,
447
+ text=text_clean,
448
+ orig=annotated_text.text,
238
449
  level=self.level,
239
450
  content_layer=self.content_layer,
451
+ hyperlink=annotated_text.hyperlink,
240
452
  )
241
453
  self.level += 1
242
454
  for img_tag in tag("img"):
@@ -285,35 +497,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
285
497
  marker = ""
286
498
 
287
499
  # 2) extract only the "direct" text from this <li>
288
- parts: list[str] = []
289
- for child in li.contents:
290
- if isinstance(child, NavigableString) and not isinstance(
291
- child, PreformattedString
292
- ):
293
- parts.append(child)
294
- elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
295
- text_part = HTMLDocumentBackend.get_text(child)
296
- if text_part:
297
- parts.append(text_part)
298
- li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
500
+ parts = self._extract_text_and_hyperlink_recursively(
501
+ li, ignore_list=True, find_parent_annotation=True
502
+ )
503
+ min_parts = parts.simplify_text_elements()
504
+ li_text = re.sub(
505
+ r"\s+|\n+", " ", "".join([el.text for el in min_parts])
506
+ ).strip()
299
507
 
300
508
  # 3) add the list item
301
509
  if li_text:
302
- self.parents[self.level + 1] = doc.add_list_item(
303
- text=li_text,
304
- enumerated=is_ordered,
305
- marker=marker,
306
- parent=list_group,
307
- content_layer=self.content_layer,
308
- )
309
-
310
- # 4) recurse into any nested lists, attaching them to this <li> item
311
- for sublist in li({"ul", "ol"}, recursive=False):
312
- if isinstance(sublist, Tag):
313
- self.level += 1
314
- self._handle_block(sublist, doc)
315
- self.parents[self.level + 1] = None
316
- self.level -= 1
510
+ if len(min_parts) > 1:
511
+ # create an empty list element in order to hook the inline group onto that one
512
+ self.parents[self.level + 1] = doc.add_list_item(
513
+ text="",
514
+ enumerated=is_ordered,
515
+ marker=marker,
516
+ parent=list_group,
517
+ content_layer=self.content_layer,
518
+ )
519
+ self.level += 1
520
+ with self.use_inline_group(min_parts, doc):
521
+ for annotated_text in min_parts:
522
+ li_text = re.sub(
523
+ r"\s+|\n+", " ", annotated_text.text
524
+ ).strip()
525
+ li_clean = HTMLDocumentBackend._clean_unicode(li_text)
526
+ doc.add_text(
527
+ parent=self.parents[self.level],
528
+ label=DocItemLabel.TEXT,
529
+ text=li_clean,
530
+ content_layer=self.content_layer,
531
+ hyperlink=annotated_text.hyperlink,
532
+ )
533
+
534
+ # 4) recurse into any nested lists, attaching them to this <li> item
535
+ for sublist in li({"ul", "ol"}, recursive=False):
536
+ if isinstance(sublist, Tag):
537
+ self._handle_block(sublist, doc)
538
+
539
+ # now the list element with inline group is not a parent anymore
540
+ self.parents[self.level] = None
541
+ self.level -= 1
542
+ else:
543
+ annotated_text = min_parts[0]
544
+ li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
545
+ li_clean = HTMLDocumentBackend._clean_unicode(li_text)
546
+ self.parents[self.level + 1] = doc.add_list_item(
547
+ text=li_clean,
548
+ enumerated=is_ordered,
549
+ marker=marker,
550
+ orig=li_text,
551
+ parent=list_group,
552
+ content_layer=self.content_layer,
553
+ hyperlink=annotated_text.hyperlink,
554
+ )
555
+
556
+ # 4) recurse into any nested lists, attaching them to this <li> item
557
+ for sublist in li({"ul", "ol"}, recursive=False):
558
+ if isinstance(sublist, Tag):
559
+ self.level += 1
560
+ self._handle_block(sublist, doc)
561
+ self.parents[self.level + 1] = None
562
+ self.level -= 1
317
563
  else:
318
564
  for sublist in li({"ul", "ol"}, recursive=False):
319
565
  if isinstance(sublist, Tag):
@@ -342,15 +588,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
342
588
  self._handle_list(tag, doc)
343
589
 
344
590
  elif tag_name in {"p", "address", "summary"}:
345
- for part in tag.text.split("\n"):
346
- seg = part.strip()
347
- if seg:
348
- doc.add_text(
349
- parent=self.parents[self.level],
350
- label=DocItemLabel.TEXT,
351
- text=seg,
352
- content_layer=self.content_layer,
353
- )
591
+ text_list = self._extract_text_and_hyperlink_recursively(
592
+ tag, find_parent_annotation=True
593
+ )
594
+ annotated_texts = text_list.simplify_text_elements()
595
+ for part in annotated_texts.split_by_newline():
596
+ with self.use_inline_group(part, doc):
597
+ for annotated_text in part:
598
+ if seg := annotated_text.text.strip():
599
+ seg_clean = HTMLDocumentBackend._clean_unicode(seg)
600
+ doc.add_text(
601
+ parent=self.parents[self.level],
602
+ label=DocItemLabel.TEXT,
603
+ text=seg_clean,
604
+ content_layer=self.content_layer,
605
+ hyperlink=annotated_text.hyperlink,
606
+ )
607
+
354
608
  for img_tag in tag("img"):
355
609
  if isinstance(img_tag, Tag):
356
610
  self._emit_image(img_tag, doc)
@@ -369,13 +623,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
369
623
 
370
624
  elif tag_name in {"pre", "code"}:
371
625
  # handle monospace code snippets (pre).
372
- text = tag.get_text(strip=True)
373
- if text:
374
- doc.add_code(
375
- parent=self.parents[self.level],
376
- text=text,
377
- content_layer=self.content_layer,
378
- )
626
+ text_list = self._extract_text_and_hyperlink_recursively(
627
+ tag, find_parent_annotation=True
628
+ )
629
+ annotated_texts = text_list.simplify_text_elements()
630
+ with self.use_inline_group(annotated_texts, doc):
631
+ for annotated_text in annotated_texts:
632
+ text_clean = HTMLDocumentBackend._clean_unicode(
633
+ annotated_text.text.strip()
634
+ )
635
+ doc.add_code(
636
+ parent=self.parents[self.level],
637
+ text=text_clean,
638
+ content_layer=self.content_layer,
639
+ hyperlink=annotated_text.hyperlink,
640
+ )
379
641
 
380
642
  elif tag_name == "details":
381
643
  # handle details and its content.
@@ -392,18 +654,45 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
392
654
 
393
655
  def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
394
656
  figure = img_tag.find_parent("figure")
395
- caption: str = ""
657
+ caption: AnnotatedTextList = AnnotatedTextList()
658
+
659
+ # check if the figure has a link - this is HACK:
660
+ def get_img_hyperlink(img_tag):
661
+ this_parent = img_tag.parent
662
+ while this_parent is not None:
663
+ if this_parent.name == "a" and this_parent.get("href"):
664
+ return this_parent.get("href")
665
+ this_parent = this_parent.parent
666
+ return None
667
+
668
+ if img_hyperlink := get_img_hyperlink(img_tag):
669
+ caption.append(
670
+ AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
671
+ )
672
+
396
673
  if isinstance(figure, Tag):
397
674
  caption_tag = figure.find("figcaption", recursive=False)
398
675
  if isinstance(caption_tag, Tag):
399
- caption = caption_tag.get_text()
400
- if not caption:
401
- caption = str(img_tag.get("alt", "")).strip()
676
+ caption = self._extract_text_and_hyperlink_recursively(
677
+ caption_tag, find_parent_annotation=True
678
+ )
679
+ if not caption and img_tag.get("alt"):
680
+ caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
681
+
682
+ caption_anno_text = caption.to_single_text_element()
402
683
 
403
684
  caption_item: Optional[TextItem] = None
404
- if caption:
685
+ if caption_anno_text.text:
686
+ text_clean = HTMLDocumentBackend._clean_unicode(
687
+ caption_anno_text.text.strip()
688
+ )
689
+ print(caption_anno_text)
405
690
  caption_item = doc.add_text(
406
- DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
691
+ label=DocItemLabel.CAPTION,
692
+ text=text_clean,
693
+ orig=caption_anno_text.text,
694
+ content_layer=self.content_layer,
695
+ hyperlink=caption_anno_text.hyperlink,
407
696
  )
408
697
 
409
698
  doc.add_picture(
@@ -442,6 +731,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
442
731
 
443
732
  return "".join(parts)
444
733
 
734
+ @staticmethod
735
+ def _clean_unicode(text: str) -> str:
736
+ """Replace typical Unicode characters in HTML for text processing.
737
+
738
+ Several Unicode characters (e.g., non-printable or formatting) are typically
739
+ found in HTML but are worth replacing to sanitize text and ensure consistency
740
+ in text processing tasks.
741
+
742
+ Args:
743
+ text: The original text.
744
+
745
+ Returns:
746
+ The sanitized text without typical Unicode characters.
747
+ """
748
+ replacements = {
749
+ "\u00a0": " ", # non-breaking space
750
+ "\u200b": "", # zero-width space
751
+ "\u200c": "", # zero-width non-joiner
752
+ "\u200d": "", # zero-width joiner
753
+ "\u2010": "-", # hyphen
754
+ "\u2011": "-", # non-breaking hyphen
755
+ "\u2012": "-", # dash
756
+ "\u2013": "-", # dash
757
+ "\u2014": "-", # dash
758
+ "\u2015": "-", # horizontal bar
759
+ "\u2018": "'", # left single quotation mark
760
+ "\u2019": "'", # right single quotation mark
761
+ "\u201c": '"', # left double quotation mark
762
+ "\u201d": '"', # right double quotation mark
763
+ "\u2026": "...", # ellipsis
764
+ "\u00ad": "", # soft hyphen
765
+ "\ufeff": "", # zero width non-break space
766
+ "\u202f": " ", # narrow non-break space
767
+ "\u2060": "", # word joiner
768
+ }
769
+ for raw, clean in replacements.items():
770
+ text = text.replace(raw, clean)
771
+
772
+ return text
773
+
445
774
  @staticmethod
446
775
  def _get_cell_spans(cell: Tag) -> tuple[int, int]:
447
776
  """Extract colspan and rowspan values from a table cell tag.
@@ -454,9 +783,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
454
783
  str(cell.get("colspan", "1")),
455
784
  str(cell.get("rowspan", "1")),
456
785
  )
786
+
787
+ def _extract_num(s: str) -> int:
788
+ if s and s[0].isnumeric():
789
+ match = re.search(r"\d+", s)
790
+ if match:
791
+ return int(match.group())
792
+ return 1
793
+
457
794
  int_spans: tuple[int, int] = (
458
- int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
459
- int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
795
+ _extract_num(raw_spans[0]),
796
+ _extract_num(raw_spans[1]),
460
797
  )
461
798
 
462
799
  return int_spans