docling 2.44.0__tar.gz → 2.45.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.44.0 → docling-2.45.0}/PKG-INFO +1 -1
- {docling-2.44.0 → docling-2.45.0}/docling/backend/html_backend.py +349 -77
- docling-2.45.0/docling/backend/mets_gbs_backend.py +399 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/pdf_backend.py +3 -3
- {docling-2.44.0 → docling-2.45.0}/docling/cli/main.py +10 -0
- {docling-2.44.0 → docling-2.45.0}/docling/datamodel/base_models.py +3 -0
- {docling-2.44.0 → docling-2.45.0}/docling/datamodel/document.py +26 -0
- {docling-2.44.0 → docling-2.45.0}/docling/datamodel/pipeline_options_vlm_model.py +8 -2
- {docling-2.44.0 → docling-2.45.0}/docling/document_converter.py +4 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/api_vlm_model.py +2 -5
- {docling-2.44.0 → docling-2.45.0}/docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
- {docling-2.44.0 → docling-2.45.0}/docling/models/vlm_models_inline/mlx_model.py +2 -4
- {docling-2.44.0 → docling-2.45.0}/docling/pipeline/base_pipeline.py +7 -4
- {docling-2.44.0 → docling-2.45.0}/docling.egg-info/PKG-INFO +1 -1
- {docling-2.44.0 → docling-2.45.0}/docling.egg-info/SOURCES.txt +2 -0
- {docling-2.44.0 → docling-2.45.0}/pyproject.toml +1 -1
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_html.py +20 -0
- docling-2.45.0/tests/test_backend_mets_gbs.py +77 -0
- {docling-2.44.0 → docling-2.45.0}/LICENSE +0 -0
- {docling-2.44.0 → docling-2.45.0}/README.md +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/md_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/chunking/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/cli/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/cli/models.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/cli/tools.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/datamodel/settings.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/exceptions.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/base_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/layout_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/py.typed +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/__init__.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/export.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/locks.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/orientation.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/profiling.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/utils.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling/utils/visualization.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling.egg-info/requires.txt +0 -0
- {docling-2.44.0 → docling-2.45.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.44.0 → docling-2.45.0}/setup.cfg +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_csv.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_jats.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_msword.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_backend_webp.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_cli.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_code_formula.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_input_doc.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_interfaces.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_invalid_input.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_options.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_settings_load.py +0 -0
- {docling-2.44.0 → docling-2.45.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.44.0
|
3
|
+
Version: 2.45.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -1,8 +1,11 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
|
+
from contextlib import contextmanager
|
4
|
+
from copy import deepcopy
|
3
5
|
from io import BytesIO
|
4
6
|
from pathlib import Path
|
5
7
|
from typing import Final, Optional, Union, cast
|
8
|
+
from urllib.parse import urljoin
|
6
9
|
|
7
10
|
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
8
11
|
from bs4.element import PreformattedString
|
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
|
|
18
21
|
TextItem,
|
19
22
|
)
|
20
23
|
from docling_core.types.doc.document import ContentLayer
|
21
|
-
from pydantic import BaseModel
|
24
|
+
from pydantic import AnyUrl, BaseModel, ValidationError
|
22
25
|
from typing_extensions import override
|
23
26
|
|
24
27
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
@@ -56,12 +59,76 @@ class _Context(BaseModel):
|
|
56
59
|
list_start_by_ref: dict[str, int] = {}
|
57
60
|
|
58
61
|
|
62
|
+
class AnnotatedText(BaseModel):
|
63
|
+
text: str
|
64
|
+
hyperlink: Union[AnyUrl, Path, None] = None
|
65
|
+
|
66
|
+
|
67
|
+
class AnnotatedTextList(list):
|
68
|
+
def to_single_text_element(self) -> AnnotatedText:
|
69
|
+
current_h = None
|
70
|
+
current_text = ""
|
71
|
+
for at in self:
|
72
|
+
t = at.text
|
73
|
+
h = at.hyperlink
|
74
|
+
current_text += t.strip() + " "
|
75
|
+
if h is not None and current_h is None:
|
76
|
+
current_h = h
|
77
|
+
elif h is not None and current_h is not None and h != current_h:
|
78
|
+
_log.warning(
|
79
|
+
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
|
80
|
+
)
|
81
|
+
return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
|
82
|
+
|
83
|
+
def simplify_text_elements(self) -> "AnnotatedTextList":
|
84
|
+
simplified = AnnotatedTextList()
|
85
|
+
if not self:
|
86
|
+
return self
|
87
|
+
text = self[0].text
|
88
|
+
hyperlink = self[0].hyperlink
|
89
|
+
last_elm = text
|
90
|
+
for i in range(1, len(self)):
|
91
|
+
if hyperlink == self[i].hyperlink:
|
92
|
+
sep = " "
|
93
|
+
if not self[i].text.strip() or not last_elm.strip():
|
94
|
+
sep = ""
|
95
|
+
text += sep + self[i].text
|
96
|
+
last_elm = self[i].text
|
97
|
+
else:
|
98
|
+
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
|
99
|
+
text = self[i].text
|
100
|
+
last_elm = text
|
101
|
+
hyperlink = self[i].hyperlink
|
102
|
+
if text:
|
103
|
+
simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
|
104
|
+
return simplified
|
105
|
+
|
106
|
+
def split_by_newline(self):
|
107
|
+
super_list = []
|
108
|
+
active_annotated_text_list = AnnotatedTextList()
|
109
|
+
for el in self:
|
110
|
+
sub_texts = el.text.split("\n")
|
111
|
+
if len(sub_texts) == 1:
|
112
|
+
active_annotated_text_list.append(el)
|
113
|
+
else:
|
114
|
+
for text in sub_texts:
|
115
|
+
sub_el = deepcopy(el)
|
116
|
+
sub_el.text = text
|
117
|
+
active_annotated_text_list.append(sub_el)
|
118
|
+
super_list.append(active_annotated_text_list)
|
119
|
+
active_annotated_text_list = AnnotatedTextList()
|
120
|
+
if active_annotated_text_list:
|
121
|
+
super_list.append(active_annotated_text_list)
|
122
|
+
return super_list
|
123
|
+
|
124
|
+
|
59
125
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
60
126
|
@override
|
61
127
|
def __init__(
|
62
128
|
self,
|
63
129
|
in_doc: InputDocument,
|
64
130
|
path_or_stream: Union[BytesIO, Path],
|
131
|
+
original_url: Optional[AnyUrl] = None,
|
65
132
|
):
|
66
133
|
super().__init__(in_doc, path_or_stream)
|
67
134
|
self.soup: Optional[Tag] = None
|
@@ -74,6 +141,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
74
141
|
self.ctx = _Context()
|
75
142
|
for i in range(self.max_levels):
|
76
143
|
self.parents[i] = None
|
144
|
+
self.hyperlink = None
|
145
|
+
self.original_url = original_url
|
77
146
|
|
78
147
|
try:
|
79
148
|
raw = (
|
@@ -160,26 +229,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
160
229
|
element: The XML tag to parse.
|
161
230
|
doc: The Docling document to be updated with the parsed content.
|
162
231
|
"""
|
163
|
-
buffer:
|
232
|
+
buffer: AnnotatedTextList = AnnotatedTextList()
|
164
233
|
|
165
234
|
def flush_buffer():
|
166
235
|
if not buffer:
|
167
236
|
return
|
168
|
-
|
237
|
+
annotated_text_list = buffer.simplify_text_elements()
|
238
|
+
parts = annotated_text_list.split_by_newline()
|
169
239
|
buffer.clear()
|
170
|
-
|
240
|
+
|
241
|
+
if not "".join([el.text for el in annotated_text_list]):
|
171
242
|
return
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
243
|
+
|
244
|
+
for annotated_text_list in parts:
|
245
|
+
with self.use_inline_group(annotated_text_list, doc):
|
246
|
+
for annotated_text in annotated_text_list:
|
247
|
+
if annotated_text.text.strip():
|
248
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(
|
249
|
+
annotated_text.text.strip()
|
250
|
+
)
|
251
|
+
doc.add_text(
|
252
|
+
parent=self.parents[self.level],
|
253
|
+
label=DocItemLabel.TEXT,
|
254
|
+
text=seg_clean,
|
255
|
+
content_layer=self.content_layer,
|
256
|
+
hyperlink=annotated_text.hyperlink,
|
257
|
+
)
|
183
258
|
|
184
259
|
for node in element.contents:
|
185
260
|
if isinstance(node, Tag):
|
@@ -187,6 +262,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
187
262
|
if name == "img":
|
188
263
|
flush_buffer()
|
189
264
|
self._emit_image(node, doc)
|
265
|
+
elif name == "a":
|
266
|
+
with self.use_hyperlink(node):
|
267
|
+
self._walk(node, doc)
|
190
268
|
elif name in _BLOCK_TAGS:
|
191
269
|
flush_buffer()
|
192
270
|
self._handle_block(node, doc)
|
@@ -194,28 +272,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
194
272
|
flush_buffer()
|
195
273
|
self._walk(node, doc)
|
196
274
|
else:
|
197
|
-
buffer.
|
275
|
+
buffer.extend(
|
276
|
+
self._extract_text_and_hyperlink_recursively(
|
277
|
+
node, find_parent_annotation=True, keep_newlines=True
|
278
|
+
)
|
279
|
+
)
|
198
280
|
elif isinstance(node, NavigableString) and not isinstance(
|
199
281
|
node, PreformattedString
|
200
282
|
):
|
201
|
-
|
283
|
+
if str(node).strip("\n\r") == "":
|
284
|
+
flush_buffer()
|
285
|
+
else:
|
286
|
+
buffer.extend(
|
287
|
+
self._extract_text_and_hyperlink_recursively(
|
288
|
+
node, find_parent_annotation=True, keep_newlines=True
|
289
|
+
)
|
290
|
+
)
|
202
291
|
|
203
292
|
flush_buffer()
|
204
293
|
|
294
|
+
def _extract_text_and_hyperlink_recursively(
|
295
|
+
self,
|
296
|
+
item: PageElement,
|
297
|
+
ignore_list=False,
|
298
|
+
find_parent_annotation=False,
|
299
|
+
keep_newlines=False,
|
300
|
+
) -> AnnotatedTextList:
|
301
|
+
result: AnnotatedTextList = AnnotatedTextList()
|
302
|
+
|
303
|
+
# If find_parent_annotation, make sure that we keep track of
|
304
|
+
# any a-tag that has been present in the DOM-parents already.
|
305
|
+
if find_parent_annotation:
|
306
|
+
this_parent = item.parent
|
307
|
+
while this_parent is not None:
|
308
|
+
if this_parent.name == "a" and this_parent.get("href"):
|
309
|
+
with self.use_hyperlink(this_parent):
|
310
|
+
return self._extract_text_and_hyperlink_recursively(
|
311
|
+
item, ignore_list
|
312
|
+
)
|
313
|
+
this_parent = this_parent.parent
|
314
|
+
|
315
|
+
if isinstance(item, PreformattedString):
|
316
|
+
return AnnotatedTextList()
|
317
|
+
|
318
|
+
if isinstance(item, NavigableString):
|
319
|
+
text = item.strip()
|
320
|
+
if text:
|
321
|
+
return AnnotatedTextList(
|
322
|
+
[AnnotatedText(text=text, hyperlink=self.hyperlink)]
|
323
|
+
)
|
324
|
+
if keep_newlines and item.strip("\n\r") == "":
|
325
|
+
return AnnotatedTextList(
|
326
|
+
[AnnotatedText(text="\n", hyperlink=self.hyperlink)]
|
327
|
+
)
|
328
|
+
return AnnotatedTextList()
|
329
|
+
|
330
|
+
tag = cast(Tag, item)
|
331
|
+
if not ignore_list or (tag.name not in ["ul", "ol"]):
|
332
|
+
for child in tag:
|
333
|
+
if isinstance(child, Tag) and child.name == "a":
|
334
|
+
with self.use_hyperlink(child):
|
335
|
+
result.extend(
|
336
|
+
self._extract_text_and_hyperlink_recursively(
|
337
|
+
child, ignore_list, keep_newlines=keep_newlines
|
338
|
+
)
|
339
|
+
)
|
340
|
+
else:
|
341
|
+
# Recursively get the child's text content
|
342
|
+
result.extend(
|
343
|
+
self._extract_text_and_hyperlink_recursively(
|
344
|
+
child, ignore_list, keep_newlines=keep_newlines
|
345
|
+
)
|
346
|
+
)
|
347
|
+
return result
|
348
|
+
|
349
|
+
@contextmanager
|
350
|
+
def use_hyperlink(self, tag):
|
351
|
+
this_href = tag.get("href")
|
352
|
+
if this_href is None:
|
353
|
+
yield None
|
354
|
+
else:
|
355
|
+
if this_href:
|
356
|
+
old_hyperlink = self.hyperlink
|
357
|
+
if self.original_url is not None:
|
358
|
+
this_href = urljoin(self.original_url, this_href)
|
359
|
+
# ugly fix for relative links since pydantic does not support them.
|
360
|
+
try:
|
361
|
+
AnyUrl(this_href)
|
362
|
+
except ValidationError:
|
363
|
+
this_href = Path(this_href)
|
364
|
+
self.hyperlink = this_href
|
365
|
+
try:
|
366
|
+
yield None
|
367
|
+
finally:
|
368
|
+
if this_href:
|
369
|
+
self.hyperlink = old_hyperlink
|
370
|
+
|
371
|
+
@contextmanager
|
372
|
+
def use_inline_group(
|
373
|
+
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
|
374
|
+
):
|
375
|
+
"""Create an inline group for annotated texts.
|
376
|
+
|
377
|
+
Checks if annotated_text_list has more than one item and if so creates an inline
|
378
|
+
group in which the text elements can then be generated. While the context manager
|
379
|
+
is active the inline group is set as the current parent.
|
380
|
+
|
381
|
+
Args:
|
382
|
+
annotated_text_list (AnnotatedTextList): Annotated text
|
383
|
+
doc (DoclingDocument): Currently used document
|
384
|
+
|
385
|
+
Yields:
|
386
|
+
None: _description_
|
387
|
+
"""
|
388
|
+
if len(annotated_text_list) > 1:
|
389
|
+
inline_fmt = doc.add_group(
|
390
|
+
label=GroupLabel.INLINE,
|
391
|
+
parent=self.parents[self.level],
|
392
|
+
content_layer=self.content_layer,
|
393
|
+
)
|
394
|
+
self.parents[self.level + 1] = inline_fmt
|
395
|
+
self.level += 1
|
396
|
+
try:
|
397
|
+
yield None
|
398
|
+
finally:
|
399
|
+
self.parents[self.level] = None
|
400
|
+
self.level -= 1
|
401
|
+
else:
|
402
|
+
yield None
|
403
|
+
|
205
404
|
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
206
405
|
tag_name = tag.name.lower()
|
207
406
|
# set default content layer to BODY as soon as we encounter a heading
|
208
407
|
self.content_layer = ContentLayer.BODY
|
209
408
|
level = int(tag_name[1])
|
210
|
-
|
211
|
-
|
409
|
+
annotated_text_list = self._extract_text_and_hyperlink_recursively(
|
410
|
+
tag, find_parent_annotation=True
|
411
|
+
)
|
412
|
+
annotated_text = annotated_text_list.to_single_text_element()
|
413
|
+
text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
|
212
414
|
# the first level is for the title item
|
213
415
|
if level == 1:
|
214
416
|
for key in self.parents.keys():
|
215
417
|
self.parents[key] = None
|
216
418
|
self.level = 0
|
217
419
|
self.parents[self.level + 1] = doc.add_title(
|
218
|
-
|
420
|
+
text_clean,
|
421
|
+
content_layer=self.content_layer,
|
422
|
+
hyperlink=annotated_text.hyperlink,
|
219
423
|
)
|
220
424
|
# the other levels need to be lowered by 1 if a title was set
|
221
425
|
else:
|
@@ -241,9 +445,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
241
445
|
self.parents[self.level + 1] = doc.add_heading(
|
242
446
|
parent=self.parents[self.level],
|
243
447
|
text=text_clean,
|
244
|
-
orig=text,
|
448
|
+
orig=annotated_text.text,
|
245
449
|
level=self.level,
|
246
450
|
content_layer=self.content_layer,
|
451
|
+
hyperlink=annotated_text.hyperlink,
|
247
452
|
)
|
248
453
|
self.level += 1
|
249
454
|
for img_tag in tag("img"):
|
@@ -292,37 +497,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
292
497
|
marker = ""
|
293
498
|
|
294
499
|
# 2) extract only the "direct" text from this <li>
|
295
|
-
parts
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
text_part = HTMLDocumentBackend.get_text(child)
|
303
|
-
if text_part:
|
304
|
-
parts.append(text_part)
|
305
|
-
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
|
306
|
-
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
500
|
+
parts = self._extract_text_and_hyperlink_recursively(
|
501
|
+
li, ignore_list=True, find_parent_annotation=True
|
502
|
+
)
|
503
|
+
min_parts = parts.simplify_text_elements()
|
504
|
+
li_text = re.sub(
|
505
|
+
r"\s+|\n+", " ", "".join([el.text for el in min_parts])
|
506
|
+
).strip()
|
307
507
|
|
308
508
|
# 3) add the list item
|
309
509
|
if li_text:
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
510
|
+
if len(min_parts) > 1:
|
511
|
+
# create an empty list element in order to hook the inline group onto that one
|
512
|
+
self.parents[self.level + 1] = doc.add_list_item(
|
513
|
+
text="",
|
514
|
+
enumerated=is_ordered,
|
515
|
+
marker=marker,
|
516
|
+
parent=list_group,
|
517
|
+
content_layer=self.content_layer,
|
518
|
+
)
|
519
|
+
self.level += 1
|
520
|
+
with self.use_inline_group(min_parts, doc):
|
521
|
+
for annotated_text in min_parts:
|
522
|
+
li_text = re.sub(
|
523
|
+
r"\s+|\n+", " ", annotated_text.text
|
524
|
+
).strip()
|
525
|
+
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
526
|
+
doc.add_text(
|
527
|
+
parent=self.parents[self.level],
|
528
|
+
label=DocItemLabel.TEXT,
|
529
|
+
text=li_clean,
|
530
|
+
content_layer=self.content_layer,
|
531
|
+
hyperlink=annotated_text.hyperlink,
|
532
|
+
)
|
533
|
+
|
534
|
+
# 4) recurse into any nested lists, attaching them to this <li> item
|
535
|
+
for sublist in li({"ul", "ol"}, recursive=False):
|
536
|
+
if isinstance(sublist, Tag):
|
537
|
+
self._handle_block(sublist, doc)
|
538
|
+
|
539
|
+
# now the list element with inline group is not a parent anymore
|
540
|
+
self.parents[self.level] = None
|
541
|
+
self.level -= 1
|
542
|
+
else:
|
543
|
+
annotated_text = min_parts[0]
|
544
|
+
li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
|
545
|
+
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
546
|
+
self.parents[self.level + 1] = doc.add_list_item(
|
547
|
+
text=li_clean,
|
548
|
+
enumerated=is_ordered,
|
549
|
+
marker=marker,
|
550
|
+
orig=li_text,
|
551
|
+
parent=list_group,
|
552
|
+
content_layer=self.content_layer,
|
553
|
+
hyperlink=annotated_text.hyperlink,
|
554
|
+
)
|
555
|
+
|
556
|
+
# 4) recurse into any nested lists, attaching them to this <li> item
|
557
|
+
for sublist in li({"ul", "ol"}, recursive=False):
|
558
|
+
if isinstance(sublist, Tag):
|
559
|
+
self.level += 1
|
560
|
+
self._handle_block(sublist, doc)
|
561
|
+
self.parents[self.level + 1] = None
|
562
|
+
self.level -= 1
|
326
563
|
else:
|
327
564
|
for sublist in li({"ul", "ol"}, recursive=False):
|
328
565
|
if isinstance(sublist, Tag):
|
@@ -351,17 +588,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
351
588
|
self._handle_list(tag, doc)
|
352
589
|
|
353
590
|
elif tag_name in {"p", "address", "summary"}:
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
591
|
+
text_list = self._extract_text_and_hyperlink_recursively(
|
592
|
+
tag, find_parent_annotation=True
|
593
|
+
)
|
594
|
+
annotated_texts = text_list.simplify_text_elements()
|
595
|
+
for part in annotated_texts.split_by_newline():
|
596
|
+
with self.use_inline_group(part, doc):
|
597
|
+
for annotated_text in part:
|
598
|
+
if seg := annotated_text.text.strip():
|
599
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
600
|
+
doc.add_text(
|
601
|
+
parent=self.parents[self.level],
|
602
|
+
label=DocItemLabel.TEXT,
|
603
|
+
text=seg_clean,
|
604
|
+
content_layer=self.content_layer,
|
605
|
+
hyperlink=annotated_text.hyperlink,
|
606
|
+
)
|
607
|
+
|
365
608
|
for img_tag in tag("img"):
|
366
609
|
if isinstance(img_tag, Tag):
|
367
610
|
self._emit_image(img_tag, doc)
|
@@ -380,15 +623,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
380
623
|
|
381
624
|
elif tag_name in {"pre", "code"}:
|
382
625
|
# handle monospace code snippets (pre).
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
626
|
+
text_list = self._extract_text_and_hyperlink_recursively(
|
627
|
+
tag, find_parent_annotation=True
|
628
|
+
)
|
629
|
+
annotated_texts = text_list.simplify_text_elements()
|
630
|
+
with self.use_inline_group(annotated_texts, doc):
|
631
|
+
for annotated_text in annotated_texts:
|
632
|
+
text_clean = HTMLDocumentBackend._clean_unicode(
|
633
|
+
annotated_text.text.strip()
|
634
|
+
)
|
635
|
+
doc.add_code(
|
636
|
+
parent=self.parents[self.level],
|
637
|
+
text=text_clean,
|
638
|
+
content_layer=self.content_layer,
|
639
|
+
hyperlink=annotated_text.hyperlink,
|
640
|
+
)
|
392
641
|
|
393
642
|
elif tag_name == "details":
|
394
643
|
# handle details and its content.
|
@@ -405,22 +654,45 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
405
654
|
|
406
655
|
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
407
656
|
figure = img_tag.find_parent("figure")
|
408
|
-
caption:
|
657
|
+
caption: AnnotatedTextList = AnnotatedTextList()
|
658
|
+
|
659
|
+
# check if the figure has a link - this is HACK:
|
660
|
+
def get_img_hyperlink(img_tag):
|
661
|
+
this_parent = img_tag.parent
|
662
|
+
while this_parent is not None:
|
663
|
+
if this_parent.name == "a" and this_parent.get("href"):
|
664
|
+
return this_parent.get("href")
|
665
|
+
this_parent = this_parent.parent
|
666
|
+
return None
|
667
|
+
|
668
|
+
if img_hyperlink := get_img_hyperlink(img_tag):
|
669
|
+
caption.append(
|
670
|
+
AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
|
671
|
+
)
|
672
|
+
|
409
673
|
if isinstance(figure, Tag):
|
410
674
|
caption_tag = figure.find("figcaption", recursive=False)
|
411
675
|
if isinstance(caption_tag, Tag):
|
412
|
-
caption =
|
413
|
-
|
414
|
-
|
676
|
+
caption = self._extract_text_and_hyperlink_recursively(
|
677
|
+
caption_tag, find_parent_annotation=True
|
678
|
+
)
|
679
|
+
if not caption and img_tag.get("alt"):
|
680
|
+
caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
|
681
|
+
|
682
|
+
caption_anno_text = caption.to_single_text_element()
|
415
683
|
|
416
684
|
caption_item: Optional[TextItem] = None
|
417
|
-
if
|
418
|
-
|
685
|
+
if caption_anno_text.text:
|
686
|
+
text_clean = HTMLDocumentBackend._clean_unicode(
|
687
|
+
caption_anno_text.text.strip()
|
688
|
+
)
|
689
|
+
print(caption_anno_text)
|
419
690
|
caption_item = doc.add_text(
|
420
691
|
label=DocItemLabel.CAPTION,
|
421
|
-
text=
|
422
|
-
orig=
|
692
|
+
text=text_clean,
|
693
|
+
orig=caption_anno_text.text,
|
423
694
|
content_layer=self.content_layer,
|
695
|
+
hyperlink=caption_anno_text.hyperlink,
|
424
696
|
)
|
425
697
|
|
426
698
|
doc.add_picture(
|