docling 2.23.0__tar.gz → 2.24.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.23.0 → docling-2.24.0}/PKG-INFO +3 -4
- {docling-2.23.0 → docling-2.24.0}/docling/backend/html_backend.py +152 -149
- {docling-2.23.0 → docling-2.24.0}/docling/backend/xml/jats_backend.py +6 -68
- {docling-2.23.0 → docling-2.24.0}/docling/backend/xml/uspto_backend.py +48 -27
- {docling-2.23.0 → docling-2.24.0}/docling/models/page_assemble_model.py +8 -0
- docling-2.24.0/docling/models/readingorder_model.py +389 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/tesseract_ocr_cli_model.py +3 -1
- {docling-2.23.0 → docling-2.24.0}/docling/pipeline/standard_pdf_pipeline.py +2 -2
- {docling-2.23.0 → docling-2.24.0}/pyproject.toml +3 -6
- docling-2.23.0/docling/models/ds_glm_model.py +0 -386
- {docling-2.23.0 → docling-2.24.0}/LICENSE +0 -0
- {docling-2.23.0 → docling-2.24.0}/README.md +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/__init__.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/__init__.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/md_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/chunking/__init__.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/cli/__init__.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/cli/main.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/cli/models.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/cli/tools.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/datamodel/document.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/datamodel/settings.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/document_converter.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/exceptions.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/__init__.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/base_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/layout_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/py.typed +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/utils/__init__.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/utils/export.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/utils/profiling.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/utils/utils.py +0 -0
- {docling-2.23.0 → docling-2.24.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.23.0
|
3
|
+
Version: 2.24.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -25,11 +25,10 @@ Provides-Extra: ocrmac
|
|
25
25
|
Provides-Extra: rapidocr
|
26
26
|
Provides-Extra: tesserocr
|
27
27
|
Provides-Extra: vlm
|
28
|
-
Requires-Dist: beautifulsoup4 (>=4.12.3,<
|
28
|
+
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
29
29
|
Requires-Dist: certifi (>=2024.7.4)
|
30
|
-
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
31
30
|
Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
|
32
|
-
Requires-Dist: docling-ibm-models (>=3.
|
31
|
+
Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
|
33
32
|
Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
|
34
33
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
35
34
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
@@ -1,17 +1,20 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Optional,
|
4
|
+
from typing import Optional, Union, cast
|
5
5
|
|
6
|
-
from bs4 import BeautifulSoup, Tag
|
6
|
+
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
7
7
|
from docling_core.types.doc import (
|
8
|
+
DocItem,
|
8
9
|
DocItemLabel,
|
9
10
|
DoclingDocument,
|
10
11
|
DocumentOrigin,
|
12
|
+
GroupItem,
|
11
13
|
GroupLabel,
|
12
14
|
TableCell,
|
13
15
|
TableData,
|
14
16
|
)
|
17
|
+
from typing_extensions import override
|
15
18
|
|
16
19
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
17
20
|
from docling.datamodel.base_models import InputFormat
|
@@ -21,6 +24,7 @@ _log = logging.getLogger(__name__)
|
|
21
24
|
|
22
25
|
|
23
26
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
27
|
+
@override
|
24
28
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
25
29
|
super().__init__(in_doc, path_or_stream)
|
26
30
|
_log.debug("About to init HTML backend...")
|
@@ -30,10 +34,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
30
34
|
# Initialise the parents for the hierarchy
|
31
35
|
self.max_levels = 10
|
32
36
|
self.level = 0
|
33
|
-
self.parents = {}
|
37
|
+
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
34
38
|
for i in range(0, self.max_levels):
|
35
39
|
self.parents[i] = None
|
36
|
-
self.labels = {} # type: ignore
|
37
40
|
|
38
41
|
try:
|
39
42
|
if isinstance(self.path_or_stream, BytesIO):
|
@@ -48,13 +51,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
48
51
|
f"Could not initialize HTML backend for file with hash {self.document_hash}."
|
49
52
|
) from e
|
50
53
|
|
54
|
+
@override
|
51
55
|
def is_valid(self) -> bool:
|
52
56
|
return self.soup is not None
|
53
57
|
|
54
58
|
@classmethod
|
59
|
+
@override
|
55
60
|
def supports_pagination(cls) -> bool:
|
56
61
|
return False
|
57
62
|
|
63
|
+
@override
|
58
64
|
def unload(self):
|
59
65
|
if isinstance(self.path_or_stream, BytesIO):
|
60
66
|
self.path_or_stream.close()
|
@@ -62,9 +68,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
62
68
|
self.path_or_stream = None
|
63
69
|
|
64
70
|
@classmethod
|
65
|
-
|
71
|
+
@override
|
72
|
+
def supported_formats(cls) -> set[InputFormat]:
|
66
73
|
return {InputFormat.HTML}
|
67
74
|
|
75
|
+
@override
|
68
76
|
def convert(self) -> DoclingDocument:
|
69
77
|
# access self.path_or_stream to load stuff
|
70
78
|
origin = DocumentOrigin(
|
@@ -80,98 +88,73 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
80
88
|
assert self.soup is not None
|
81
89
|
content = self.soup.body or self.soup
|
82
90
|
# Replace <br> tags with newline characters
|
83
|
-
for br in content
|
84
|
-
br.replace_with("\n")
|
85
|
-
|
91
|
+
for br in content("br"):
|
92
|
+
br.replace_with(NavigableString("\n"))
|
93
|
+
self.walk(content, doc)
|
86
94
|
else:
|
87
95
|
raise RuntimeError(
|
88
96
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
89
97
|
)
|
90
98
|
return doc
|
91
99
|
|
92
|
-
def walk(self,
|
93
|
-
|
94
|
-
|
95
|
-
|
100
|
+
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
101
|
+
# Iterate over elements in the body of the document
|
102
|
+
for element in tag.children:
|
103
|
+
if isinstance(element, Tag):
|
96
104
|
try:
|
97
|
-
self.
|
105
|
+
self.analyze_tag(cast(Tag, element), doc)
|
98
106
|
except Exception as exc_child:
|
99
|
-
|
100
|
-
|
101
|
-
|
107
|
+
_log.error(
|
108
|
+
f"Error processing child from tag{tag.name}: {exc_child}"
|
109
|
+
)
|
102
110
|
raise exc_child
|
103
111
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
"""
|
114
|
-
|
115
|
-
|
116
|
-
self.
|
112
|
+
return
|
113
|
+
|
114
|
+
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
115
|
+
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
116
|
+
self.handle_header(tag, doc)
|
117
|
+
elif tag.name in ["p"]:
|
118
|
+
self.handle_paragraph(tag, doc)
|
119
|
+
elif tag.name in ["pre"]:
|
120
|
+
self.handle_code(tag, doc)
|
121
|
+
elif tag.name in ["ul", "ol"]:
|
122
|
+
self.handle_list(tag, doc)
|
123
|
+
elif tag.name in ["li"]:
|
124
|
+
self.handle_list_item(tag, doc)
|
125
|
+
elif tag.name == "table":
|
126
|
+
self.handle_table(tag, doc)
|
127
|
+
elif tag.name == "figure":
|
128
|
+
self.handle_figure(tag, doc)
|
129
|
+
elif tag.name == "img":
|
130
|
+
self.handle_image(doc)
|
117
131
|
else:
|
118
|
-
self.
|
119
|
-
|
120
|
-
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
121
|
-
self.handle_header(element, idx, doc)
|
122
|
-
elif element.name in ["p"]:
|
123
|
-
self.handle_paragraph(element, idx, doc)
|
124
|
-
elif element.name in ["pre"]:
|
125
|
-
self.handle_code(element, idx, doc)
|
126
|
-
elif element.name in ["ul", "ol"]:
|
127
|
-
self.handle_list(element, idx, doc)
|
128
|
-
elif element.name in ["li"]:
|
129
|
-
self.handle_listitem(element, idx, doc)
|
130
|
-
elif element.name == "table":
|
131
|
-
self.handle_table(element, idx, doc)
|
132
|
-
elif element.name == "figure":
|
133
|
-
self.handle_figure(element, idx, doc)
|
134
|
-
elif element.name == "img":
|
135
|
-
self.handle_image(element, idx, doc)
|
136
|
-
else:
|
137
|
-
self.walk(element, doc)
|
132
|
+
self.walk(tag, doc)
|
138
133
|
|
139
|
-
def
|
140
|
-
"""Get the
|
141
|
-
|
142
|
-
if isinstance(text, str):
|
143
|
-
return text.strip()
|
134
|
+
def get_text(self, item: PageElement) -> str:
|
135
|
+
"""Get the text content of a tag."""
|
136
|
+
parts: list[str] = self.extract_text_recursively(item)
|
144
137
|
|
145
|
-
return ""
|
138
|
+
return "".join(parts) + " "
|
146
139
|
|
147
140
|
# Function to recursively extract text from all child nodes
|
148
|
-
def extract_text_recursively(self, item:
|
149
|
-
result = []
|
141
|
+
def extract_text_recursively(self, item: PageElement) -> list[str]:
|
142
|
+
result: list[str] = []
|
150
143
|
|
151
|
-
if isinstance(item,
|
144
|
+
if isinstance(item, NavigableString):
|
152
145
|
return [item]
|
153
146
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
except:
|
164
|
-
_log.warn("item has no children")
|
165
|
-
pass
|
166
|
-
|
167
|
-
return "".join(result) + " "
|
168
|
-
|
169
|
-
def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
|
147
|
+
tag = cast(Tag, item)
|
148
|
+
if tag.name not in ["ul", "ol"]:
|
149
|
+
for child in tag:
|
150
|
+
# Recursively get the child's text content
|
151
|
+
result.extend(self.extract_text_recursively(child))
|
152
|
+
|
153
|
+
return ["".join(result) + " "]
|
154
|
+
|
155
|
+
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
|
170
156
|
"""Handles header tags (h1, h2, etc.)."""
|
171
157
|
hlevel = int(element.name.replace("h", ""))
|
172
|
-
slevel = hlevel - 1
|
173
|
-
|
174
|
-
label = DocItemLabel.SECTION_HEADER
|
175
158
|
text = element.text.strip()
|
176
159
|
|
177
160
|
if hlevel == 1:
|
@@ -197,7 +180,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
197
180
|
elif hlevel < self.level:
|
198
181
|
|
199
182
|
# remove the tail
|
200
|
-
for key
|
183
|
+
for key in self.parents.keys():
|
201
184
|
if key > hlevel:
|
202
185
|
self.parents[key] = None
|
203
186
|
self.level = hlevel
|
@@ -208,27 +191,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
208
191
|
level=hlevel,
|
209
192
|
)
|
210
193
|
|
211
|
-
def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
|
194
|
+
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
|
212
195
|
"""Handles monospace code snippets (pre)."""
|
213
196
|
if element.text is None:
|
214
197
|
return
|
215
198
|
text = element.text.strip()
|
216
|
-
|
217
|
-
|
218
|
-
return
|
219
|
-
doc.add_code(parent=self.parents[self.level], text=text)
|
199
|
+
if text:
|
200
|
+
doc.add_code(parent=self.parents[self.level], text=text)
|
220
201
|
|
221
|
-
def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
|
202
|
+
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
222
203
|
"""Handles paragraph tags (p)."""
|
223
204
|
if element.text is None:
|
224
205
|
return
|
225
206
|
text = element.text.strip()
|
226
207
|
label = DocItemLabel.PARAGRAPH
|
227
|
-
if
|
228
|
-
|
229
|
-
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
208
|
+
if text:
|
209
|
+
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
230
210
|
|
231
|
-
def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
|
211
|
+
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
232
212
|
"""Handles list tags (ul, ol) and their list items."""
|
233
213
|
|
234
214
|
if element.name == "ul":
|
@@ -250,25 +230,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
250
230
|
self.parents[self.level + 1] = None
|
251
231
|
self.level -= 1
|
252
232
|
|
253
|
-
def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
|
233
|
+
def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
|
254
234
|
"""Handles listitem tags (li)."""
|
255
|
-
|
235
|
+
nested_list = element.find(["ul", "ol"])
|
256
236
|
|
257
|
-
|
258
|
-
|
237
|
+
parent = self.parents[self.level]
|
238
|
+
if parent is None:
|
239
|
+
_log.warning(f"list-item has no parent in DoclingDocument: {element}")
|
240
|
+
return
|
241
|
+
parent_label: str = parent.label
|
242
|
+
index_in_list = len(parent.children) + 1
|
259
243
|
|
260
|
-
if
|
261
|
-
name = element.name
|
244
|
+
if nested_list:
|
262
245
|
# Text in list item can be hidden within hierarchy, hence
|
263
246
|
# we need to extract it recursively
|
264
|
-
text = self.
|
247
|
+
text: str = self.get_text(element)
|
265
248
|
# Flatten text, remove break lines:
|
266
249
|
text = text.replace("\n", "").replace("\r", "")
|
267
250
|
text = " ".join(text.split()).strip()
|
268
251
|
|
269
252
|
marker = ""
|
270
253
|
enumerated = False
|
271
|
-
if
|
254
|
+
if parent_label == GroupLabel.ORDERED_LIST:
|
272
255
|
marker = str(index_in_list)
|
273
256
|
enumerated = True
|
274
257
|
|
@@ -278,7 +261,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
278
261
|
text=text,
|
279
262
|
enumerated=enumerated,
|
280
263
|
marker=marker,
|
281
|
-
parent=
|
264
|
+
parent=parent,
|
282
265
|
)
|
283
266
|
self.level += 1
|
284
267
|
|
@@ -287,74 +270,94 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
287
270
|
self.parents[self.level + 1] = None
|
288
271
|
self.level -= 1
|
289
272
|
|
290
|
-
elif
|
273
|
+
elif element.text.strip():
|
291
274
|
text = element.text.strip()
|
292
275
|
|
293
276
|
marker = ""
|
294
277
|
enumerated = False
|
295
|
-
if
|
278
|
+
if parent_label == GroupLabel.ORDERED_LIST:
|
296
279
|
marker = f"{str(index_in_list)}."
|
297
280
|
enumerated = True
|
298
281
|
doc.add_list_item(
|
299
282
|
text=text,
|
300
283
|
enumerated=enumerated,
|
301
284
|
marker=marker,
|
302
|
-
parent=
|
285
|
+
parent=parent,
|
303
286
|
)
|
304
287
|
else:
|
305
|
-
_log.
|
306
|
-
|
307
|
-
def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
|
308
|
-
"""Handles table tags."""
|
288
|
+
_log.warning(f"list-item has no text: {element}")
|
309
289
|
|
290
|
+
@staticmethod
|
291
|
+
def parse_table_data(element: Tag) -> Optional[TableData]:
|
310
292
|
nested_tables = element.find("table")
|
311
293
|
if nested_tables is not None:
|
312
|
-
_log.
|
313
|
-
return
|
294
|
+
_log.warning("Skipping nested table.")
|
295
|
+
return None
|
314
296
|
|
315
297
|
# Count the number of rows (number of <tr> elements)
|
316
|
-
num_rows = len(element
|
298
|
+
num_rows = len(element("tr"))
|
317
299
|
|
318
300
|
# Find the number of columns (taking into account colspan)
|
319
301
|
num_cols = 0
|
320
|
-
for row in element
|
302
|
+
for row in element("tr"):
|
321
303
|
col_count = 0
|
322
|
-
|
323
|
-
|
304
|
+
if not isinstance(row, Tag):
|
305
|
+
continue
|
306
|
+
for cell in row(["td", "th"]):
|
307
|
+
if not isinstance(row, Tag):
|
308
|
+
continue
|
309
|
+
val = cast(Tag, cell).get("colspan", "1")
|
310
|
+
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
|
324
311
|
col_count += colspan
|
325
312
|
num_cols = max(num_cols, col_count)
|
326
313
|
|
327
|
-
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
314
|
+
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
328
315
|
|
329
316
|
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
330
317
|
|
331
318
|
# Iterate over the rows in the table
|
332
|
-
for row_idx, row in enumerate(element
|
319
|
+
for row_idx, row in enumerate(element("tr")):
|
320
|
+
if not isinstance(row, Tag):
|
321
|
+
continue
|
333
322
|
|
334
323
|
# For each row, find all the column cells (both <td> and <th>)
|
335
|
-
cells = row
|
324
|
+
cells = row(["td", "th"])
|
336
325
|
|
337
326
|
# Check if each cell in the row is a header -> means it is a column header
|
338
327
|
col_header = True
|
339
|
-
for
|
340
|
-
if html_cell.name == "td":
|
328
|
+
for html_cell in cells:
|
329
|
+
if isinstance(html_cell, Tag) and html_cell.name == "td":
|
341
330
|
col_header = False
|
342
331
|
|
332
|
+
# Extract the text content of each cell
|
343
333
|
col_idx = 0
|
344
|
-
|
345
|
-
|
346
|
-
|
334
|
+
for html_cell in cells:
|
335
|
+
if not isinstance(html_cell, Tag):
|
336
|
+
continue
|
337
|
+
|
338
|
+
# extract inline formulas
|
339
|
+
for formula in html_cell("inline-formula"):
|
340
|
+
math_parts = formula.text.split("$$")
|
341
|
+
if len(math_parts) == 3:
|
342
|
+
math_formula = f"$${math_parts[1]}$$"
|
343
|
+
formula.replace_with(NavigableString(math_formula))
|
344
|
+
|
345
|
+
# TODO: extract content correctly from table-cells with lists
|
347
346
|
text = html_cell.text
|
348
|
-
try:
|
349
|
-
text = self.extract_table_cell_text(html_cell)
|
350
|
-
except Exception as exc:
|
351
|
-
_log.warn("exception: ", exc)
|
352
|
-
exit(-1)
|
353
347
|
|
354
348
|
# label = html_cell.name
|
355
|
-
|
356
|
-
col_span =
|
357
|
-
|
349
|
+
col_val = html_cell.get("colspan", "1")
|
350
|
+
col_span = (
|
351
|
+
int(col_val)
|
352
|
+
if isinstance(col_val, str) and col_val.isnumeric()
|
353
|
+
else 1
|
354
|
+
)
|
355
|
+
row_val = html_cell.get("rowspan", "1")
|
356
|
+
row_span = (
|
357
|
+
int(row_val)
|
358
|
+
if isinstance(row_val, str) and row_val.isnumeric()
|
359
|
+
else 1
|
360
|
+
)
|
358
361
|
|
359
362
|
while grid[row_idx][col_idx] is not None:
|
360
363
|
col_idx += 1
|
@@ -362,7 +365,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
362
365
|
for c in range(col_span):
|
363
366
|
grid[row_idx + r][col_idx + c] = text
|
364
367
|
|
365
|
-
|
368
|
+
table_cell = TableCell(
|
366
369
|
text=text,
|
367
370
|
row_span=row_span,
|
368
371
|
col_span=col_span,
|
@@ -373,57 +376,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
373
376
|
col_header=col_header,
|
374
377
|
row_header=((not col_header) and html_cell.name == "th"),
|
375
378
|
)
|
376
|
-
data.table_cells.append(
|
379
|
+
data.table_cells.append(table_cell)
|
377
380
|
|
378
|
-
|
381
|
+
return data
|
379
382
|
|
380
|
-
def
|
383
|
+
def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
|
384
|
+
"""Handles table tags."""
|
385
|
+
|
386
|
+
table_data = HTMLDocumentBackend.parse_table_data(element)
|
387
|
+
|
388
|
+
if table_data is not None:
|
389
|
+
doc.add_table(data=table_data, parent=self.parents[self.level])
|
390
|
+
|
391
|
+
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
|
381
392
|
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
382
393
|
result = []
|
383
394
|
bullet_char = "*" # Default bullet character for unordered lists
|
384
395
|
|
385
396
|
if list_element.name == "ol": # For ordered lists, use numbers
|
386
|
-
for i, li in enumerate(list_element
|
397
|
+
for i, li in enumerate(list_element("li", recursive=False), 1):
|
398
|
+
if not isinstance(li, Tag):
|
399
|
+
continue
|
387
400
|
# Add numbering for ordered lists
|
388
401
|
result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
|
389
402
|
# Handle nested lists
|
390
403
|
nested_list = li.find(["ul", "ol"])
|
391
|
-
if nested_list:
|
404
|
+
if isinstance(nested_list, Tag):
|
392
405
|
result.extend(self.get_list_text(nested_list, level + 1))
|
393
406
|
elif list_element.name == "ul": # For unordered lists, use bullet points
|
394
|
-
for li in list_element
|
407
|
+
for li in list_element("li", recursive=False):
|
408
|
+
if not isinstance(li, Tag):
|
409
|
+
continue
|
395
410
|
# Add bullet points for unordered lists
|
396
411
|
result.append(
|
397
412
|
f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
|
398
413
|
)
|
399
414
|
# Handle nested lists
|
400
415
|
nested_list = li.find(["ul", "ol"])
|
401
|
-
if nested_list:
|
416
|
+
if isinstance(nested_list, Tag):
|
402
417
|
result.extend(self.get_list_text(nested_list, level + 1))
|
403
418
|
|
404
419
|
return result
|
405
420
|
|
406
|
-
def
|
407
|
-
"""Extract text from a table cell, including lists with indents."""
|
408
|
-
contains_lists = cell.find(["ul", "ol"])
|
409
|
-
if contains_lists is None:
|
410
|
-
return cell.text
|
411
|
-
else:
|
412
|
-
_log.debug(
|
413
|
-
"should extract the content correctly for table-cells with lists ..."
|
414
|
-
)
|
415
|
-
return cell.text
|
416
|
-
|
417
|
-
def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
|
421
|
+
def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
|
418
422
|
"""Handles image tags (img)."""
|
419
423
|
|
420
424
|
# Extract the image URI from the <img> tag
|
421
425
|
# image_uri = root.xpath('//figure//img/@src')[0]
|
422
426
|
|
423
427
|
contains_captions = element.find(["figcaption"])
|
424
|
-
if contains_captions
|
428
|
+
if not isinstance(contains_captions, Tag):
|
425
429
|
doc.add_picture(parent=self.parents[self.level], caption=None)
|
426
|
-
|
427
430
|
else:
|
428
431
|
texts = []
|
429
432
|
for item in contains_captions:
|
@@ -437,6 +440,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
437
440
|
caption=fig_caption,
|
438
441
|
)
|
439
442
|
|
440
|
-
def handle_image(self, element: Tag, idx: int, doc: DoclingDocument):
|
443
|
+
def handle_image(self, doc: DoclingDocument) -> None:
|
441
444
|
"""Handles image tags (img)."""
|
442
445
|
doc.add_picture(parent=self.parents[self.level], caption=None)
|
@@ -4,7 +4,7 @@ from io import BytesIO
|
|
4
4
|
from pathlib import Path
|
5
5
|
from typing import Final, Optional, Union
|
6
6
|
|
7
|
-
from bs4 import BeautifulSoup
|
7
|
+
from bs4 import BeautifulSoup, Tag
|
8
8
|
from docling_core.types.doc import (
|
9
9
|
DocItemLabel,
|
10
10
|
DoclingDocument,
|
@@ -12,14 +12,13 @@ from docling_core.types.doc import (
|
|
12
12
|
GroupItem,
|
13
13
|
GroupLabel,
|
14
14
|
NodeItem,
|
15
|
-
TableCell,
|
16
|
-
TableData,
|
17
15
|
TextItem,
|
18
16
|
)
|
19
17
|
from lxml import etree
|
20
18
|
from typing_extensions import TypedDict, override
|
21
19
|
|
22
20
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
21
|
+
from docling.backend.html_backend import HTMLDocumentBackend
|
23
22
|
from docling.datamodel.base_models import InputFormat
|
24
23
|
from docling.datamodel.document import InputDocument
|
25
24
|
|
@@ -540,71 +539,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
540
539
|
) -> None:
|
541
540
|
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
|
542
541
|
table_tag = soup.find("table")
|
543
|
-
|
544
|
-
nested_tables = table_tag.find("table")
|
545
|
-
if nested_tables:
|
546
|
-
_log.warning(f"Skipping nested table in {str(self.file)}")
|
542
|
+
if not isinstance(table_tag, Tag):
|
547
543
|
return
|
548
544
|
|
549
|
-
|
550
|
-
num_rows = len(table_tag.find_all("tr"))
|
551
|
-
|
552
|
-
# Find the number of columns (taking into account colspan)
|
553
|
-
num_cols = 0
|
554
|
-
for row in table_tag.find_all("tr"):
|
555
|
-
col_count = 0
|
556
|
-
for cell in row.find_all(["td", "th"]):
|
557
|
-
colspan = int(cell.get("colspan", 1))
|
558
|
-
col_count += colspan
|
559
|
-
num_cols = max(num_cols, col_count)
|
560
|
-
|
561
|
-
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
562
|
-
|
563
|
-
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
564
|
-
|
565
|
-
# Iterate over the rows in the table
|
566
|
-
for row_idx, row in enumerate(table_tag.find_all("tr")):
|
567
|
-
# For each row, find all the column cells (both <td> and <th>)
|
568
|
-
cells = row.find_all(["td", "th"])
|
569
|
-
|
570
|
-
# Check if each cell in the row is a header -> means it is a column header
|
571
|
-
col_header = True
|
572
|
-
for j, html_cell in enumerate(cells):
|
573
|
-
if html_cell.name == "td":
|
574
|
-
col_header = False
|
575
|
-
|
576
|
-
# Extract and print the text content of each cell
|
577
|
-
col_idx = 0
|
578
|
-
for _, html_cell in enumerate(cells):
|
579
|
-
# extract inline formulas
|
580
|
-
for formula in html_cell.find_all("inline-formula"):
|
581
|
-
math_parts = formula.text.split("$$")
|
582
|
-
if len(math_parts) == 3:
|
583
|
-
math_formula = f"$${math_parts[1]}$$"
|
584
|
-
formula.replaceWith(math_formula)
|
585
|
-
text = html_cell.text
|
586
|
-
|
587
|
-
col_span = int(html_cell.get("colspan", 1))
|
588
|
-
row_span = int(html_cell.get("rowspan", 1))
|
589
|
-
|
590
|
-
while grid[row_idx][col_idx] is not None:
|
591
|
-
col_idx += 1
|
592
|
-
for r in range(row_span):
|
593
|
-
for c in range(col_span):
|
594
|
-
grid[row_idx + r][col_idx + c] = text
|
595
|
-
|
596
|
-
cell = TableCell(
|
597
|
-
text=text,
|
598
|
-
row_span=row_span,
|
599
|
-
col_span=col_span,
|
600
|
-
start_row_offset_idx=row_idx,
|
601
|
-
end_row_offset_idx=row_idx + row_span,
|
602
|
-
start_col_offset_idx=col_idx,
|
603
|
-
end_col_offset_idx=col_idx + col_span,
|
604
|
-
col_header=col_header,
|
605
|
-
row_header=((not col_header) and html_cell.name == "th"),
|
606
|
-
)
|
607
|
-
data.table_cells.append(cell)
|
545
|
+
data = HTMLDocumentBackend.parse_table_data(table_tag)
|
608
546
|
|
609
547
|
# TODO: format label vs caption once styling is supported
|
610
548
|
label = table_xml_component["label"]
|
@@ -616,7 +554,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
616
554
|
else None
|
617
555
|
)
|
618
556
|
|
619
|
-
|
557
|
+
if data is not None:
|
558
|
+
doc.add_table(data=data, parent=parent, caption=table_caption)
|
620
559
|
|
621
560
|
return
|
622
561
|
|
@@ -673,7 +612,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
673
612
|
def _walk_linear(
|
674
613
|
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
|
675
614
|
) -> str:
|
676
|
-
# _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
|
677
615
|
skip_tags = ["term"]
|
678
616
|
flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
|
679
617
|
new_parent: NodeItem = parent
|