docling 2.42.0__py3-none-any.whl → 2.42.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +299 -314
- docling/backend/msword_backend.py +10 -1
- docling/backend/pdf_backend.py +25 -1
- docling/pipeline/base_pipeline.py +7 -1
- docling/utils/layout_postprocessor.py +7 -2
- {docling-2.42.0.dist-info → docling-2.42.2.dist-info}/METADATA +2 -1
- {docling-2.42.0.dist-info → docling-2.42.2.dist-info}/RECORD +11 -11
- {docling-2.42.0.dist-info → docling-2.42.2.dist-info}/WHEEL +0 -0
- {docling-2.42.0.dist-info → docling-2.42.2.dist-info}/entry_points.txt +0 -0
- {docling-2.42.0.dist-info → docling-2.42.2.dist-info}/licenses/LICENSE +0 -0
- {docling-2.42.0.dist-info → docling-2.42.2.dist-info}/top_level.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
import re
|
2
3
|
import traceback
|
3
4
|
from io import BytesIO
|
4
5
|
from pathlib import Path
|
@@ -15,6 +16,7 @@ from docling_core.types.doc import (
|
|
15
16
|
GroupLabel,
|
16
17
|
TableCell,
|
17
18
|
TableData,
|
19
|
+
TextItem,
|
18
20
|
)
|
19
21
|
from docling_core.types.doc.document import ContentLayer
|
20
22
|
from pydantic import BaseModel
|
@@ -26,10 +28,14 @@ from docling.datamodel.document import InputDocument
|
|
26
28
|
|
27
29
|
_log = logging.getLogger(__name__)
|
28
30
|
|
29
|
-
|
30
|
-
|
31
|
+
DEFAULT_IMAGE_WIDTH = 128
|
32
|
+
DEFAULT_IMAGE_HEIGHT = 128
|
33
|
+
|
34
|
+
# Tags that initiate distinct Docling items
|
35
|
+
_BLOCK_TAGS: Final = {
|
31
36
|
"address",
|
32
37
|
"details",
|
38
|
+
"figure",
|
33
39
|
"h1",
|
34
40
|
"h2",
|
35
41
|
"h3",
|
@@ -41,12 +47,9 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
|
41
47
|
"code",
|
42
48
|
"ul",
|
43
49
|
"ol",
|
44
|
-
"li",
|
45
50
|
"summary",
|
46
51
|
"table",
|
47
|
-
|
48
|
-
"img",
|
49
|
-
]
|
52
|
+
}
|
50
53
|
|
51
54
|
|
52
55
|
class _Context(BaseModel):
|
@@ -56,12 +59,16 @@ class _Context(BaseModel):
|
|
56
59
|
|
57
60
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
58
61
|
@override
|
59
|
-
def __init__(
|
62
|
+
def __init__(
|
63
|
+
self,
|
64
|
+
in_doc: InputDocument,
|
65
|
+
path_or_stream: Union[BytesIO, Path],
|
66
|
+
):
|
60
67
|
super().__init__(in_doc, path_or_stream)
|
61
68
|
self.soup: Optional[Tag] = None
|
62
|
-
# HTML file:
|
63
69
|
self.path_or_stream = path_or_stream
|
64
|
-
|
70
|
+
|
71
|
+
# Initialize the parents for the hierarchy
|
65
72
|
self.max_levels = 10
|
66
73
|
self.level = 0
|
67
74
|
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
@@ -70,13 +77,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
70
77
|
self.parents[i] = None
|
71
78
|
|
72
79
|
try:
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
self.soup = BeautifulSoup(html_content, "html.parser")
|
80
|
+
raw = (
|
81
|
+
path_or_stream.getvalue()
|
82
|
+
if isinstance(path_or_stream, BytesIO)
|
83
|
+
else Path(path_or_stream).read_bytes()
|
84
|
+
)
|
85
|
+
self.soup = BeautifulSoup(raw, "html.parser")
|
80
86
|
except Exception as e:
|
81
87
|
raise RuntimeError(
|
82
88
|
"Could not initialize HTML backend for file with "
|
@@ -96,7 +102,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
96
102
|
def unload(self):
|
97
103
|
if isinstance(self.path_or_stream, BytesIO):
|
98
104
|
self.path_or_stream.close()
|
99
|
-
|
100
105
|
self.path_or_stream = None
|
101
106
|
|
102
107
|
@classmethod
|
@@ -106,211 +111,156 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
106
111
|
|
107
112
|
@override
|
108
113
|
def convert(self) -> DoclingDocument:
|
109
|
-
|
114
|
+
_log.debug("Starting HTML conversion...")
|
115
|
+
if not self.is_valid():
|
116
|
+
raise RuntimeError("Invalid HTML document.")
|
117
|
+
|
110
118
|
origin = DocumentOrigin(
|
111
119
|
filename=self.file.name or "file",
|
112
120
|
mimetype="text/html",
|
113
121
|
binary_hash=self.document_hash,
|
114
122
|
)
|
115
|
-
|
116
123
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
117
|
-
_log.debug("Trying to convert HTML...")
|
118
|
-
|
119
|
-
if self.is_valid():
|
120
|
-
assert self.soup is not None
|
121
|
-
content = self.soup.body or self.soup
|
122
|
-
# Replace <br> tags with newline characters
|
123
|
-
# TODO: remove style to avoid losing text from tags like i, b, span, ...
|
124
|
-
for br in content("br"):
|
125
|
-
br.replace_with(NavigableString("\n"))
|
126
|
-
|
127
|
-
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
|
128
|
-
self.content_layer = (
|
129
|
-
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
130
|
-
)
|
131
|
-
self.ctx = _Context() # reset context
|
132
|
-
self.walk(content, doc)
|
133
|
-
else:
|
134
|
-
raise RuntimeError(
|
135
|
-
f"Cannot convert doc with {self.document_hash} because the backend "
|
136
|
-
"failed to init."
|
137
|
-
)
|
138
|
-
return doc
|
139
124
|
|
140
|
-
|
141
|
-
#
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
)
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
):
|
163
|
-
text = text.strip()
|
164
|
-
if text and tag.name in ["div"]:
|
165
|
-
doc.add_text(
|
166
|
-
parent=self.parents[self.level],
|
167
|
-
label=DocItemLabel.TEXT,
|
168
|
-
text=text,
|
169
|
-
content_layer=self.content_layer,
|
170
|
-
)
|
171
|
-
text = ""
|
172
|
-
|
173
|
-
return
|
174
|
-
|
175
|
-
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
176
|
-
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
177
|
-
self.handle_header(tag, doc)
|
178
|
-
elif tag.name in ["p", "address", "summary"]:
|
179
|
-
self.handle_paragraph(tag, doc)
|
180
|
-
elif tag.name in ["pre", "code"]:
|
181
|
-
self.handle_code(tag, doc)
|
182
|
-
elif tag.name in ["ul", "ol"]:
|
183
|
-
self.handle_list(tag, doc)
|
184
|
-
elif tag.name in ["li"]:
|
185
|
-
self.handle_list_item(tag, doc)
|
186
|
-
elif tag.name == "table":
|
187
|
-
self.handle_table(tag, doc)
|
188
|
-
elif tag.name == "figure":
|
189
|
-
self.handle_figure(tag, doc)
|
190
|
-
elif tag.name == "img":
|
191
|
-
self.handle_image(tag, doc)
|
192
|
-
elif tag.name == "details":
|
193
|
-
self.handle_details(tag, doc)
|
194
|
-
else:
|
195
|
-
self.walk(tag, doc)
|
196
|
-
|
197
|
-
def get_text(self, item: PageElement) -> str:
|
198
|
-
"""Get the text content of a tag."""
|
199
|
-
parts: list[str] = self.extract_text_recursively(item)
|
200
|
-
|
201
|
-
return "".join(parts) + " "
|
202
|
-
|
203
|
-
# Function to recursively extract text from all child nodes
|
204
|
-
def extract_text_recursively(self, item: PageElement) -> list[str]:
|
205
|
-
result: list[str] = []
|
125
|
+
assert self.soup is not None
|
126
|
+
# set the title as furniture, since it is part of the document metadata
|
127
|
+
title = self.soup.title
|
128
|
+
if title:
|
129
|
+
doc.add_title(
|
130
|
+
text=title.get_text(separator=" ", strip=True),
|
131
|
+
content_layer=ContentLayer.FURNITURE,
|
132
|
+
)
|
133
|
+
# remove scripts/styles
|
134
|
+
for tag in self.soup(["script", "style"]):
|
135
|
+
tag.decompose()
|
136
|
+
content = self.soup.body or self.soup
|
137
|
+
# normalize <br> tags
|
138
|
+
for br in content("br"):
|
139
|
+
br.replace_with(NavigableString("\n"))
|
140
|
+
# set default content layer
|
141
|
+
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
|
142
|
+
self.content_layer = (
|
143
|
+
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
144
|
+
)
|
145
|
+
# reset context
|
146
|
+
self.ctx = _Context()
|
206
147
|
|
207
|
-
|
208
|
-
|
148
|
+
try:
|
149
|
+
self._walk(content, doc)
|
150
|
+
except Exception:
|
151
|
+
print(traceback.format_exc())
|
209
152
|
|
210
|
-
|
211
|
-
if tag.name not in ["ul", "ol"]:
|
212
|
-
for child in tag:
|
213
|
-
# Recursively get the child's text content
|
214
|
-
result.extend(self.extract_text_recursively(child))
|
153
|
+
return doc
|
215
154
|
|
216
|
-
|
155
|
+
def _walk(self, element: Tag, doc: DoclingDocument) -> None:
|
156
|
+
"""Parse an XML tag by recursively walking its content.
|
217
157
|
|
218
|
-
|
219
|
-
|
158
|
+
While walking, the method buffers inline text across tags like <b> or <span>,
|
159
|
+
emitting text nodes only at block boundaries.
|
220
160
|
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
161
|
+
Args:
|
162
|
+
element: The XML tag to parse.
|
163
|
+
doc: The Docling document to be updated with the parsed content.
|
164
|
+
"""
|
165
|
+
buffer: list[str] = []
|
166
|
+
|
167
|
+
def flush_buffer():
|
168
|
+
if not buffer:
|
169
|
+
return
|
170
|
+
text = "".join(buffer).strip()
|
171
|
+
buffer.clear()
|
172
|
+
if not text:
|
173
|
+
return
|
174
|
+
for part in text.split("\n"):
|
175
|
+
seg = part.strip()
|
176
|
+
if seg:
|
177
|
+
doc.add_text(
|
178
|
+
DocItemLabel.TEXT,
|
179
|
+
seg,
|
180
|
+
parent=self.parents[self.level],
|
181
|
+
content_layer=self.content_layer,
|
182
|
+
)
|
227
183
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
184
|
+
for node in element.contents:
|
185
|
+
if isinstance(node, Tag):
|
186
|
+
name = node.name.lower()
|
187
|
+
if name == "img":
|
188
|
+
flush_buffer()
|
189
|
+
self._emit_image(node, doc)
|
190
|
+
elif name in _BLOCK_TAGS:
|
191
|
+
flush_buffer()
|
192
|
+
self._handle_block(node, doc)
|
193
|
+
elif node.find(_BLOCK_TAGS):
|
194
|
+
flush_buffer()
|
195
|
+
self._walk(node, doc)
|
196
|
+
else:
|
197
|
+
buffer.append(node.text)
|
198
|
+
elif isinstance(node, NavigableString) and not isinstance(
|
199
|
+
node, PreformattedString
|
200
|
+
):
|
201
|
+
buffer.append(str(node))
|
232
202
|
|
233
|
-
|
234
|
-
"""Handles header tags (h1, h2, etc.)."""
|
235
|
-
hlevel = int(element.name.replace("h", ""))
|
236
|
-
text = element.text.strip()
|
203
|
+
flush_buffer()
|
237
204
|
|
205
|
+
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
206
|
+
tag_name = tag.name.lower()
|
207
|
+
# set default content layer to BODY as soon as we encounter a heading
|
238
208
|
self.content_layer = ContentLayer.BODY
|
239
|
-
|
240
|
-
|
209
|
+
level = int(tag_name[1])
|
210
|
+
text = tag.get_text(strip=True, separator=" ")
|
211
|
+
# the first level is for the title item
|
212
|
+
if level == 1:
|
241
213
|
for key in self.parents.keys():
|
242
214
|
self.parents[key] = None
|
243
|
-
|
244
|
-
self.level =
|
245
|
-
|
246
|
-
parent=self.parents[0],
|
247
|
-
label=DocItemLabel.TITLE,
|
248
|
-
text=text,
|
249
|
-
content_layer=self.content_layer,
|
215
|
+
self.level = 0
|
216
|
+
self.parents[self.level + 1] = doc.add_title(
|
217
|
+
text, content_layer=self.content_layer
|
250
218
|
)
|
219
|
+
# the other levels need to be lowered by 1 if a title was set
|
251
220
|
else:
|
252
|
-
|
221
|
+
level -= 1
|
222
|
+
if level > self.level:
|
253
223
|
# add invisible group
|
254
|
-
for i in range(self.level
|
255
|
-
|
256
|
-
|
224
|
+
for i in range(self.level, level):
|
225
|
+
_log.debug(f"Adding invisible group to level {i}")
|
226
|
+
self.parents[i + 1] = doc.add_group(
|
227
|
+
name=f"header-{i + 1}",
|
257
228
|
label=GroupLabel.SECTION,
|
258
|
-
parent=self.parents[i
|
229
|
+
parent=self.parents[i],
|
259
230
|
content_layer=self.content_layer,
|
260
231
|
)
|
261
|
-
self.level =
|
262
|
-
|
263
|
-
elif hlevel < self.level:
|
232
|
+
self.level = level
|
233
|
+
elif level < self.level:
|
264
234
|
# remove the tail
|
265
235
|
for key in self.parents.keys():
|
266
|
-
if key >
|
236
|
+
if key > level + 1:
|
237
|
+
_log.debug(f"Remove the tail of level {key}")
|
267
238
|
self.parents[key] = None
|
268
|
-
self.level =
|
269
|
-
|
270
|
-
self.parents[hlevel] = doc.add_heading(
|
271
|
-
parent=self.parents[hlevel - 1],
|
272
|
-
text=text,
|
273
|
-
level=hlevel - 1,
|
274
|
-
content_layer=self.content_layer,
|
275
|
-
)
|
276
|
-
|
277
|
-
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
|
278
|
-
"""Handles monospace code snippets (pre)."""
|
279
|
-
if element.text is None:
|
280
|
-
return
|
281
|
-
text = element.text.strip()
|
282
|
-
if text:
|
283
|
-
doc.add_code(
|
239
|
+
self.level = level
|
240
|
+
self.parents[self.level + 1] = doc.add_heading(
|
284
241
|
parent=self.parents[self.level],
|
285
242
|
text=text,
|
243
|
+
level=self.level,
|
286
244
|
content_layer=self.content_layer,
|
287
245
|
)
|
246
|
+
self.level += 1
|
247
|
+
for img_tag in tag("img"):
|
248
|
+
if isinstance(img_tag, Tag):
|
249
|
+
self._emit_image(img_tag, doc)
|
288
250
|
|
289
|
-
def
|
290
|
-
|
291
|
-
if element.text is None:
|
292
|
-
return
|
293
|
-
text = element.text.strip()
|
294
|
-
if text:
|
295
|
-
doc.add_text(
|
296
|
-
parent=self.parents[self.level],
|
297
|
-
label=DocItemLabel.TEXT,
|
298
|
-
text=text,
|
299
|
-
content_layer=self.content_layer,
|
300
|
-
)
|
301
|
-
|
302
|
-
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
303
|
-
"""Handles list tags (ul, ol) and their list items."""
|
304
|
-
|
251
|
+
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> None:
|
252
|
+
tag_name = tag.name.lower()
|
305
253
|
start: Optional[int] = None
|
306
|
-
|
307
|
-
|
254
|
+
name: str = ""
|
255
|
+
is_ordered = tag_name == "ol"
|
256
|
+
if is_ordered:
|
257
|
+
start_attr = tag.get("start")
|
308
258
|
if isinstance(start_attr, str) and start_attr.isnumeric():
|
309
259
|
start = int(start_attr)
|
310
260
|
name = "ordered list" + (f" start {start}" if start is not None else "")
|
311
261
|
else:
|
312
262
|
name = "list"
|
313
|
-
#
|
263
|
+
# Create the list container
|
314
264
|
list_group = doc.add_list_group(
|
315
265
|
name=name,
|
316
266
|
parent=self.parents[self.level],
|
@@ -320,64 +270,182 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
320
270
|
self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
|
321
271
|
if is_ordered and start is not None:
|
322
272
|
self.ctx.list_start_by_ref[list_group.self_ref] = start
|
323
|
-
|
324
273
|
self.level += 1
|
325
274
|
|
326
|
-
|
275
|
+
# For each top-level <li> in this list
|
276
|
+
for li in tag.find_all({"li", "ul", "ol"}, recursive=False):
|
277
|
+
if not isinstance(li, Tag):
|
278
|
+
continue
|
279
|
+
|
280
|
+
# sub-list items should be indented under main list items, but temporarily
|
281
|
+
# addressing invalid HTML (docling-core/issues/357)
|
282
|
+
if li.name in {"ul", "ol"}:
|
283
|
+
self._handle_block(li, doc)
|
284
|
+
|
285
|
+
else:
|
286
|
+
# 1) determine the marker
|
287
|
+
if is_ordered and start is not None:
|
288
|
+
marker = f"{start + len(list_group.children)}."
|
289
|
+
else:
|
290
|
+
marker = ""
|
291
|
+
|
292
|
+
# 2) extract only the "direct" text from this <li>
|
293
|
+
parts: list[str] = []
|
294
|
+
for child in li.contents:
|
295
|
+
if isinstance(child, NavigableString) and not isinstance(
|
296
|
+
child, PreformattedString
|
297
|
+
):
|
298
|
+
parts.append(child)
|
299
|
+
elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
|
300
|
+
text_part = HTMLDocumentBackend.get_text(child)
|
301
|
+
if text_part:
|
302
|
+
parts.append(text_part)
|
303
|
+
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
|
304
|
+
|
305
|
+
# 3) add the list item
|
306
|
+
if li_text:
|
307
|
+
self.parents[self.level + 1] = doc.add_list_item(
|
308
|
+
text=li_text,
|
309
|
+
enumerated=is_ordered,
|
310
|
+
marker=marker,
|
311
|
+
parent=list_group,
|
312
|
+
content_layer=self.content_layer,
|
313
|
+
)
|
314
|
+
|
315
|
+
# 4) recurse into any nested lists, attaching them to this <li> item
|
316
|
+
for sublist in li({"ul", "ol"}, recursive=False):
|
317
|
+
if isinstance(sublist, Tag):
|
318
|
+
self.level += 1
|
319
|
+
self._handle_block(sublist, doc)
|
320
|
+
self.parents[self.level + 1] = None
|
321
|
+
self.level -= 1
|
322
|
+
else:
|
323
|
+
for sublist in li({"ul", "ol"}, recursive=False):
|
324
|
+
if isinstance(sublist, Tag):
|
325
|
+
self._handle_block(sublist, doc)
|
326
|
+
|
327
|
+
# 5) extract any images under this <li>
|
328
|
+
for img_tag in li("img"):
|
329
|
+
if isinstance(img_tag, Tag):
|
330
|
+
self._emit_image(img_tag, doc)
|
327
331
|
|
328
332
|
self.parents[self.level + 1] = None
|
329
333
|
self.level -= 1
|
330
334
|
|
331
|
-
def
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
335
|
+
def _handle_block(self, tag: Tag, doc: DoclingDocument) -> None:
|
336
|
+
tag_name = tag.name.lower()
|
337
|
+
|
338
|
+
if tag_name == "figure":
|
339
|
+
img_tag = tag.find("img")
|
340
|
+
if isinstance(img_tag, Tag):
|
341
|
+
self._emit_image(img_tag, doc)
|
342
|
+
|
343
|
+
elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
344
|
+
self._handle_heading(tag, doc)
|
345
|
+
|
346
|
+
elif tag_name in {"ul", "ol"}:
|
347
|
+
self._handle_list(tag, doc)
|
348
|
+
|
349
|
+
elif tag_name in {"p", "address", "summary"}:
|
350
|
+
for part in tag.text.split("\n"):
|
351
|
+
seg = part.strip()
|
352
|
+
if seg:
|
353
|
+
doc.add_text(
|
354
|
+
parent=self.parents[self.level],
|
355
|
+
label=DocItemLabel.TEXT,
|
356
|
+
text=seg,
|
357
|
+
content_layer=self.content_layer,
|
358
|
+
)
|
359
|
+
for img_tag in tag("img"):
|
360
|
+
if isinstance(img_tag, Tag):
|
361
|
+
self._emit_image(img_tag, doc)
|
362
|
+
|
363
|
+
elif tag_name == "table":
|
364
|
+
data = HTMLDocumentBackend.parse_table_data(tag)
|
365
|
+
for img_tag in tag("img"):
|
366
|
+
if isinstance(img_tag, Tag):
|
367
|
+
self._emit_image(tag, doc)
|
368
|
+
if data is not None:
|
369
|
+
doc.add_table(
|
370
|
+
data=data,
|
371
|
+
parent=self.parents[self.level],
|
360
372
|
content_layer=self.content_layer,
|
361
373
|
)
|
362
|
-
self.level += 1
|
363
|
-
self.walk(element, doc)
|
364
|
-
self.parents[self.level + 1] = None
|
365
|
-
self.level -= 1
|
366
|
-
else:
|
367
|
-
self.walk(element, doc)
|
368
374
|
|
369
|
-
elif
|
370
|
-
|
375
|
+
elif tag_name in {"pre", "code"}:
|
376
|
+
# handle monospace code snippets (pre).
|
377
|
+
text = tag.get_text(strip=True)
|
378
|
+
if text:
|
379
|
+
doc.add_code(
|
380
|
+
parent=self.parents[self.level],
|
381
|
+
text=text,
|
382
|
+
content_layer=self.content_layer,
|
383
|
+
)
|
371
384
|
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
385
|
+
elif tag_name == "details":
|
386
|
+
# handle details and its content.
|
387
|
+
self.parents[self.level + 1] = doc.add_group(
|
388
|
+
name="details",
|
389
|
+
label=GroupLabel.SECTION,
|
390
|
+
parent=self.parents[self.level],
|
377
391
|
content_layer=self.content_layer,
|
378
392
|
)
|
379
|
-
|
380
|
-
|
393
|
+
self.level += 1
|
394
|
+
self._walk(tag, doc)
|
395
|
+
self.parents[self.level + 1] = None
|
396
|
+
self.level -= 1
|
397
|
+
|
398
|
+
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
399
|
+
figure = img_tag.find_parent("figure")
|
400
|
+
caption: str = ""
|
401
|
+
if isinstance(figure, Tag):
|
402
|
+
caption_tag = figure.find("figcaption", recursive=False)
|
403
|
+
if isinstance(caption_tag, Tag):
|
404
|
+
caption = caption_tag.get_text()
|
405
|
+
if not caption:
|
406
|
+
caption = str(img_tag.get("alt", "")).strip()
|
407
|
+
|
408
|
+
caption_item: Optional[TextItem] = None
|
409
|
+
if caption:
|
410
|
+
caption_item = doc.add_text(
|
411
|
+
DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
|
412
|
+
)
|
413
|
+
|
414
|
+
doc.add_picture(
|
415
|
+
caption=caption_item,
|
416
|
+
parent=self.parents[self.level],
|
417
|
+
content_layer=self.content_layer,
|
418
|
+
)
|
419
|
+
|
420
|
+
@staticmethod
|
421
|
+
def get_text(item: PageElement) -> str:
|
422
|
+
"""Concatenate all child strings of a PageElement.
|
423
|
+
|
424
|
+
This method is equivalent to `PageElement.get_text()` but also considers
|
425
|
+
certain tags. When called on a <p> or <li> tags, it returns the text with a
|
426
|
+
trailing space, otherwise the text is concatenated without separators.
|
427
|
+
"""
|
428
|
+
|
429
|
+
def _extract_text_recursively(item: PageElement) -> list[str]:
|
430
|
+
"""Recursively extract text from all child nodes."""
|
431
|
+
result: list[str] = []
|
432
|
+
|
433
|
+
if isinstance(item, NavigableString):
|
434
|
+
result = [item]
|
435
|
+
elif isinstance(item, Tag):
|
436
|
+
tag = cast(Tag, item)
|
437
|
+
parts: list[str] = []
|
438
|
+
for child in tag:
|
439
|
+
parts.extend(_extract_text_recursively(child))
|
440
|
+
result.append(
|
441
|
+
"".join(parts) + " " if tag.name in {"p", "li"} else "".join(parts)
|
442
|
+
)
|
443
|
+
|
444
|
+
return result
|
445
|
+
|
446
|
+
parts: list[str] = _extract_text_recursively(item)
|
447
|
+
|
448
|
+
return "".join(parts)
|
381
449
|
|
382
450
|
@staticmethod
|
383
451
|
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
|
@@ -472,9 +540,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
472
540
|
formula.replace_with(NavigableString(math_formula))
|
473
541
|
|
474
542
|
# TODO: extract content correctly from table-cells with lists
|
475
|
-
text = html_cell.
|
476
|
-
|
477
|
-
# label = html_cell.name
|
543
|
+
text = HTMLDocumentBackend.get_text(html_cell).strip()
|
478
544
|
col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
|
479
545
|
if row_header:
|
480
546
|
row_span -= 1
|
@@ -502,84 +568,3 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
502
568
|
data.table_cells.append(table_cell)
|
503
569
|
|
504
570
|
return data
|
505
|
-
|
506
|
-
def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
|
507
|
-
"""Handles table tags."""
|
508
|
-
|
509
|
-
table_data = HTMLDocumentBackend.parse_table_data(element)
|
510
|
-
|
511
|
-
if table_data is not None:
|
512
|
-
doc.add_table(
|
513
|
-
data=table_data,
|
514
|
-
parent=self.parents[self.level],
|
515
|
-
content_layer=self.content_layer,
|
516
|
-
)
|
517
|
-
|
518
|
-
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
|
519
|
-
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
520
|
-
result = []
|
521
|
-
bullet_char = "*" # Default bullet character for unordered lists
|
522
|
-
|
523
|
-
if list_element.name == "ol": # For ordered lists, use numbers
|
524
|
-
for i, li in enumerate(list_element("li", recursive=False), 1):
|
525
|
-
if not isinstance(li, Tag):
|
526
|
-
continue
|
527
|
-
# Add numbering for ordered lists
|
528
|
-
result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
|
529
|
-
# Handle nested lists
|
530
|
-
nested_list = li.find(["ul", "ol"])
|
531
|
-
if isinstance(nested_list, Tag):
|
532
|
-
result.extend(self.get_list_text(nested_list, level + 1))
|
533
|
-
elif list_element.name == "ul": # For unordered lists, use bullet points
|
534
|
-
for li in list_element("li", recursive=False):
|
535
|
-
if not isinstance(li, Tag):
|
536
|
-
continue
|
537
|
-
# Add bullet points for unordered lists
|
538
|
-
result.append(
|
539
|
-
f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
|
540
|
-
)
|
541
|
-
# Handle nested lists
|
542
|
-
nested_list = li.find(["ul", "ol"])
|
543
|
-
if isinstance(nested_list, Tag):
|
544
|
-
result.extend(self.get_list_text(nested_list, level + 1))
|
545
|
-
|
546
|
-
return result
|
547
|
-
|
548
|
-
def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
|
549
|
-
"""Handles image tags (img)."""
|
550
|
-
|
551
|
-
# Extract the image URI from the <img> tag
|
552
|
-
# image_uri = root.xpath('//figure//img/@src')[0]
|
553
|
-
|
554
|
-
contains_captions = element.find(["figcaption"])
|
555
|
-
if not isinstance(contains_captions, Tag):
|
556
|
-
doc.add_picture(
|
557
|
-
parent=self.parents[self.level],
|
558
|
-
caption=None,
|
559
|
-
content_layer=self.content_layer,
|
560
|
-
)
|
561
|
-
else:
|
562
|
-
texts = []
|
563
|
-
for item in contains_captions:
|
564
|
-
texts.append(item.text)
|
565
|
-
|
566
|
-
fig_caption = doc.add_text(
|
567
|
-
label=DocItemLabel.CAPTION,
|
568
|
-
text=("".join(texts)).strip(),
|
569
|
-
content_layer=self.content_layer,
|
570
|
-
)
|
571
|
-
doc.add_picture(
|
572
|
-
parent=self.parents[self.level],
|
573
|
-
caption=fig_caption,
|
574
|
-
content_layer=self.content_layer,
|
575
|
-
)
|
576
|
-
|
577
|
-
def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
|
578
|
-
"""Handles image tags (img)."""
|
579
|
-
_log.debug(f"ignoring <img> tags at the moment: {element}")
|
580
|
-
|
581
|
-
doc.add_picture(
|
582
|
-
parent=self.parents[self.level],
|
583
|
-
caption=None,
|
584
|
-
content_layer=self.content_layer,
|
585
|
-
)
|
@@ -1104,8 +1104,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1104
1104
|
)
|
1105
1105
|
_log.debug(f" spanned before row {spanned_idx}")
|
1106
1106
|
|
1107
|
+
# Detect equations in cell text
|
1108
|
+
text, equations = self._handle_equations_in_text(
|
1109
|
+
element=cell._element, text=cell.text
|
1110
|
+
)
|
1111
|
+
if len(equations) == 0:
|
1112
|
+
text = cell.text
|
1113
|
+
else:
|
1114
|
+
text = text.replace("<eq>", "$").replace("</eq>", "$")
|
1115
|
+
|
1107
1116
|
table_cell = TableCell(
|
1108
|
-
text=
|
1117
|
+
text=text,
|
1109
1118
|
row_span=spanned_idx - row_idx,
|
1110
1119
|
col_span=cell.grid_span,
|
1111
1120
|
start_row_offset_idx=row.grid_cols_before + row_idx,
|
docling/backend/pdf_backend.py
CHANGED
@@ -57,7 +57,31 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
|
57
57
|
if self.input_format is InputFormat.IMAGE:
|
58
58
|
buf = BytesIO()
|
59
59
|
img = Image.open(self.path_or_stream)
|
60
|
-
|
60
|
+
|
61
|
+
# Handle multi-page TIFF images
|
62
|
+
if hasattr(img, "n_frames") and img.n_frames > 1:
|
63
|
+
# Extract all frames from multi-page image
|
64
|
+
frames = []
|
65
|
+
try:
|
66
|
+
for i in range(img.n_frames):
|
67
|
+
img.seek(i)
|
68
|
+
frame = img.copy().convert("RGB")
|
69
|
+
frames.append(frame)
|
70
|
+
except EOFError:
|
71
|
+
pass
|
72
|
+
|
73
|
+
# Save as multi-page PDF
|
74
|
+
if frames:
|
75
|
+
frames[0].save(
|
76
|
+
buf, "PDF", save_all=True, append_images=frames[1:]
|
77
|
+
)
|
78
|
+
else:
|
79
|
+
# Fallback to single page if frame extraction fails
|
80
|
+
img.convert("RGB").save(buf, "PDF")
|
81
|
+
else:
|
82
|
+
# Single page image - convert to RGB and save
|
83
|
+
img.convert("RGB").save(buf, "PDF")
|
84
|
+
|
61
85
|
buf.seek(0)
|
62
86
|
self.path_or_stream = buf
|
63
87
|
else:
|
@@ -217,7 +217,13 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
217
217
|
return conv_res
|
218
218
|
|
219
219
|
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
220
|
-
status =
|
220
|
+
status = conv_res.status
|
221
|
+
if status in [
|
222
|
+
ConversionStatus.PENDING,
|
223
|
+
ConversionStatus.STARTED,
|
224
|
+
]: # preserves ConversionStatus.PARTIAL_SUCCESS
|
225
|
+
status = ConversionStatus.SUCCESS
|
226
|
+
|
221
227
|
for page in conv_res.pages:
|
222
228
|
if page._backend is None or not page._backend.is_valid():
|
223
229
|
conv_res.errors.append(
|
@@ -267,9 +267,14 @@ class LayoutPostprocessor:
|
|
267
267
|
# Initial cell assignment
|
268
268
|
clusters = self._assign_cells_to_clusters(clusters)
|
269
269
|
|
270
|
-
# Remove clusters with no cells (if keep_empty_clusters is False)
|
270
|
+
# Remove clusters with no cells (if keep_empty_clusters is False),
|
271
|
+
# but always keep clusters with label DocItemLabel.FORMULA
|
271
272
|
if not self.options.keep_empty_clusters:
|
272
|
-
clusters = [
|
273
|
+
clusters = [
|
274
|
+
cluster
|
275
|
+
for cluster in clusters
|
276
|
+
if cluster.cells or cluster.label == DocItemLabel.FORMULA
|
277
|
+
]
|
273
278
|
|
274
279
|
# Handle orphaned cells
|
275
280
|
unassigned = self._find_unassigned_cells(clusters)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.42.
|
3
|
+
Version: 2.42.2
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -89,6 +89,7 @@ Dynamic: license-file
|
|
89
89
|
[](https://opensource.org/licenses/MIT)
|
90
90
|
[](https://pepy.tech/projects/docling)
|
91
91
|
[](https://apify.com/vancura/docling)
|
92
|
+
[](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
|
92
93
|
[](https://www.bestpractices.dev/projects/10101)
|
93
94
|
[](https://lfaidata.foundation/projects/)
|
94
95
|
|
@@ -9,13 +9,13 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
|
|
9
9
|
docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
|
10
10
|
docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
|
11
11
|
docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
|
12
|
-
docling/backend/html_backend.py,sha256=
|
12
|
+
docling/backend/html_backend.py,sha256=1Sohqc1xQETx6qPw27nT0QR4EdpDQg5DlrsK3rrgv7A,20413
|
13
13
|
docling/backend/md_backend.py,sha256=mfwGj8g2hGC-Q_HREtl_Web65uMVXD-Ie1nRqWTXzF0,21013
|
14
14
|
docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
|
15
15
|
docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
|
16
|
-
docling/backend/msword_backend.py,sha256=
|
16
|
+
docling/backend/msword_backend.py,sha256=DxMgPcq-Ao1vq7X2v8qqWeMs9MryPw_Jw3YRAAdXBtM,44904
|
17
17
|
docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
|
18
|
-
docling/backend/pdf_backend.py,sha256=
|
18
|
+
docling/backend/pdf_backend.py,sha256=sUBrCz1zvt6E7sVl4xHtrkpTBClOK0vBV2lLi_TRHNg,3237
|
19
19
|
docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
|
20
20
|
docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -74,7 +74,7 @@ docling/models/vlm_models_inline/hf_transformers_model.py,sha256=LAnWFIHGblWln6D
|
|
74
74
|
docling/models/vlm_models_inline/mlx_model.py,sha256=p-H6wG31iVRoOjsqYaCVa4pEzxMP3vzLcsUatMjDJDQ,5948
|
75
75
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
76
76
|
docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
|
77
|
-
docling/pipeline/base_pipeline.py,sha256=
|
77
|
+
docling/pipeline/base_pipeline.py,sha256=iwUqmttXF9D2myXyCAaIqFuGjBFhPkjAybcSAGpww-Q,9525
|
78
78
|
docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
|
79
79
|
docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
|
80
80
|
docling/pipeline/vlm_pipeline.py,sha256=0lj8tbXNpYF8OLBoLqP2BZfFpTHi40RoHVfvO_Nah4Q,15349
|
@@ -83,7 +83,7 @@ docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zY
|
|
83
83
|
docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
|
84
84
|
docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
|
85
85
|
docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
|
86
|
-
docling/utils/layout_postprocessor.py,sha256=
|
86
|
+
docling/utils/layout_postprocessor.py,sha256=LFLbBE-o3kWu79d8ZcyHlZPIqzQfCabZCIPTJ51lZsY,24657
|
87
87
|
docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
|
88
88
|
docling/utils/model_downloader.py,sha256=3vijCsAIVwWqehGBDRxRq7mJ3yRb9-zBsG00iqjqegU,4076
|
89
89
|
docling/utils/ocr_utils.py,sha256=nmresYyfin0raanpQc_GGeU3WoLsfExf6SEXNIQ7Djg,2325
|
@@ -91,9 +91,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
|
|
91
91
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
92
92
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
93
93
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
94
|
-
docling-2.42.
|
95
|
-
docling-2.42.
|
96
|
-
docling-2.42.
|
97
|
-
docling-2.42.
|
98
|
-
docling-2.42.
|
99
|
-
docling-2.42.
|
94
|
+
docling-2.42.2.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
95
|
+
docling-2.42.2.dist-info/METADATA,sha256=1u5N4PTeuTbyxNgK9QK5DuqVf4cmSHOMcHlCeV7j5Do,10449
|
96
|
+
docling-2.42.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
97
|
+
docling-2.42.2.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
98
|
+
docling-2.42.2.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
99
|
+
docling-2.42.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|