docling 2.44.0__py3-none-any.whl → 2.45.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +349 -77
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/pdf_backend.py +3 -3
- docling/cli/main.py +10 -0
- docling/datamodel/base_models.py +3 -0
- docling/datamodel/document.py +26 -0
- docling/datamodel/pipeline_options_vlm_model.py +8 -2
- docling/document_converter.py +4 -0
- docling/models/api_vlm_model.py +2 -5
- docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
- docling/models/vlm_models_inline/mlx_model.py +2 -4
- docling/pipeline/base_pipeline.py +7 -4
- {docling-2.44.0.dist-info → docling-2.45.0.dist-info}/METADATA +1 -1
- {docling-2.44.0.dist-info → docling-2.45.0.dist-info}/RECORD +18 -17
- {docling-2.44.0.dist-info → docling-2.45.0.dist-info}/WHEEL +0 -0
- {docling-2.44.0.dist-info → docling-2.45.0.dist-info}/entry_points.txt +0 -0
- {docling-2.44.0.dist-info → docling-2.45.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.44.0.dist-info → docling-2.45.0.dist-info}/top_level.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -1,8 +1,11 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
|
+
from contextlib import contextmanager
|
4
|
+
from copy import deepcopy
|
3
5
|
from io import BytesIO
|
4
6
|
from pathlib import Path
|
5
7
|
from typing import Final, Optional, Union, cast
|
8
|
+
from urllib.parse import urljoin
|
6
9
|
|
7
10
|
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
8
11
|
from bs4.element import PreformattedString
|
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
|
|
18
21
|
TextItem,
|
19
22
|
)
|
20
23
|
from docling_core.types.doc.document import ContentLayer
|
21
|
-
from pydantic import BaseModel
|
24
|
+
from pydantic import AnyUrl, BaseModel, ValidationError
|
22
25
|
from typing_extensions import override
|
23
26
|
|
24
27
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
@@ -56,12 +59,76 @@ class _Context(BaseModel):
|
|
56
59
|
list_start_by_ref: dict[str, int] = {}
|
57
60
|
|
58
61
|
|
62
|
+
class AnnotatedText(BaseModel):
    """A fragment of text together with the hyperlink attached to it, if any."""

    # The text content of the fragment.
    text: str
    # Link target of the fragment: a URL, a Path, or None when the fragment is
    # not linked.
    hyperlink: Union[AnyUrl, Path, None] = None
|
65
|
+
|
66
|
+
|
67
|
+
class AnnotatedTextList(list):
    """A list of ``AnnotatedText`` items with helpers to merge and split them."""

    def to_single_text_element(self) -> "AnnotatedText":
        """Collapse all items into a single ``AnnotatedText``.

        Every item's text is stripped and the pieces are joined with single
        spaces; the joined text is stripped once more. The first hyperlink that
        is not None is kept. A warning is logged for each later item carrying a
        different, non-None hyperlink.

        Returns:
            The merged text element (empty text, no hyperlink for an empty list).
        """
        current_h = None
        stripped_texts = []
        for at in self:
            stripped_texts.append(at.text.strip())
            h = at.hyperlink
            if h is None:
                continue
            if current_h is None:
                current_h = h
            elif h != current_h:
                _log.warning(
                    f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
                )
        return AnnotatedText(
            text=" ".join(stripped_texts).strip(), hyperlink=current_h
        )

    def simplify_text_elements(self) -> "AnnotatedTextList":
        """Merge runs of consecutive items that share the same hyperlink.

        Texts inside a run are joined with a single space, except that no
        separator is inserted when either the incoming text or the previously
        appended text is blank. A trailing run whose text is empty is dropped.

        Returns:
            A new ``AnnotatedTextList``; the receiver is never returned, not
            even when it is empty, so callers may mutate either list safely.
        """
        simplified = AnnotatedTextList()
        if not self:
            return simplified

        text = self[0].text
        hyperlink = self[0].hyperlink
        last_elm = text
        for item in self[1:]:
            if hyperlink == item.hyperlink:
                # Only separate two pieces when both carry visible content.
                sep = " " if item.text.strip() and last_elm.strip() else ""
                text += sep + item.text
                last_elm = item.text
            else:
                simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
                text = item.text
                last_elm = text
                hyperlink = item.hyperlink
        if text:
            simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
        return simplified

    def split_by_newline(self) -> list["AnnotatedTextList"]:
        """Split the list into sub-lists at newline characters in item texts.

        Items without a newline are appended to the currently open sub-list.
        An item containing newlines is split on ``"\\n"``; each fragment becomes
        a deep copy of the item with the fragment as its text, is appended to
        the open sub-list, and then closes that sub-list (this also applies to
        the last fragment of the item).

        Returns:
            The sub-lists in order; a trailing empty sub-list is omitted.
        """
        super_list = []
        active_annotated_text_list = AnnotatedTextList()
        for el in self:
            sub_texts = el.text.split("\n")
            if len(sub_texts) == 1:
                active_annotated_text_list.append(el)
                continue
            for text in sub_texts:
                # Copy so that the original element keeps its full text.
                sub_el = deepcopy(el)
                sub_el.text = text
                active_annotated_text_list.append(sub_el)
                super_list.append(active_annotated_text_list)
                active_annotated_text_list = AnnotatedTextList()
        if active_annotated_text_list:
            super_list.append(active_annotated_text_list)
        return super_list
|
123
|
+
|
124
|
+
|
59
125
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
60
126
|
@override
|
61
127
|
def __init__(
|
62
128
|
self,
|
63
129
|
in_doc: InputDocument,
|
64
130
|
path_or_stream: Union[BytesIO, Path],
|
131
|
+
original_url: Optional[AnyUrl] = None,
|
65
132
|
):
|
66
133
|
super().__init__(in_doc, path_or_stream)
|
67
134
|
self.soup: Optional[Tag] = None
|
@@ -74,6 +141,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
74
141
|
self.ctx = _Context()
|
75
142
|
for i in range(self.max_levels):
|
76
143
|
self.parents[i] = None
|
144
|
+
self.hyperlink = None
|
145
|
+
self.original_url = original_url
|
77
146
|
|
78
147
|
try:
|
79
148
|
raw = (
|
@@ -160,26 +229,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
160
229
|
element: The XML tag to parse.
|
161
230
|
doc: The Docling document to be updated with the parsed content.
|
162
231
|
"""
|
163
|
-
buffer:
|
232
|
+
buffer: AnnotatedTextList = AnnotatedTextList()
|
164
233
|
|
165
234
|
def flush_buffer():
|
166
235
|
if not buffer:
|
167
236
|
return
|
168
|
-
|
237
|
+
annotated_text_list = buffer.simplify_text_elements()
|
238
|
+
parts = annotated_text_list.split_by_newline()
|
169
239
|
buffer.clear()
|
170
|
-
|
240
|
+
|
241
|
+
if not "".join([el.text for el in annotated_text_list]):
|
171
242
|
return
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
243
|
+
|
244
|
+
for annotated_text_list in parts:
|
245
|
+
with self.use_inline_group(annotated_text_list, doc):
|
246
|
+
for annotated_text in annotated_text_list:
|
247
|
+
if annotated_text.text.strip():
|
248
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(
|
249
|
+
annotated_text.text.strip()
|
250
|
+
)
|
251
|
+
doc.add_text(
|
252
|
+
parent=self.parents[self.level],
|
253
|
+
label=DocItemLabel.TEXT,
|
254
|
+
text=seg_clean,
|
255
|
+
content_layer=self.content_layer,
|
256
|
+
hyperlink=annotated_text.hyperlink,
|
257
|
+
)
|
183
258
|
|
184
259
|
for node in element.contents:
|
185
260
|
if isinstance(node, Tag):
|
@@ -187,6 +262,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
187
262
|
if name == "img":
|
188
263
|
flush_buffer()
|
189
264
|
self._emit_image(node, doc)
|
265
|
+
elif name == "a":
|
266
|
+
with self.use_hyperlink(node):
|
267
|
+
self._walk(node, doc)
|
190
268
|
elif name in _BLOCK_TAGS:
|
191
269
|
flush_buffer()
|
192
270
|
self._handle_block(node, doc)
|
@@ -194,28 +272,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
194
272
|
flush_buffer()
|
195
273
|
self._walk(node, doc)
|
196
274
|
else:
|
197
|
-
buffer.
|
275
|
+
buffer.extend(
|
276
|
+
self._extract_text_and_hyperlink_recursively(
|
277
|
+
node, find_parent_annotation=True, keep_newlines=True
|
278
|
+
)
|
279
|
+
)
|
198
280
|
elif isinstance(node, NavigableString) and not isinstance(
|
199
281
|
node, PreformattedString
|
200
282
|
):
|
201
|
-
|
283
|
+
if str(node).strip("\n\r") == "":
|
284
|
+
flush_buffer()
|
285
|
+
else:
|
286
|
+
buffer.extend(
|
287
|
+
self._extract_text_and_hyperlink_recursively(
|
288
|
+
node, find_parent_annotation=True, keep_newlines=True
|
289
|
+
)
|
290
|
+
)
|
202
291
|
|
203
292
|
flush_buffer()
|
204
293
|
|
294
|
+
def _extract_text_and_hyperlink_recursively(
    self,
    item: PageElement,
    ignore_list=False,
    find_parent_annotation=False,
    keep_newlines=False,
) -> AnnotatedTextList:
    """Collect the text below ``item`` together with the active hyperlink.

    Args:
        item: The node to extract text from.
        ignore_list: When true, ``ul``/``ol`` tags contribute no text.
        find_parent_annotation: When true, the ancestors of ``item`` are
            searched for an ``a`` tag with an ``href``; if one is found the
            extraction is redone with that link active.
        keep_newlines: When true, a string that is empty after removing
            newline/carriage-return characters yields a ``"\\n"`` element.

    Returns:
        The annotated text fragments in document order.
    """
    if find_parent_annotation:
        ancestor = item.parent
        while ancestor is not None:
            if ancestor.name == "a" and ancestor.get("href"):
                # Re-run the extraction with the enclosing link active.
                with self.use_hyperlink(ancestor):
                    return self._extract_text_and_hyperlink_recursively(
                        item, ignore_list
                    )
            ancestor = ancestor.parent

    # Preformatted strings never contribute text.
    if isinstance(item, PreformattedString):
        return AnnotatedTextList()

    if isinstance(item, NavigableString):
        stripped = item.strip()
        if stripped:
            content = stripped
        elif keep_newlines and item.strip("\n\r") == "":
            content = "\n"
        else:
            return AnnotatedTextList()
        return AnnotatedTextList(
            [AnnotatedText(text=content, hyperlink=self.hyperlink)]
        )

    tag = cast(Tag, item)
    collected: AnnotatedTextList = AnnotatedTextList()
    if ignore_list and tag.name in ["ul", "ol"]:
        return collected

    for child in tag:
        if isinstance(child, Tag) and child.name == "a":
            # Text below an anchor is extracted while its link is active.
            with self.use_hyperlink(child):
                extracted = self._extract_text_and_hyperlink_recursively(
                    child, ignore_list, keep_newlines=keep_newlines
                )
        else:
            extracted = self._extract_text_and_hyperlink_recursively(
                child, ignore_list, keep_newlines=keep_newlines
            )
        collected.extend(extracted)
    return collected
|
348
|
+
|
349
|
+
@contextmanager
def use_hyperlink(self, tag):
    """Temporarily make the ``href`` of ``tag`` the active hyperlink.

    While the context is active ``self.hyperlink`` holds the link target; the
    previous value is restored on exit, also when the body raises. A tag with
    a missing or empty ``href`` leaves ``self.hyperlink`` untouched.

    When ``self.original_url`` is set, the href is resolved against it. A
    target that does not validate as a URL is stored as a ``Path`` instead.

    Args:
        tag: Object exposing ``get("href")``.

    Yields:
        None.
    """
    this_href = tag.get("href")
    if not this_href:
        # Nothing to activate; keep the current hyperlink as it is.
        yield None
        return

    old_hyperlink = self.hyperlink
    if self.original_url is not None:
        # urljoin accepts only str/bytes and rejects mixed argument types;
        # pydantic v2 URL objects are not str instances, so coerce the base.
        this_href = urljoin(str(self.original_url), this_href)
    # ugly fix for relative links since pydantic does not support them.
    try:
        AnyUrl(this_href)
    except ValidationError:
        this_href = Path(this_href)
    self.hyperlink = this_href
    try:
        yield None
    finally:
        self.hyperlink = old_hyperlink
|
370
|
+
|
371
|
+
@contextmanager
def use_inline_group(
    self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
):
    """Open an inline group for a multi-element list of annotated texts.

    When ``annotated_text_list`` holds more than one item, an inline group is
    added to ``doc`` under the current parent, registered as the parent of the
    next level, and ``self.level`` is incremented. On exit the parent slot at
    the current level is cleared and the level is decremented again. With zero
    or one item nothing is created and no state changes.

    Args:
        annotated_text_list (AnnotatedTextList): Annotated text
        doc (DoclingDocument): Currently used document

    Yields:
        None.
    """
    if len(annotated_text_list) <= 1:
        # A single element needs no grouping.
        yield None
        return

    parent_level = self.level
    inline_fmt = doc.add_group(
        label=GroupLabel.INLINE,
        parent=self.parents[parent_level],
        content_layer=self.content_layer,
    )
    self.parents[parent_level + 1] = inline_fmt
    self.level = parent_level + 1
    try:
        yield None
    finally:
        self.parents[self.level] = None
        self.level -= 1
|
403
|
+
|
205
404
|
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
206
405
|
tag_name = tag.name.lower()
|
207
406
|
# set default content layer to BODY as soon as we encounter a heading
|
208
407
|
self.content_layer = ContentLayer.BODY
|
209
408
|
level = int(tag_name[1])
|
210
|
-
|
211
|
-
|
409
|
+
annotated_text_list = self._extract_text_and_hyperlink_recursively(
|
410
|
+
tag, find_parent_annotation=True
|
411
|
+
)
|
412
|
+
annotated_text = annotated_text_list.to_single_text_element()
|
413
|
+
text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
|
212
414
|
# the first level is for the title item
|
213
415
|
if level == 1:
|
214
416
|
for key in self.parents.keys():
|
215
417
|
self.parents[key] = None
|
216
418
|
self.level = 0
|
217
419
|
self.parents[self.level + 1] = doc.add_title(
|
218
|
-
|
420
|
+
text_clean,
|
421
|
+
content_layer=self.content_layer,
|
422
|
+
hyperlink=annotated_text.hyperlink,
|
219
423
|
)
|
220
424
|
# the other levels need to be lowered by 1 if a title was set
|
221
425
|
else:
|
@@ -241,9 +445,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
241
445
|
self.parents[self.level + 1] = doc.add_heading(
|
242
446
|
parent=self.parents[self.level],
|
243
447
|
text=text_clean,
|
244
|
-
orig=text,
|
448
|
+
orig=annotated_text.text,
|
245
449
|
level=self.level,
|
246
450
|
content_layer=self.content_layer,
|
451
|
+
hyperlink=annotated_text.hyperlink,
|
247
452
|
)
|
248
453
|
self.level += 1
|
249
454
|
for img_tag in tag("img"):
|
@@ -292,37 +497,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
292
497
|
marker = ""
|
293
498
|
|
294
499
|
# 2) extract only the "direct" text from this <li>
|
295
|
-
parts
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
text_part = HTMLDocumentBackend.get_text(child)
|
303
|
-
if text_part:
|
304
|
-
parts.append(text_part)
|
305
|
-
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
|
306
|
-
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
500
|
+
parts = self._extract_text_and_hyperlink_recursively(
|
501
|
+
li, ignore_list=True, find_parent_annotation=True
|
502
|
+
)
|
503
|
+
min_parts = parts.simplify_text_elements()
|
504
|
+
li_text = re.sub(
|
505
|
+
r"\s+|\n+", " ", "".join([el.text for el in min_parts])
|
506
|
+
).strip()
|
307
507
|
|
308
508
|
# 3) add the list item
|
309
509
|
if li_text:
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
510
|
+
if len(min_parts) > 1:
|
511
|
+
# create an empty list element in order to hook the inline group onto that one
|
512
|
+
self.parents[self.level + 1] = doc.add_list_item(
|
513
|
+
text="",
|
514
|
+
enumerated=is_ordered,
|
515
|
+
marker=marker,
|
516
|
+
parent=list_group,
|
517
|
+
content_layer=self.content_layer,
|
518
|
+
)
|
519
|
+
self.level += 1
|
520
|
+
with self.use_inline_group(min_parts, doc):
|
521
|
+
for annotated_text in min_parts:
|
522
|
+
li_text = re.sub(
|
523
|
+
r"\s+|\n+", " ", annotated_text.text
|
524
|
+
).strip()
|
525
|
+
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
526
|
+
doc.add_text(
|
527
|
+
parent=self.parents[self.level],
|
528
|
+
label=DocItemLabel.TEXT,
|
529
|
+
text=li_clean,
|
530
|
+
content_layer=self.content_layer,
|
531
|
+
hyperlink=annotated_text.hyperlink,
|
532
|
+
)
|
533
|
+
|
534
|
+
# 4) recurse into any nested lists, attaching them to this <li> item
|
535
|
+
for sublist in li({"ul", "ol"}, recursive=False):
|
536
|
+
if isinstance(sublist, Tag):
|
537
|
+
self._handle_block(sublist, doc)
|
538
|
+
|
539
|
+
# now the list element with inline group is not a parent anymore
|
540
|
+
self.parents[self.level] = None
|
541
|
+
self.level -= 1
|
542
|
+
else:
|
543
|
+
annotated_text = min_parts[0]
|
544
|
+
li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
|
545
|
+
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
546
|
+
self.parents[self.level + 1] = doc.add_list_item(
|
547
|
+
text=li_clean,
|
548
|
+
enumerated=is_ordered,
|
549
|
+
marker=marker,
|
550
|
+
orig=li_text,
|
551
|
+
parent=list_group,
|
552
|
+
content_layer=self.content_layer,
|
553
|
+
hyperlink=annotated_text.hyperlink,
|
554
|
+
)
|
555
|
+
|
556
|
+
# 4) recurse into any nested lists, attaching them to this <li> item
|
557
|
+
for sublist in li({"ul", "ol"}, recursive=False):
|
558
|
+
if isinstance(sublist, Tag):
|
559
|
+
self.level += 1
|
560
|
+
self._handle_block(sublist, doc)
|
561
|
+
self.parents[self.level + 1] = None
|
562
|
+
self.level -= 1
|
326
563
|
else:
|
327
564
|
for sublist in li({"ul", "ol"}, recursive=False):
|
328
565
|
if isinstance(sublist, Tag):
|
@@ -351,17 +588,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
351
588
|
self._handle_list(tag, doc)
|
352
589
|
|
353
590
|
elif tag_name in {"p", "address", "summary"}:
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
591
|
+
text_list = self._extract_text_and_hyperlink_recursively(
|
592
|
+
tag, find_parent_annotation=True
|
593
|
+
)
|
594
|
+
annotated_texts = text_list.simplify_text_elements()
|
595
|
+
for part in annotated_texts.split_by_newline():
|
596
|
+
with self.use_inline_group(part, doc):
|
597
|
+
for annotated_text in part:
|
598
|
+
if seg := annotated_text.text.strip():
|
599
|
+
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
600
|
+
doc.add_text(
|
601
|
+
parent=self.parents[self.level],
|
602
|
+
label=DocItemLabel.TEXT,
|
603
|
+
text=seg_clean,
|
604
|
+
content_layer=self.content_layer,
|
605
|
+
hyperlink=annotated_text.hyperlink,
|
606
|
+
)
|
607
|
+
|
365
608
|
for img_tag in tag("img"):
|
366
609
|
if isinstance(img_tag, Tag):
|
367
610
|
self._emit_image(img_tag, doc)
|
@@ -380,15 +623,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
380
623
|
|
381
624
|
elif tag_name in {"pre", "code"}:
|
382
625
|
# handle monospace code snippets (pre).
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
626
|
+
text_list = self._extract_text_and_hyperlink_recursively(
|
627
|
+
tag, find_parent_annotation=True
|
628
|
+
)
|
629
|
+
annotated_texts = text_list.simplify_text_elements()
|
630
|
+
with self.use_inline_group(annotated_texts, doc):
|
631
|
+
for annotated_text in annotated_texts:
|
632
|
+
text_clean = HTMLDocumentBackend._clean_unicode(
|
633
|
+
annotated_text.text.strip()
|
634
|
+
)
|
635
|
+
doc.add_code(
|
636
|
+
parent=self.parents[self.level],
|
637
|
+
text=text_clean,
|
638
|
+
content_layer=self.content_layer,
|
639
|
+
hyperlink=annotated_text.hyperlink,
|
640
|
+
)
|
392
641
|
|
393
642
|
elif tag_name == "details":
|
394
643
|
# handle details and its content.
|
@@ -405,22 +654,45 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
405
654
|
|
406
655
|
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
407
656
|
figure = img_tag.find_parent("figure")
|
408
|
-
caption:
|
657
|
+
caption: AnnotatedTextList = AnnotatedTextList()
|
658
|
+
|
659
|
+
# check if the figure has a link - this is HACK:
|
660
|
+
def get_img_hyperlink(img_tag):
|
661
|
+
this_parent = img_tag.parent
|
662
|
+
while this_parent is not None:
|
663
|
+
if this_parent.name == "a" and this_parent.get("href"):
|
664
|
+
return this_parent.get("href")
|
665
|
+
this_parent = this_parent.parent
|
666
|
+
return None
|
667
|
+
|
668
|
+
if img_hyperlink := get_img_hyperlink(img_tag):
|
669
|
+
caption.append(
|
670
|
+
AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
|
671
|
+
)
|
672
|
+
|
409
673
|
if isinstance(figure, Tag):
|
410
674
|
caption_tag = figure.find("figcaption", recursive=False)
|
411
675
|
if isinstance(caption_tag, Tag):
|
412
|
-
caption =
|
413
|
-
|
414
|
-
|
676
|
+
caption = self._extract_text_and_hyperlink_recursively(
|
677
|
+
caption_tag, find_parent_annotation=True
|
678
|
+
)
|
679
|
+
if not caption and img_tag.get("alt"):
|
680
|
+
caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
|
681
|
+
|
682
|
+
caption_anno_text = caption.to_single_text_element()
|
415
683
|
|
416
684
|
caption_item: Optional[TextItem] = None
|
417
|
-
if
|
418
|
-
|
685
|
+
if caption_anno_text.text:
|
686
|
+
text_clean = HTMLDocumentBackend._clean_unicode(
|
687
|
+
caption_anno_text.text.strip()
|
688
|
+
)
|
689
|
+
print(caption_anno_text)
|
419
690
|
caption_item = doc.add_text(
|
420
691
|
label=DocItemLabel.CAPTION,
|
421
|
-
text=
|
422
|
-
orig=
|
692
|
+
text=text_clean,
|
693
|
+
orig=caption_anno_text.text,
|
423
694
|
content_layer=self.content_layer,
|
695
|
+
hyperlink=caption_anno_text.hyperlink,
|
424
696
|
)
|
425
697
|
|
426
698
|
doc.add_picture(
|
@@ -0,0 +1,399 @@
|
|
1
|
+
"""Backend for GBS Google Books schema."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import tarfile
|
5
|
+
from collections.abc import Iterable
|
6
|
+
from dataclasses import dataclass
|
7
|
+
from enum import Enum
|
8
|
+
from io import BytesIO
|
9
|
+
from pathlib import Path
|
10
|
+
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
|
11
|
+
|
12
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
13
|
+
from docling_core.types.doc.page import (
|
14
|
+
BoundingRectangle,
|
15
|
+
PdfPageBoundaryType,
|
16
|
+
PdfPageGeometry,
|
17
|
+
SegmentedPdfPage,
|
18
|
+
TextCell,
|
19
|
+
)
|
20
|
+
from lxml import etree
|
21
|
+
from PIL import Image
|
22
|
+
from PIL.Image import Image as PILImage
|
23
|
+
|
24
|
+
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
25
|
+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
26
|
+
from docling.datamodel.base_models import InputFormat
|
27
|
+
|
28
|
+
if TYPE_CHECKING:
|
29
|
+
from docling.datamodel.document import InputDocument
|
30
|
+
|
31
|
+
_log = logging.getLogger(__name__)
|
32
|
+
|
33
|
+
|
34
|
+
def _get_pdf_page_geometry(
    size: Size,
) -> PdfPageGeometry:
    """Build a page geometry in which every box spans the whole page.

    Args:
        size: Width and height of the page.

    Returns:
        A geometry with angle 0, crop-box boundary type, and the art, bleed,
        crop, media and trim boxes all set to ``(0, 0, width, height)`` in
        top-left coordinates.
    """
    full_page = BoundingBox.from_tuple(
        (0, 0, size.width, size.height), CoordOrigin.TOPLEFT
    )
    # The same full-page box is used for every boundary kind.
    box_names = ("art_bbox", "bleed_bbox", "crop_bbox", "media_bbox", "trim_bbox")
    boxes = dict.fromkeys(box_names, full_page)

    return PdfPageGeometry(
        angle=0.0,
        rect=BoundingRectangle.from_bounding_box(full_page),
        boundary_type=PdfPageBoundaryType.CROP_BOX,
        **boxes,
    )
|
52
|
+
|
53
|
+
|
54
|
+
class MetsGbsPageBackend(PdfPageBackend):
    """Page backend built from a parsed page and the matching page image."""

    def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage):
        """Store the parsed page and its image; the page is valid if parsed."""
        self._im = page_im
        self._dpage = parsed_page
        self.valid = parsed_page is not None

    def is_valid(self) -> bool:
        """Return whether a parsed page was supplied at construction."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the space-joined text of line cells lying mostly in ``bbox``.

        A cell is included when more than half of its own area overlaps
        ``bbox``.
        """
        page_height = self.get_size().height
        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        collected = ""
        for cell in self._dpage.textline_cells:
            top_left_box = cell.rect.to_bounding_box().to_top_left_origin(
                page_height=page_height
            )
            cell_bbox = top_left_box.scaled(scale)
            if cell_bbox.intersection_over_self(bbox) > 0.5:
                # Separate pieces with a space once something was collected.
                collected += (" " if collected else "") + cell.text

        return collected

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return the parsed page."""
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the text line cells of the parsed page."""
        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the scaled top-left boxes of bitmap resources with area > 0."""
        AREA_THRESHOLD = 0  # 32 * 32

        for bitmap in self._dpage.bitmap_resources:
            box = bitmap.rect.to_bounding_box()
            cropbox = box.to_top_left_origin(self.get_size().height)
            if cropbox.area() > AREA_THRESHOLD:
                yield cropbox.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return the page image resized by ``scale`` and cropped to ``cropbox``.

        The stored image must have exactly the page size. Without a cropbox
        the full page is returned.
        """
        page_size = self.get_size()
        image_width, image_height = self._im.size[0], self._im.size[1]
        assert page_size.width == image_width and page_size.height == image_height

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )

        target_size = (
            round(page_size.width * scale),
            round(page_size.height * scale),
        )
        resized = self._im.resize(size=target_size)
        image = resized.crop(cropbox.scaled(scale=scale).as_tuple())
        return image

    def get_size(self) -> Size:
        """Return the page size taken from the parsed page's dimension."""
        dimension = self._dpage.dimension
        return Size(width=dimension.width, height=self._dpage.dimension.height)

    def unload(self) -> None:
        """Drop the references to the page image and the parsed page."""
        for attr_name in ("_im", "_dpage"):
            if hasattr(self, attr_name):
                delattr(self, attr_name)
|
141
|
+
|
142
|
+
|
143
|
+
class _UseType(str, Enum):
    """Kinds of files recognised in a METS file group.

    The values are matched against the ``USE`` attribute of ``fileGrp``
    elements; groups with any other value are ignored by the document backend.
    """

    IMAGE = "image"
    OCR = "OCR"
    COORD_OCR = "coordOCR"
|
147
|
+
|
148
|
+
|
149
|
+
@dataclass
class _FileInfo:
    """Description of one file entry listed in the METS document."""

    # Value of the file element's ID attribute.
    file_id: str
    # Value of the file element's MIMETYPE attribute.
    mimetype: str
    # xlink:href of the file's FLocat element; used as the member name when
    # the file is read from the archive.
    path: str
    # USE type of the file group the entry belongs to.
    use: _UseType
|
155
|
+
|
156
|
+
|
157
|
+
@dataclass
class _PageFiles:
    """Files associated with a single page, one slot per USE type."""

    # Entry from the "image" file group, if the page references one.
    image: Optional[_FileInfo] = None
    # Entry from the "OCR" file group, if the page references one.
    ocr: Optional[_FileInfo] = None
    # Entry from the "coordOCR" file group, if the page references one.
    coordOCR: Optional[_FileInfo] = None
|
162
|
+
|
163
|
+
|
164
|
+
def _extract_rect(title_str: str) -> Optional[BoundingRectangle]:
    """Parse the bounding box out of a ``;``-separated title string.

    Example input: ``'bbox 279 177 306 214;x_wconf 97'``.

    Only the first field starting with ``"bbox "`` is considered. Its integer
    coordinates are read as a top-left-origin box.

    Returns:
        The rectangle, or None when there is no bbox field or it cannot be
        converted.
    """
    for raw_field in title_str.split(";"):
        field = raw_field.strip()
        if not field.startswith("bbox "):
            continue
        try:
            values = tuple(int(token) for token in field.split()[1:])
            box = BoundingBox.from_tuple(values, origin=CoordOrigin.TOPLEFT)
            return BoundingRectangle.from_bounding_box(bbox=box)
        except Exception:
            # A malformed bbox field ends the search without a result.
            return None
    return None
|
183
|
+
|
184
|
+
|
185
|
+
def _extract_confidence(title_str) -> float:
|
186
|
+
"""Extracts x_wconf (OCR confidence) value from title string."""
|
187
|
+
for part in title_str.split(";"):
|
188
|
+
part = part.strip()
|
189
|
+
if part.startswith("x_wconf"):
|
190
|
+
try:
|
191
|
+
return float(part.split()[1]) / 100.0
|
192
|
+
except Exception:
|
193
|
+
return 1
|
194
|
+
return 1
|
195
|
+
|
196
|
+
|
197
|
+
class MetsGbsDocumentBackend(PdfDocumentBackend):
|
198
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
199
|
+
super().__init__(in_doc, path_or_stream)
|
200
|
+
|
201
|
+
self._tar: tarfile.TarFile = (
|
202
|
+
tarfile.open(name=self.path_or_stream, mode="r:gz")
|
203
|
+
if isinstance(self.path_or_stream, Path)
|
204
|
+
else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
|
205
|
+
)
|
206
|
+
self.root_mets: Optional[etree._Element] = None
|
207
|
+
self.page_map: Dict[int, _PageFiles] = {}
|
208
|
+
|
209
|
+
for member in self._tar.getmembers():
|
210
|
+
if member.name.endswith(".xml"):
|
211
|
+
file = self._tar.extractfile(member)
|
212
|
+
if file is not None:
|
213
|
+
content = file.read()
|
214
|
+
self.root_mets = self._validate_mets_xml(content)
|
215
|
+
if self.root_mets is not None:
|
216
|
+
break
|
217
|
+
|
218
|
+
if self.root_mets is None:
|
219
|
+
raise RuntimeError(
|
220
|
+
f"METS GBS backend could not load document {self.document_hash}."
|
221
|
+
)
|
222
|
+
|
223
|
+
ns = {
|
224
|
+
"mets": "http://www.loc.gov/METS/",
|
225
|
+
"xlink": "http://www.w3.org/1999/xlink",
|
226
|
+
"xsi": "http://www.w3.org/2001/XMLSchema-instance",
|
227
|
+
"gbs": "http://books.google.com/gbs",
|
228
|
+
"premis": "info:lc/xmlns/premis-v2",
|
229
|
+
"marc": "http://www.loc.gov/MARC21/slim",
|
230
|
+
}
|
231
|
+
|
232
|
+
file_info_by_id: Dict[str, _FileInfo] = {}
|
233
|
+
|
234
|
+
for filegrp in self.root_mets.xpath(".//mets:fileGrp", namespaces=ns):
|
235
|
+
use_raw = filegrp.get("USE")
|
236
|
+
try:
|
237
|
+
use = _UseType(use_raw)
|
238
|
+
except ValueError:
|
239
|
+
continue # Ignore unknown USE types
|
240
|
+
|
241
|
+
for file_elem in filegrp.xpath("./mets:file", namespaces=ns):
|
242
|
+
file_id = file_elem.get("ID")
|
243
|
+
mimetype = file_elem.get("MIMETYPE")
|
244
|
+
flocat_elem = file_elem.find("mets:FLocat", namespaces=ns)
|
245
|
+
href = (
|
246
|
+
flocat_elem.get("{http://www.w3.org/1999/xlink}href")
|
247
|
+
if flocat_elem is not None
|
248
|
+
else None
|
249
|
+
)
|
250
|
+
if href is None:
|
251
|
+
continue
|
252
|
+
|
253
|
+
file_info_by_id[file_id] = _FileInfo(
|
254
|
+
file_id=file_id, mimetype=mimetype, path=href, use=use
|
255
|
+
)
|
256
|
+
|
257
|
+
USE_TO_ATTR = {
|
258
|
+
_UseType.IMAGE: "image",
|
259
|
+
_UseType.OCR: "ocr",
|
260
|
+
_UseType.COORD_OCR: "coordOCR",
|
261
|
+
}
|
262
|
+
|
263
|
+
for div in self.root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns):
|
264
|
+
order_str = div.get("ORDER")
|
265
|
+
if not order_str:
|
266
|
+
continue
|
267
|
+
try:
|
268
|
+
page_no = int(order_str) - 1 # make 0-index pages
|
269
|
+
except ValueError:
|
270
|
+
continue
|
271
|
+
|
272
|
+
page_files = _PageFiles()
|
273
|
+
|
274
|
+
for fptr in div.xpath("./mets:fptr", namespaces=ns):
|
275
|
+
file_id = fptr.get("FILEID")
|
276
|
+
file_info = file_info_by_id.get(file_id)
|
277
|
+
|
278
|
+
if file_info:
|
279
|
+
attr = USE_TO_ATTR.get(file_info.use)
|
280
|
+
if attr:
|
281
|
+
setattr(page_files, attr, file_info)
|
282
|
+
|
283
|
+
self.page_map[page_no] = page_files
|
284
|
+
|
285
|
+
def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]:
|
286
|
+
root: etree._Element = etree.fromstring(xml_string)
|
287
|
+
if (
|
288
|
+
root.tag == "{http://www.loc.gov/METS/}mets"
|
289
|
+
and root.get("PROFILE") == "gbs"
|
290
|
+
):
|
291
|
+
return root
|
292
|
+
|
293
|
+
_log.warning(f"The root element is not <mets:mets> with PROFILE='gbs': {root}")
|
294
|
+
return None
|
295
|
+
|
296
|
+
def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
    """Assemble the parsed page and its image for ``page_no``.

    The page's image and coordinate-OCR entries are looked up in
    ``self.page_map`` and read from ``self._tar``. The OCR markup is parsed
    with lxml's HTML parser; every ``ocrx_word`` and ``ocr_line`` span whose
    ``title`` attribute yields a rectangle becomes a ``TextCell``.

    The page size starts as the image size and is replaced by the rectangle
    extracted from the first ``ocr_page`` div when one is available. The
    image is then resized to that size and converted to RGB.

    Args:
        page_no: Key into ``self.page_map``.

    Returns:
        A ``(SegmentedPdfPage, image)`` tuple.

    Raises:
        KeyError: If ``page_no`` is not present in ``self.page_map``.
        AssertionError: If the page has no image or coordOCR entry, or the
            archive yields no file object for one of them.
    """
    # TODO: use better fallbacks...
    page_files = self.page_map[page_no]
    image_info = page_files.image
    assert image_info is not None
    ocr_info = page_files.coordOCR
    assert ocr_info is not None

    image_file = self._tar.extractfile(image_info.path)
    assert image_file is not None
    im: PILImage = Image.open(BytesIO(image_file.read()))
    ocr_file = self._tar.extractfile(ocr_info.path)
    assert ocr_file is not None
    ocr_root: etree._Element = etree.fromstring(
        ocr_file.read(), parser=etree.HTMLParser()
    )

    def cells_from(xpath_expr: str) -> List[TextCell]:
        # Turn every span matched by xpath_expr into a TextCell; spans for
        # which no rectangle can be extracted are dropped, but they still
        # consume an index so indices reflect the position in the match list.
        found: List[TextCell] = []
        for ix, span in enumerate(ocr_root.xpath(xpath_expr)):
            text = "".join(span.itertext()).strip()
            title = span.attrib.get("title", "")
            rect = _extract_rect(title)
            conf = _extract_confidence(title)
            if rect:
                found.append(
                    TextCell(
                        index=ix,
                        text=text,
                        orig=text,
                        rect=rect,
                        from_ocr=True,
                        confidence=conf,
                    )
                )
        return found

    page_divs = ocr_root.xpath("//div[@class='ocr_page']")

    # Default to the image dimensions; prefer the OCR page rectangle if present.
    size = Size(width=im.size[0], height=im.size[1])
    if not page_divs:
        _log.error(f"Could not find ocr_page for page {page_no}")
    else:
        page_rect = _extract_rect(page_divs[0].attrib.get("title", ""))
        if page_rect:
            size = Size(width=page_rect.width, height=page_rect.height)

    im = im.resize(size=(round(size.width), round(size.height))).convert("RGB")

    # Words are collected before lines, matching the original processing order.
    word_cells = cells_from("//span[@class='ocrx_word']")
    line_cells = cells_from("//span[@class='ocr_line']")

    page = SegmentedPdfPage(
        dimension=_get_pdf_page_geometry(size),
        textline_cells=line_cells,
        char_cells=[],
        word_cells=word_cells,
        has_textlines=True,
        has_words=True,
        has_chars=False,
    )
    return page, im
|
377
|
+
|
378
|
+
def page_count(self) -> int:
    """Return the number of entries held in ``self.page_map``."""
    pages = self.page_map
    return len(pages)
|
380
|
+
|
381
|
+
def load_page(self, page_no: int) -> MetsGbsPageBackend:
    """Parse page ``page_no`` and wrap the result in a page backend.

    Args:
        page_no: Page key forwarded to ``self._parse_page``.

    Returns:
        A ``MetsGbsPageBackend`` built from the parsed page and its image.
    """
    # TODO: is this thread-safe?
    parsed_page, page_image = self._parse_page(page_no)
    return MetsGbsPageBackend(parsed_page=parsed_page, page_im=page_image)
|
385
|
+
|
386
|
+
def is_valid(self) -> bool:
    """Return True when a METS root is present and at least one page is mapped."""
    if self.root_mets is None:
        return False
    return self.page_count() > 0
|
388
|
+
|
389
|
+
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the input formats this backend handles (only ``METS_GBS``)."""
    formats: Set[InputFormat] = set()
    formats.add(InputFormat.METS_GBS)
    return formats
|
392
|
+
|
393
|
+
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend exposes its content page by page."""
    paginated = True
    return paginated
|
396
|
+
|
397
|
+
def unload(self) -> None:
    """Release the backend's resources, including the open tar archive.

    The parent class's ``unload`` runs first. The archive held in
    ``self._tar`` is closed afterwards even if the parent's cleanup raises,
    so the archive handle is not leaked on that error path; the exception
    still propagates to the caller.
    """
    try:
        super().unload()
    finally:
        self._tar.close()
|
docling/backend/pdf_backend.py
CHANGED
@@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
|
84
84
|
|
85
85
|
buf.seek(0)
|
86
86
|
self.path_or_stream = buf
|
87
|
-
|
87
|
+
elif self.input_format not in self.supported_formats():
|
88
88
|
raise RuntimeError(
|
89
|
-
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
|
89
|
+
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
|
90
90
|
)
|
91
91
|
|
92
92
|
@abstractmethod
|
@@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
|
99
99
|
|
100
100
|
@classmethod
|
101
101
|
def supported_formats(cls) -> Set[InputFormat]:
|
102
|
-
return {InputFormat.PDF}
|
102
|
+
return {InputFormat.PDF, InputFormat.IMAGE}
|
103
103
|
|
104
104
|
@classmethod
|
105
105
|
def supports_pagination(cls) -> bool:
|
docling/cli/main.py
CHANGED
@@ -26,6 +26,7 @@ from rich.console import Console
|
|
26
26
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
27
27
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
28
28
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
29
|
+
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
|
29
30
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
30
31
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
31
32
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
@@ -607,9 +608,18 @@ def convert( # noqa: C901
|
|
607
608
|
backend=backend, # pdf_backend
|
608
609
|
)
|
609
610
|
|
611
|
+
# METS GBS options
|
612
|
+
mets_gbs_options = pipeline_options.model_copy()
|
613
|
+
mets_gbs_options.do_ocr = False
|
614
|
+
mets_gbs_format_option = PdfFormatOption(
|
615
|
+
pipeline_options=mets_gbs_options,
|
616
|
+
backend=MetsGbsDocumentBackend,
|
617
|
+
)
|
618
|
+
|
610
619
|
format_options = {
|
611
620
|
InputFormat.PDF: pdf_format_option,
|
612
621
|
InputFormat.IMAGE: pdf_format_option,
|
622
|
+
InputFormat.METS_GBS: mets_gbs_format_option,
|
613
623
|
}
|
614
624
|
|
615
625
|
elif pipeline == ProcessingPipeline.VLM:
|
docling/datamodel/base_models.py
CHANGED
@@ -56,6 +56,7 @@ class InputFormat(str, Enum):
|
|
56
56
|
XLSX = "xlsx"
|
57
57
|
XML_USPTO = "xml_uspto"
|
58
58
|
XML_JATS = "xml_jats"
|
59
|
+
METS_GBS = "mets_gbs"
|
59
60
|
JSON_DOCLING = "json_docling"
|
60
61
|
AUDIO = "audio"
|
61
62
|
|
@@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
81
82
|
InputFormat.CSV: ["csv"],
|
82
83
|
InputFormat.XLSX: ["xlsx", "xlsm"],
|
83
84
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
85
|
+
InputFormat.METS_GBS: ["tar.gz"],
|
84
86
|
InputFormat.JSON_DOCLING: ["json"],
|
85
87
|
InputFormat.AUDIO: ["wav", "mp3"],
|
86
88
|
}
|
@@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
113
115
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
114
116
|
],
|
115
117
|
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
118
|
+
InputFormat.METS_GBS: ["application/mets+xml"],
|
116
119
|
InputFormat.JSON_DOCLING: ["application/json"],
|
117
120
|
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
|
118
121
|
}
|
docling/datamodel/document.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import csv
|
2
2
|
import logging
|
3
3
|
import re
|
4
|
+
import tarfile
|
4
5
|
from collections.abc import Iterable
|
5
6
|
from enum import Enum
|
6
7
|
from io import BytesIO
|
@@ -314,6 +315,10 @@ class _DocumentConversionInput(BaseModel):
|
|
314
315
|
elif objname.endswith(".pptx"):
|
315
316
|
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
316
317
|
|
318
|
+
if mime is not None and mime.lower() == "application/gzip":
|
319
|
+
if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
|
320
|
+
mime = detected_mime
|
321
|
+
|
317
322
|
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
318
323
|
mime = mime or _DocumentConversionInput._detect_csv(content)
|
319
324
|
mime = mime or "text/plain"
|
@@ -457,3 +462,24 @@ class _DocumentConversionInput(BaseModel):
|
|
457
462
|
return None
|
458
463
|
|
459
464
|
return None
|
465
|
+
|
466
|
+
@staticmethod
def _detect_mets_gbs(
    obj: Union[Path, DocumentStream],
) -> Optional[Literal["application/mets+xml"]]:
    """Report whether a gzip input is a tar archive containing a METS XML file.

    The input is opened as a gzip-compressed tar archive. Each member whose
    name ends in ``.xml`` is read and searched for the METS namespace URI.

    Args:
        obj: A filesystem path, or an object whose ``stream`` attribute is
            the ``BytesIO`` to inspect.

    Returns:
        ``"application/mets+xml"`` when a matching member is found.
        ``None`` when no member matches, or when the input cannot be read
        as a gzip-compressed tar archive (for example a plain ``.gz`` file).

    Notes:
        When a ``BytesIO`` is inspected, its position is restored afterwards
        so that probing does not disturb later readers of the same stream.
    """
    content = obj if isinstance(obj, Path) else obj.stream
    is_stream = isinstance(content, BytesIO)
    # Remember where the caller's stream was so probing leaves it untouched.
    start_pos = content.tell() if is_stream else None
    tar: tarfile.TarFile
    member: tarfile.TarInfo
    try:
        with tarfile.open(
            name=content if isinstance(content, Path) else None,
            fileobj=content if is_stream else None,
            mode="r:gz",
        ) as tar:
            # Iterate lazily: a match near the start of the archive does not
            # require decompressing and indexing every remaining member.
            for member in tar:
                if not member.name.endswith(".xml"):
                    continue
                file = tar.extractfile(member)
                if file is None:
                    # Not a regular file (e.g. a directory entry).
                    continue
                content_str = file.read().decode(errors="ignore")
                if "http://www.loc.gov/METS/" in content_str:
                    return "application/mets+xml"
    except (tarfile.TarError, EOFError):
        # Not every gzip file is a tar archive; such inputs are simply not
        # METS and must not abort format detection.
        return None
    finally:
        if is_stream:
            content.seek(start_pos)
    return None
|
@@ -1,5 +1,5 @@
|
|
1
1
|
from enum import Enum
|
2
|
-
from typing import Any,
|
2
|
+
from typing import Any, Dict, List, Literal, Optional
|
3
3
|
|
4
4
|
from docling_core.types.doc.page import SegmentedPage
|
5
5
|
from pydantic import AnyUrl, BaseModel
|
@@ -10,11 +10,17 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
10
10
|
|
11
11
|
class BaseVlmOptions(BaseModel):
|
12
12
|
kind: str
|
13
|
-
prompt:
|
13
|
+
prompt: str
|
14
14
|
scale: float = 2.0
|
15
15
|
max_size: Optional[int] = None
|
16
16
|
temperature: float = 0.0
|
17
17
|
|
18
|
+
def build_prompt(self, page: Optional[SegmentedPage]) -> str:
    """Return the prompt to use for ``page``.

    This implementation does not inspect ``page``; it returns
    ``self.prompt`` as is.
    """
    base_prompt = self.prompt
    return base_prompt
|
20
|
+
|
21
|
+
def decode_response(self, text: str) -> str:
    """Return the model response to store; here ``text`` is passed through unchanged."""
    decoded = text
    return decoded
|
23
|
+
|
18
24
|
|
19
25
|
class ResponseFormat(str, Enum):
|
20
26
|
DOCTAGS = "doctags"
|
docling/document_converter.py
CHANGED
@@ -20,6 +20,7 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
|
|
20
20
|
from docling.backend.html_backend import HTMLDocumentBackend
|
21
21
|
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
22
22
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
23
|
+
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
|
23
24
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
24
25
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
25
26
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
@@ -159,6 +160,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
159
160
|
InputFormat.XML_JATS: FormatOption(
|
160
161
|
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
|
161
162
|
),
|
163
|
+
InputFormat.METS_GBS: FormatOption(
|
164
|
+
pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
|
165
|
+
),
|
162
166
|
InputFormat.IMAGE: FormatOption(
|
163
167
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
|
164
168
|
),
|
docling/models/api_vlm_model.py
CHANGED
@@ -53,11 +53,7 @@ class ApiVlmModel(BasePageModel):
|
|
53
53
|
if hi_res_image.mode != "RGB":
|
54
54
|
hi_res_image = hi_res_image.convert("RGB")
|
55
55
|
|
56
|
-
|
57
|
-
prompt = self.vlm_options.prompt(page.parsed_page)
|
58
|
-
else:
|
59
|
-
prompt = self.vlm_options.prompt
|
60
|
-
|
56
|
+
prompt = self.vlm_options.build_prompt(page.parsed_page)
|
61
57
|
page_tags = api_image_request(
|
62
58
|
image=hi_res_image,
|
63
59
|
prompt=prompt,
|
@@ -67,6 +63,7 @@ class ApiVlmModel(BasePageModel):
|
|
67
63
|
**self.params,
|
68
64
|
)
|
69
65
|
|
66
|
+
page_tags = self.vlm_options.decode_response(page_tags)
|
70
67
|
page.predictions.vlm_response = VlmPrediction(text=page_tags)
|
71
68
|
|
72
69
|
return page
|
@@ -135,10 +135,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
135
135
|
)
|
136
136
|
|
137
137
|
# Define prompt structure
|
138
|
-
|
139
|
-
user_prompt = self.vlm_options.prompt(page.parsed_page)
|
140
|
-
else:
|
141
|
-
user_prompt = self.vlm_options.prompt
|
138
|
+
user_prompt = self.vlm_options.build_prompt(page.parsed_page)
|
142
139
|
prompt = self.formulate_prompt(user_prompt)
|
143
140
|
|
144
141
|
inputs = self.processor(
|
@@ -166,6 +163,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
166
163
|
_log.debug(
|
167
164
|
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
|
168
165
|
)
|
166
|
+
generated_texts = self.vlm_options.decode_response(generated_texts)
|
169
167
|
page.predictions.vlm_response = VlmPrediction(
|
170
168
|
text=generated_texts,
|
171
169
|
generation_time=generation_time,
|
@@ -84,10 +84,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
84
84
|
if hi_res_image.mode != "RGB":
|
85
85
|
hi_res_image = hi_res_image.convert("RGB")
|
86
86
|
|
87
|
-
|
88
|
-
user_prompt = self.vlm_options.prompt(page.parsed_page)
|
89
|
-
else:
|
90
|
-
user_prompt = self.vlm_options.prompt
|
87
|
+
user_prompt = self.vlm_options.build_prompt(page.parsed_page)
|
91
88
|
prompt = self.apply_chat_template(
|
92
89
|
self.processor, self.config, user_prompt, num_images=1
|
93
90
|
)
|
@@ -142,6 +139,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
142
139
|
_log.debug(
|
143
140
|
f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
|
144
141
|
)
|
142
|
+
page_tags = self.vlm_options.decode_response(page_tags)
|
145
143
|
page.predictions.vlm_response = VlmPrediction(
|
146
144
|
text=page_tags,
|
147
145
|
generation_time=generation_time,
|
@@ -8,7 +8,10 @@ from typing import Any, Callable, List
|
|
8
8
|
|
9
9
|
from docling_core.types.doc import NodeItem
|
10
10
|
|
11
|
-
from docling.backend.abstract_backend import
|
11
|
+
from docling.backend.abstract_backend import (
|
12
|
+
AbstractDocumentBackend,
|
13
|
+
PaginatedDocumentBackend,
|
14
|
+
)
|
12
15
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
13
16
|
from docling.datamodel.base_models import (
|
14
17
|
ConversionStatus,
|
@@ -126,10 +129,10 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
126
129
|
yield from page_batch
|
127
130
|
|
128
131
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
129
|
-
if not isinstance(conv_res.input._backend,
|
132
|
+
if not isinstance(conv_res.input._backend, PaginatedDocumentBackend):
|
130
133
|
raise RuntimeError(
|
131
|
-
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a
|
132
|
-
f"Can not convert this with a PDF pipeline. "
|
134
|
+
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a paginated backend. "
|
135
|
+
f"Can not convert this with a paginated PDF pipeline. "
|
133
136
|
f"Please check your format configuration on DocumentConverter."
|
134
137
|
)
|
135
138
|
# conv_res.status = ConversionStatus.FAILURE
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.45.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -1,5 +1,5 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
docling/document_converter.py,sha256=
|
2
|
+
docling/document_converter.py,sha256=7lid_uhGNuurYICweaA1jqtSbnhf3hpuUYUNleHh-Ww,15924
|
3
3
|
docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
|
4
4
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
5
5
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -9,13 +9,14 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
|
|
9
9
|
docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
|
10
10
|
docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
|
11
11
|
docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
|
12
|
-
docling/backend/html_backend.py,sha256=
|
12
|
+
docling/backend/html_backend.py,sha256=jTkpdJ-EKMmkbUfh88DONVG-gENE7m0_cnIhWpWSobI,34523
|
13
13
|
docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
|
14
|
+
docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
|
14
15
|
docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
|
15
16
|
docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
|
16
17
|
docling/backend/msword_backend.py,sha256=DxMgPcq-Ao1vq7X2v8qqWeMs9MryPw_Jw3YRAAdXBtM,44904
|
17
18
|
docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
|
18
|
-
docling/backend/pdf_backend.py,sha256=
|
19
|
+
docling/backend/pdf_backend.py,sha256=Wcd1NSrAMjXK8VicTki5p-j-JLofklt07eF0kIG17_0,3361
|
19
20
|
docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
|
20
21
|
docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
22
|
docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -28,22 +29,22 @@ docling/backend/xml/jats_backend.py,sha256=LPj33EFdi2MRCakkLWrRLlUAc-B-949f8zp5g
|
|
28
29
|
docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
|
29
30
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
30
31
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
-
docling/cli/main.py,sha256
|
32
|
+
docling/cli/main.py,sha256=-W_vdKvSm5gZUZyvRpFH0YMI_1iJrP5sJOZ5_1bLorw,30359
|
32
33
|
docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
|
33
34
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
34
35
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
36
|
docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
|
36
37
|
docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF-xxHoVQ,2187
|
37
|
-
docling/datamodel/base_models.py,sha256=
|
38
|
-
docling/datamodel/document.py,sha256=
|
38
|
+
docling/datamodel/base_models.py,sha256=Ifd8PPHs4sW7ScwSqpa-y3rwgPbde_iw13Y2NUCPfU8,11944
|
39
|
+
docling/datamodel/document.py,sha256=zsxFYXvo6GtwGNogSDoBB1TFvkm7IOrP_VnqXNqBhJs,17329
|
39
40
|
docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
|
40
41
|
docling/datamodel/pipeline_options.py,sha256=TaBmCBRjSxyoh79UkpEkPzokLYS8BA2QJam86g9pT5g,10544
|
41
42
|
docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
|
42
|
-
docling/datamodel/pipeline_options_vlm_model.py,sha256=
|
43
|
+
docling/datamodel/pipeline_options_vlm_model.py,sha256=eH-Cj_8aic9FdX4xGlBcf5_R9e152JAL2LhtY8d0rhw,2498
|
43
44
|
docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
|
44
45
|
docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
|
45
46
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
46
|
-
docling/models/api_vlm_model.py,sha256
|
47
|
+
docling/models/api_vlm_model.py,sha256=-zisU32pgDRbychyG6-neB0qweNbPaYnLXwiGT7SEdI,2859
|
47
48
|
docling/models/base_model.py,sha256=NNjIapqCruAEAWR-CCdsNgXc2QkwiPYAcaQ_ZYe1W28,2978
|
48
49
|
docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDWtR0,8193
|
49
50
|
docling/models/code_formula_model.py,sha256=5uWh-eI-Ejmv3DujKJoKKgJBuvPLokt7AJ_ybt8VHEw,11373
|
@@ -70,11 +71,11 @@ docling/models/plugins/defaults.py,sha256=OAHWW2tCcUXSyDMFxV_lXVRjSBJ1n6z-Eb3R8c
|
|
70
71
|
docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
71
72
|
docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
|
72
73
|
docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
73
|
-
docling/models/vlm_models_inline/hf_transformers_model.py,sha256=
|
74
|
-
docling/models/vlm_models_inline/mlx_model.py,sha256=
|
74
|
+
docling/models/vlm_models_inline/hf_transformers_model.py,sha256=Rwdr7neDpn5ehtrp6n7G21fcPBK2m9Har_6BFNdyw-Q,8359
|
75
|
+
docling/models/vlm_models_inline/mlx_model.py,sha256=YYYmopsITlX17JVS5KhLlb1IQSEVoSECNx_fXLHNpAc,5880
|
75
76
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
76
77
|
docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
|
77
|
-
docling/pipeline/base_pipeline.py,sha256=
|
78
|
+
docling/pipeline/base_pipeline.py,sha256=MOKZtx3jNYotfntgoJHoyb6UsvdvG6bQLyDl9Lxvc1w,9586
|
78
79
|
docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
|
79
80
|
docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
|
80
81
|
docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=Rjdq1x2fRHBA0rMHJ6rqqHzxVVzgTEALBBj5d30oOZ8,26018
|
@@ -92,9 +93,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
|
|
92
93
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
93
94
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
94
95
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
95
|
-
docling-2.
|
96
|
-
docling-2.
|
97
|
-
docling-2.
|
98
|
-
docling-2.
|
99
|
-
docling-2.
|
100
|
-
docling-2.
|
96
|
+
docling-2.45.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
97
|
+
docling-2.45.0.dist-info/METADATA,sha256=-iB6xJ4H7DIStzPn-ruYcBa_Tq45Ijk52zfoM_6FkCE,10459
|
98
|
+
docling-2.45.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
99
|
+
docling-2.45.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
100
|
+
docling-2.45.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
101
|
+
docling-2.45.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|