docling 2.44.0__py3-none-any.whl → 2.45.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
docling/backend/html_backend.py CHANGED
@@ -1,8 +1,11 @@
 import logging
 import re
+from contextlib import contextmanager
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union, cast
+from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
 from bs4.element import PreformattedString
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
     TextItem,
 )
 from docling_core.types.doc.document import ContentLayer
-from pydantic import BaseModel
+from pydantic import AnyUrl, BaseModel, ValidationError
 from typing_extensions import override
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -56,12 +59,76 @@ class _Context(BaseModel):
     list_start_by_ref: dict[str, int] = {}
 
 
+class AnnotatedText(BaseModel):
+    text: str
+    hyperlink: Union[AnyUrl, Path, None] = None
+
+
+class AnnotatedTextList(list):
+    def to_single_text_element(self) -> AnnotatedText:
+        current_h = None
+        current_text = ""
+        for at in self:
+            t = at.text
+            h = at.hyperlink
+            current_text += t.strip() + " "
+            if h is not None and current_h is None:
+                current_h = h
+            elif h is not None and current_h is not None and h != current_h:
+                _log.warning(
+                    f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
+                )
+        return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
+
+    def simplify_text_elements(self) -> "AnnotatedTextList":
+        simplified = AnnotatedTextList()
+        if not self:
+            return self
+        text = self[0].text
+        hyperlink = self[0].hyperlink
+        last_elm = text
+        for i in range(1, len(self)):
+            if hyperlink == self[i].hyperlink:
+                sep = " "
+                if not self[i].text.strip() or not last_elm.strip():
+                    sep = ""
+                text += sep + self[i].text
+                last_elm = self[i].text
+            else:
+                simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
+                text = self[i].text
+                last_elm = text
+                hyperlink = self[i].hyperlink
+        if text:
+            simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
+        return simplified
+
+    def split_by_newline(self):
+        super_list = []
+        active_annotated_text_list = AnnotatedTextList()
+        for el in self:
+            sub_texts = el.text.split("\n")
+            if len(sub_texts) == 1:
+                active_annotated_text_list.append(el)
+            else:
+                for text in sub_texts:
+                    sub_el = deepcopy(el)
+                    sub_el.text = text
+                    active_annotated_text_list.append(sub_el)
+                    super_list.append(active_annotated_text_list)
+                    active_annotated_text_list = AnnotatedTextList()
+        if active_annotated_text_list:
+            super_list.append(active_annotated_text_list)
+        return super_list
+
+
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
     @override
     def __init__(
         self,
         in_doc: InputDocument,
         path_or_stream: Union[BytesIO, Path],
+        original_url: Optional[AnyUrl] = None,
     ):
         super().__init__(in_doc, path_or_stream)
         self.soup: Optional[Tag] = None
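
A short usage sketch of the new span helpers (illustrative only: the class names come from this diff, the input strings are made up):

# Three spans, one of them hyperlinked; pydantic coerces the str to AnyUrl.
from docling.backend.html_backend import AnnotatedText, AnnotatedTextList

spans = AnnotatedTextList(
    [
        AnnotatedText(text="See the "),
        AnnotatedText(text="docs", hyperlink="https://example.com/docs"),
        AnnotatedText(text=" for details.\nNext line."),
    ]
)

merged = spans.simplify_text_elements()   # adjacent spans with the same hyperlink collapse
parts = spans.split_by_newline()          # a "\n" closes one inline group and opens the next
single = merged.to_single_text_element()  # one text; the first hyperlink wins, clashes are logged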
@@ -74,6 +141,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.ctx = _Context()
         for i in range(self.max_levels):
             self.parents[i] = None
+        self.hyperlink = None
+        self.original_url = original_url
 
         try:
             raw = (
@@ -160,26 +229,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             element: The XML tag to parse.
             doc: The Docling document to be updated with the parsed content.
         """
-        buffer: list[str] = []
+        buffer: AnnotatedTextList = AnnotatedTextList()
 
         def flush_buffer():
             if not buffer:
                 return
-            text = "".join(buffer).strip()
+            annotated_text_list = buffer.simplify_text_elements()
+            parts = annotated_text_list.split_by_newline()
             buffer.clear()
-            if not text:
+
+            if not "".join([el.text for el in annotated_text_list]):
                 return
-            for part in text.split("\n"):
-                seg = part.strip()
-                seg_clean = HTMLDocumentBackend._clean_unicode(seg)
-                if seg:
-                    doc.add_text(
-                        label=DocItemLabel.TEXT,
-                        text=seg_clean,
-                        orig=seg,
-                        parent=self.parents[self.level],
-                        content_layer=self.content_layer,
-                    )
+
+            for annotated_text_list in parts:
+                with self.use_inline_group(annotated_text_list, doc):
+                    for annotated_text in annotated_text_list:
+                        if annotated_text.text.strip():
+                            seg_clean = HTMLDocumentBackend._clean_unicode(
+                                annotated_text.text.strip()
+                            )
+                            doc.add_text(
+                                parent=self.parents[self.level],
+                                label=DocItemLabel.TEXT,
+                                text=seg_clean,
+                                content_layer=self.content_layer,
+                                hyperlink=annotated_text.hyperlink,
+                            )
 
         for node in element.contents:
             if isinstance(node, Tag):
@@ -187,6 +262,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 if name == "img":
                     flush_buffer()
                     self._emit_image(node, doc)
+                elif name == "a":
+                    with self.use_hyperlink(node):
+                        self._walk(node, doc)
                 elif name in _BLOCK_TAGS:
                     flush_buffer()
                     self._handle_block(node, doc)
@@ -194,28 +272,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     flush_buffer()
                     self._walk(node, doc)
                 else:
-                    buffer.append(node.text)
+                    buffer.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            node, find_parent_annotation=True, keep_newlines=True
+                        )
+                    )
             elif isinstance(node, NavigableString) and not isinstance(
                 node, PreformattedString
             ):
-                buffer.append(str(node))
+                if str(node).strip("\n\r") == "":
+                    flush_buffer()
+                else:
+                    buffer.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            node, find_parent_annotation=True, keep_newlines=True
+                        )
+                    )
 
         flush_buffer()
 
+    def _extract_text_and_hyperlink_recursively(
+        self,
+        item: PageElement,
+        ignore_list=False,
+        find_parent_annotation=False,
+        keep_newlines=False,
+    ) -> AnnotatedTextList:
+        result: AnnotatedTextList = AnnotatedTextList()
+
+        # If find_parent_annotation, make sure that we keep track of
+        # any a-tag that has been present in the DOM-parents already.
+        if find_parent_annotation:
+            this_parent = item.parent
+            while this_parent is not None:
+                if this_parent.name == "a" and this_parent.get("href"):
+                    with self.use_hyperlink(this_parent):
+                        return self._extract_text_and_hyperlink_recursively(
+                            item, ignore_list
+                        )
+                this_parent = this_parent.parent
+
+        if isinstance(item, PreformattedString):
+            return AnnotatedTextList()
+
+        if isinstance(item, NavigableString):
+            text = item.strip()
+            if text:
+                return AnnotatedTextList(
+                    [AnnotatedText(text=text, hyperlink=self.hyperlink)]
+                )
+            if keep_newlines and item.strip("\n\r") == "":
+                return AnnotatedTextList(
+                    [AnnotatedText(text="\n", hyperlink=self.hyperlink)]
+                )
+            return AnnotatedTextList()
+
+        tag = cast(Tag, item)
+        if not ignore_list or (tag.name not in ["ul", "ol"]):
+            for child in tag:
+                if isinstance(child, Tag) and child.name == "a":
+                    with self.use_hyperlink(child):
+                        result.extend(
+                            self._extract_text_and_hyperlink_recursively(
+                                child, ignore_list, keep_newlines=keep_newlines
+                            )
+                        )
+                else:
+                    # Recursively get the child's text content
+                    result.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            child, ignore_list, keep_newlines=keep_newlines
+                        )
+                    )
+        return result
+
+    @contextmanager
+    def use_hyperlink(self, tag):
+        this_href = tag.get("href")
+        if this_href is None:
+            yield None
+        else:
+            if this_href:
+                old_hyperlink = self.hyperlink
+                if self.original_url is not None:
+                    this_href = urljoin(self.original_url, this_href)
+                # ugly fix for relative links since pydantic does not support them.
+                try:
+                    AnyUrl(this_href)
+                except ValidationError:
+                    this_href = Path(this_href)
+                self.hyperlink = this_href
+            try:
+                yield None
+            finally:
+                if this_href:
+                    self.hyperlink = old_hyperlink
+
+    @contextmanager
+    def use_inline_group(
+        self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
+    ):
+        """Create an inline group for annotated texts.
+
+        Checks if annotated_text_list has more than one item and if so creates an inline
+        group in which the text elements can then be generated. While the context manager
+        is active the inline group is set as the current parent.
+
+        Args:
+            annotated_text_list (AnnotatedTextList): Annotated text
+            doc (DoclingDocument): Currently used document
+
+        Yields:
+            None: _description_
+        """
+        if len(annotated_text_list) > 1:
+            inline_fmt = doc.add_group(
+                label=GroupLabel.INLINE,
+                parent=self.parents[self.level],
+                content_layer=self.content_layer,
+            )
+            self.parents[self.level + 1] = inline_fmt
+            self.level += 1
+            try:
+                yield None
+            finally:
+                self.parents[self.level] = None
+                self.level -= 1
+        else:
+            yield None
+
     def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
         tag_name = tag.name.lower()
         # set default content layer to BODY as soon as we encounter a heading
         self.content_layer = ContentLayer.BODY
         level = int(tag_name[1])
-        text = tag.get_text(strip=True, separator=" ")
-        text_clean = HTMLDocumentBackend._clean_unicode(text)
+        annotated_text_list = self._extract_text_and_hyperlink_recursively(
+            tag, find_parent_annotation=True
+        )
+        annotated_text = annotated_text_list.to_single_text_element()
+        text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
         # the first level is for the title item
         if level == 1:
             for key in self.parents.keys():
                 self.parents[key] = None
             self.level = 0
             self.parents[self.level + 1] = doc.add_title(
-                text=text_clean, orig=text, content_layer=self.content_layer
+                text_clean,
+                content_layer=self.content_layer,
+                hyperlink=annotated_text.hyperlink,
             )
         # the other levels need to be lowered by 1 if a title was set
         else:
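
The relative-link handling in use_hyperlink boils down to the following standalone sketch (assuming pydantic v2; resolve_href is a hypothetical helper, not part of the diff):

from pathlib import Path
from typing import Optional, Union
from urllib.parse import urljoin

from pydantic import AnyUrl, ValidationError

def resolve_href(href: str, base: Optional[str] = None) -> Union[AnyUrl, Path]:
    if base is not None:
        href = urljoin(base, href)  # resolve against original_url when one was given
    try:
        return AnyUrl(href)  # absolute URLs validate as AnyUrl
    except ValidationError:
        return Path(href)  # relative links fall back to Path, as in the backend above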
@@ -241,9 +445,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.parents[self.level + 1] = doc.add_heading(
                 parent=self.parents[self.level],
                 text=text_clean,
-                orig=text,
+                orig=annotated_text.text,
                 level=self.level,
                 content_layer=self.content_layer,
+                hyperlink=annotated_text.hyperlink,
             )
         self.level += 1
         for img_tag in tag("img"):
@@ -292,37 +497,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 marker = ""
 
             # 2) extract only the "direct" text from this <li>
-            parts: list[str] = []
-            for child in li.contents:
-                if isinstance(child, NavigableString) and not isinstance(
-                    child, PreformattedString
-                ):
-                    parts.append(child)
-                elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
-                    text_part = HTMLDocumentBackend.get_text(child)
-                    if text_part:
-                        parts.append(text_part)
-            li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
-            li_clean = HTMLDocumentBackend._clean_unicode(li_text)
+            parts = self._extract_text_and_hyperlink_recursively(
+                li, ignore_list=True, find_parent_annotation=True
+            )
+            min_parts = parts.simplify_text_elements()
+            li_text = re.sub(
+                r"\s+|\n+", " ", "".join([el.text for el in min_parts])
+            ).strip()
 
             # 3) add the list item
             if li_text:
-                self.parents[self.level + 1] = doc.add_list_item(
-                    text=li_clean,
-                    enumerated=is_ordered,
-                    marker=marker,
-                    orig=li_text,
-                    parent=list_group,
-                    content_layer=self.content_layer,
-                )
-
-                # 4) recurse into any nested lists, attaching them to this <li> item
-                for sublist in li({"ul", "ol"}, recursive=False):
-                    if isinstance(sublist, Tag):
-                        self.level += 1
-                        self._handle_block(sublist, doc)
-                        self.parents[self.level + 1] = None
-                        self.level -= 1
+                if len(min_parts) > 1:
+                    # create an empty list element in order to hook the inline group onto that one
+                    self.parents[self.level + 1] = doc.add_list_item(
+                        text="",
+                        enumerated=is_ordered,
+                        marker=marker,
+                        parent=list_group,
+                        content_layer=self.content_layer,
+                    )
+                    self.level += 1
+                    with self.use_inline_group(min_parts, doc):
+                        for annotated_text in min_parts:
+                            li_text = re.sub(
+                                r"\s+|\n+", " ", annotated_text.text
+                            ).strip()
+                            li_clean = HTMLDocumentBackend._clean_unicode(li_text)
+                            doc.add_text(
+                                parent=self.parents[self.level],
+                                label=DocItemLabel.TEXT,
+                                text=li_clean,
+                                content_layer=self.content_layer,
+                                hyperlink=annotated_text.hyperlink,
+                            )
+
+                    # 4) recurse into any nested lists, attaching them to this <li> item
+                    for sublist in li({"ul", "ol"}, recursive=False):
+                        if isinstance(sublist, Tag):
+                            self._handle_block(sublist, doc)
+
+                    # now the list element with inline group is not a parent anymore
+                    self.parents[self.level] = None
+                    self.level -= 1
+                else:
+                    annotated_text = min_parts[0]
+                    li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
+                    li_clean = HTMLDocumentBackend._clean_unicode(li_text)
+                    self.parents[self.level + 1] = doc.add_list_item(
+                        text=li_clean,
+                        enumerated=is_ordered,
+                        marker=marker,
+                        orig=li_text,
+                        parent=list_group,
+                        content_layer=self.content_layer,
+                        hyperlink=annotated_text.hyperlink,
+                    )
+
+                    # 4) recurse into any nested lists, attaching them to this <li> item
+                    for sublist in li({"ul", "ol"}, recursive=False):
+                        if isinstance(sublist, Tag):
+                            self.level += 1
+                            self._handle_block(sublist, doc)
+                            self.parents[self.level + 1] = None
+                            self.level -= 1
             else:
                 for sublist in li({"ul", "ol"}, recursive=False):
                     if isinstance(sublist, Tag):
@@ -351,17 +588,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self._handle_list(tag, doc)
 
         elif tag_name in {"p", "address", "summary"}:
-            for part in tag.text.split("\n"):
-                seg = part.strip()
-                seg_clean = HTMLDocumentBackend._clean_unicode(seg)
-                if seg:
-                    doc.add_text(
-                        label=DocItemLabel.TEXT,
-                        text=seg_clean,
-                        orig=seg,
-                        parent=self.parents[self.level],
-                        content_layer=self.content_layer,
-                    )
+            text_list = self._extract_text_and_hyperlink_recursively(
+                tag, find_parent_annotation=True
+            )
+            annotated_texts = text_list.simplify_text_elements()
+            for part in annotated_texts.split_by_newline():
+                with self.use_inline_group(part, doc):
+                    for annotated_text in part:
+                        if seg := annotated_text.text.strip():
+                            seg_clean = HTMLDocumentBackend._clean_unicode(seg)
+                            doc.add_text(
+                                parent=self.parents[self.level],
+                                label=DocItemLabel.TEXT,
+                                text=seg_clean,
+                                content_layer=self.content_layer,
+                                hyperlink=annotated_text.hyperlink,
+                            )
+
             for img_tag in tag("img"):
                 if isinstance(img_tag, Tag):
                     self._emit_image(img_tag, doc)
@@ -380,15 +623,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
         elif tag_name in {"pre", "code"}:
             # handle monospace code snippets (pre).
-            text = tag.get_text(strip=True)
-            text_clean = HTMLDocumentBackend._clean_unicode(text)
-            if text:
-                doc.add_code(
-                    parent=self.parents[self.level],
-                    text=text_clean,
-                    orig=text,
-                    content_layer=self.content_layer,
-                )
+            text_list = self._extract_text_and_hyperlink_recursively(
+                tag, find_parent_annotation=True
+            )
+            annotated_texts = text_list.simplify_text_elements()
+            with self.use_inline_group(annotated_texts, doc):
+                for annotated_text in annotated_texts:
+                    text_clean = HTMLDocumentBackend._clean_unicode(
+                        annotated_text.text.strip()
+                    )
+                    doc.add_code(
+                        parent=self.parents[self.level],
+                        text=text_clean,
+                        content_layer=self.content_layer,
+                        hyperlink=annotated_text.hyperlink,
+                    )
 
         elif tag_name == "details":
             # handle details and its content.
@@ -405,22 +654,45 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
     def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
         figure = img_tag.find_parent("figure")
-        caption: str = ""
+        caption: AnnotatedTextList = AnnotatedTextList()
+
+        # check if the figure has a link - this is HACK:
+        def get_img_hyperlink(img_tag):
+            this_parent = img_tag.parent
+            while this_parent is not None:
+                if this_parent.name == "a" and this_parent.get("href"):
+                    return this_parent.get("href")
+                this_parent = this_parent.parent
+            return None
+
+        if img_hyperlink := get_img_hyperlink(img_tag):
+            caption.append(
+                AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
+            )
+
         if isinstance(figure, Tag):
             caption_tag = figure.find("figcaption", recursive=False)
             if isinstance(caption_tag, Tag):
-                caption = caption_tag.get_text()
-        if not caption:
-            caption = str(img_tag.get("alt", "")).strip()
+                caption = self._extract_text_and_hyperlink_recursively(
+                    caption_tag, find_parent_annotation=True
+                )
+        if not caption and img_tag.get("alt"):
+            caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
+
+        caption_anno_text = caption.to_single_text_element()
 
         caption_item: Optional[TextItem] = None
-        if caption:
-            caption_clean = HTMLDocumentBackend._clean_unicode(caption)
+        if caption_anno_text.text:
+            text_clean = HTMLDocumentBackend._clean_unicode(
+                caption_anno_text.text.strip()
+            )
+            print(caption_anno_text)
             caption_item = doc.add_text(
                 label=DocItemLabel.CAPTION,
-                text=caption_clean,
-                orig=caption,
+                text=text_clean,
+                orig=caption_anno_text.text,
                 content_layer=self.content_layer,
+                hyperlink=caption_anno_text.hyperlink,
             )
 
         doc.add_picture(
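
End to end, the new hyperlink behavior can be exercised on a small snippet (a sketch; the HTML input is made up):

from io import BytesIO

from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter

html = b'<p>See <a href="https://example.com/docs">the docs</a> for details.</p>'
stream = DocumentStream(name="sample.html", stream=BytesIO(html))
result = DocumentConverter().convert(stream)
# The paragraph becomes an inline group; the linked span carries its hyperlink.
for item, _level in result.document.iterate_items():
    print(item.label, getattr(item, "hyperlink", None))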
docling/backend/mets_gbs_backend.py ADDED
@@ -0,0 +1,399 @@
+"""Backend for GBS Google Books schema."""
+
+import logging
+import tarfile
+from collections.abc import Iterable
+from dataclasses import dataclass
+from enum import Enum
+from io import BytesIO
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
+
+from docling_core.types.doc import BoundingBox, CoordOrigin, Size
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
+from lxml import etree
+from PIL import Image
+from PIL.Image import Image as PILImage
+
+from docling.backend.abstract_backend import PaginatedDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import InputFormat
+
+if TYPE_CHECKING:
+    from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+def _get_pdf_page_geometry(
+    size: Size,
+) -> PdfPageGeometry:
+    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX
+
+    bbox_tuple = (0, 0, size.width, size.height)
+    bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.TOPLEFT)
+
+    return PdfPageGeometry(
+        angle=0.0,
+        rect=BoundingRectangle.from_bounding_box(bbox),
+        boundary_type=boundary_type,
+        art_bbox=bbox,
+        bleed_bbox=bbox,
+        crop_bbox=bbox,
+        media_bbox=bbox,
+        trim_bbox=bbox,
+    )
+
+
+
54
+ class MetsGbsPageBackend(PdfPageBackend):
55
+ def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage):
56
+ self._im = page_im
57
+ self._dpage = parsed_page
58
+ self.valid = parsed_page is not None
59
+
60
+ def is_valid(self) -> bool:
61
+ return self.valid
62
+
63
+ def get_text_in_rect(self, bbox: BoundingBox) -> str:
64
+ # Find intersecting cells on the page
65
+ text_piece = ""
66
+ page_size = self.get_size()
67
+
68
+ scale = (
69
+ 1 # FIX - Replace with param in get_text_in_rect across backends (optional)
70
+ )
71
+
72
+ for i, cell in enumerate(self._dpage.textline_cells):
73
+ cell_bbox = (
74
+ cell.rect.to_bounding_box()
75
+ .to_top_left_origin(page_height=page_size.height)
76
+ .scaled(scale)
77
+ )
78
+
79
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
80
+
81
+ if overlap_frac > 0.5:
82
+ if len(text_piece) > 0:
83
+ text_piece += " "
84
+ text_piece += cell.text
85
+
86
+ return text_piece
87
+
88
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
89
+ return self._dpage
90
+
91
+ def get_text_cells(self) -> Iterable[TextCell]:
92
+ return self._dpage.textline_cells
93
+
94
+ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
95
+ AREA_THRESHOLD = 0 # 32 * 32
96
+
97
+ images = self._dpage.bitmap_resources
98
+
99
+ for img in images:
100
+ cropbox = img.rect.to_bounding_box().to_top_left_origin(
101
+ self.get_size().height
102
+ )
103
+
104
+ if cropbox.area() > AREA_THRESHOLD:
105
+ cropbox = cropbox.scaled(scale=scale)
106
+
107
+ yield cropbox
108
+
109
+ def get_page_image(
110
+ self, scale: float = 1, cropbox: Optional[BoundingBox] = None
111
+ ) -> Image.Image:
112
+ page_size = self.get_size()
113
+ assert (
114
+ page_size.width == self._im.size[0] and page_size.height == self._im.size[1]
115
+ )
116
+
117
+ if not cropbox:
118
+ cropbox = BoundingBox(
119
+ l=0,
120
+ r=page_size.width,
121
+ t=0,
122
+ b=page_size.height,
123
+ coord_origin=CoordOrigin.TOPLEFT,
124
+ )
125
+
126
+ image = self._im.resize(
127
+ size=(round(page_size.width * scale), round(page_size.height * scale))
128
+ ).crop(cropbox.scaled(scale=scale).as_tuple())
129
+ return image
130
+
131
+ def get_size(self) -> Size:
132
+ return Size(
133
+ width=self._dpage.dimension.width, height=self._dpage.dimension.height
134
+ )
135
+
136
+ def unload(self) -> None:
137
+ if hasattr(self, "_im"):
138
+ delattr(self, "_im")
139
+ if hasattr(self, "_dpage"):
140
+ delattr(self, "_dpage")
141
+
142
+
143
+ class _UseType(str, Enum):
144
+ IMAGE = "image"
145
+ OCR = "OCR"
146
+ COORD_OCR = "coordOCR"
147
+
148
+
149
+ @dataclass
150
+ class _FileInfo:
151
+ file_id: str
152
+ mimetype: str
153
+ path: str
154
+ use: _UseType
155
+
156
+
157
+ @dataclass
158
+ class _PageFiles:
159
+ image: Optional[_FileInfo] = None
160
+ ocr: Optional[_FileInfo] = None
161
+ coordOCR: Optional[_FileInfo] = None
162
+
163
+
164
+ def _extract_rect(title_str: str) -> Optional[BoundingRectangle]:
165
+ """
166
+ Extracts bbox from title string like 'bbox 279 177 306 214;x_wconf 97'
167
+ """
168
+ parts = title_str.split(";")
169
+ for part in parts:
170
+ part = part.strip()
171
+ if part.startswith("bbox "):
172
+ try:
173
+ coords = part.split()[1:]
174
+ rect = BoundingRectangle.from_bounding_box(
175
+ bbox=BoundingBox.from_tuple(
176
+ tuple(map(int, coords)), origin=CoordOrigin.TOPLEFT
177
+ )
178
+ )
179
+ return rect
180
+ except Exception:
181
+ return None
182
+ return None
183
+
184
+
185
+ def _extract_confidence(title_str) -> float:
186
+ """Extracts x_wconf (OCR confidence) value from title string."""
187
+ for part in title_str.split(";"):
188
+ part = part.strip()
189
+ if part.startswith("x_wconf"):
190
+ try:
191
+ return float(part.split()[1]) / 100.0
192
+ except Exception:
193
+ return 1
194
+ return 1
195
+
196
+
197
+ class MetsGbsDocumentBackend(PdfDocumentBackend):
198
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
199
+ super().__init__(in_doc, path_or_stream)
200
+
201
+ self._tar: tarfile.TarFile = (
202
+ tarfile.open(name=self.path_or_stream, mode="r:gz")
203
+ if isinstance(self.path_or_stream, Path)
204
+ else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
205
+ )
206
+ self.root_mets: Optional[etree._Element] = None
207
+ self.page_map: Dict[int, _PageFiles] = {}
208
+
209
+ for member in self._tar.getmembers():
210
+ if member.name.endswith(".xml"):
211
+ file = self._tar.extractfile(member)
212
+ if file is not None:
213
+ content = file.read()
214
+ self.root_mets = self._validate_mets_xml(content)
215
+ if self.root_mets is not None:
216
+ break
217
+
218
+ if self.root_mets is None:
219
+ raise RuntimeError(
220
+ f"METS GBS backend could not load document {self.document_hash}."
221
+ )
222
+
223
+ ns = {
224
+ "mets": "http://www.loc.gov/METS/",
225
+ "xlink": "http://www.w3.org/1999/xlink",
226
+ "xsi": "http://www.w3.org/2001/XMLSchema-instance",
227
+ "gbs": "http://books.google.com/gbs",
228
+ "premis": "info:lc/xmlns/premis-v2",
229
+ "marc": "http://www.loc.gov/MARC21/slim",
230
+ }
231
+
232
+ file_info_by_id: Dict[str, _FileInfo] = {}
233
+
234
+ for filegrp in self.root_mets.xpath(".//mets:fileGrp", namespaces=ns):
235
+ use_raw = filegrp.get("USE")
236
+ try:
237
+ use = _UseType(use_raw)
238
+ except ValueError:
239
+ continue # Ignore unknown USE types
240
+
241
+ for file_elem in filegrp.xpath("./mets:file", namespaces=ns):
242
+ file_id = file_elem.get("ID")
243
+ mimetype = file_elem.get("MIMETYPE")
244
+ flocat_elem = file_elem.find("mets:FLocat", namespaces=ns)
245
+ href = (
246
+ flocat_elem.get("{http://www.w3.org/1999/xlink}href")
247
+ if flocat_elem is not None
248
+ else None
249
+ )
250
+ if href is None:
251
+ continue
252
+
253
+ file_info_by_id[file_id] = _FileInfo(
254
+ file_id=file_id, mimetype=mimetype, path=href, use=use
255
+ )
256
+
257
+ USE_TO_ATTR = {
258
+ _UseType.IMAGE: "image",
259
+ _UseType.OCR: "ocr",
260
+ _UseType.COORD_OCR: "coordOCR",
261
+ }
262
+
263
+ for div in self.root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns):
264
+ order_str = div.get("ORDER")
265
+ if not order_str:
266
+ continue
267
+ try:
268
+ page_no = int(order_str) - 1 # make 0-index pages
269
+ except ValueError:
270
+ continue
271
+
272
+ page_files = _PageFiles()
273
+
274
+ for fptr in div.xpath("./mets:fptr", namespaces=ns):
275
+ file_id = fptr.get("FILEID")
276
+ file_info = file_info_by_id.get(file_id)
277
+
278
+ if file_info:
279
+ attr = USE_TO_ATTR.get(file_info.use)
280
+ if attr:
281
+ setattr(page_files, attr, file_info)
282
+
283
+ self.page_map[page_no] = page_files
284
+
285
+ def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]:
286
+ root: etree._Element = etree.fromstring(xml_string)
287
+ if (
288
+ root.tag == "{http://www.loc.gov/METS/}mets"
289
+ and root.get("PROFILE") == "gbs"
290
+ ):
291
+ return root
292
+
293
+ _log.warning(f"The root element is not <mets:mets> with PROFILE='gbs': {root}")
294
+ return None
295
+
296
+ def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
297
+ # TODO: use better fallbacks...
298
+ image_info = self.page_map[page_no].image
299
+ assert image_info is not None
300
+ ocr_info = self.page_map[page_no].coordOCR
301
+ assert ocr_info is not None
302
+
303
+ image_file = self._tar.extractfile(image_info.path)
304
+ assert image_file is not None
305
+ buf = BytesIO(image_file.read())
306
+ im: PILImage = Image.open(buf)
307
+ ocr_file = self._tar.extractfile(ocr_info.path)
308
+ assert ocr_file is not None
309
+ ocr_content = ocr_file.read()
310
+ parser = etree.HTMLParser()
311
+ ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)
312
+
313
+ line_cells: List[TextCell] = []
314
+ word_cells: List[TextCell] = []
315
+
316
+ page_div = ocr_root.xpath("//div[@class='ocr_page']")
317
+
318
+ size = Size(width=im.size[0], height=im.size[1])
319
+ if page_div:
320
+ title = page_div[0].attrib.get("title", "")
321
+ rect = _extract_rect(title)
322
+ if rect:
323
+ size = Size(width=rect.width, height=rect.height)
324
+ else:
325
+ _log.error(f"Could not find ocr_page for page {page_no}")
326
+
327
+ im = im.resize(size=(round(size.width), round(size.height)))
328
+ im = im.convert("RGB")
329
+
330
+ # Extract all ocrx_word spans
331
+ for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
332
+ text = "".join(word.itertext()).strip()
333
+ title = word.attrib.get("title", "")
334
+ rect = _extract_rect(title)
335
+ conf = _extract_confidence(title)
336
+ if rect:
337
+ word_cells.append(
338
+ TextCell(
339
+ index=ix,
340
+ text=text,
341
+ orig=text,
342
+ rect=rect,
343
+ from_ocr=True,
344
+ confidence=conf,
345
+ )
346
+ )
347
+
348
+ # Extract all ocr_line spans
349
+ # line: etree._Element
350
+ for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
351
+ text = "".join(line.itertext()).strip()
352
+ title = line.attrib.get("title", "")
353
+ rect = _extract_rect(title)
354
+ conf = _extract_confidence(title)
355
+ if rect:
356
+ line_cells.append(
357
+ TextCell(
358
+ index=ix,
359
+ text=text,
360
+ orig=text,
361
+ rect=rect,
362
+ from_ocr=True,
363
+ confidence=conf,
364
+ )
365
+ )
366
+
367
+ page = SegmentedPdfPage(
368
+ dimension=_get_pdf_page_geometry(size),
369
+ textline_cells=line_cells,
370
+ char_cells=[],
371
+ word_cells=word_cells,
372
+ has_textlines=True,
373
+ has_words=True,
374
+ has_chars=False,
375
+ )
376
+ return page, im
377
+
378
+ def page_count(self) -> int:
379
+ return len(self.page_map)
380
+
381
+ def load_page(self, page_no: int) -> MetsGbsPageBackend:
382
+ # TODO: is this thread-safe?
383
+ page, im = self._parse_page(page_no)
384
+ return MetsGbsPageBackend(parsed_page=page, page_im=im)
385
+
386
+ def is_valid(self) -> bool:
387
+ return self.root_mets is not None and self.page_count() > 0
388
+
389
+ @classmethod
390
+ def supported_formats(cls) -> Set[InputFormat]:
391
+ return {InputFormat.METS_GBS}
392
+
393
+ @classmethod
394
+ def supports_pagination(cls) -> bool:
395
+ return True
396
+
397
+ def unload(self) -> None:
398
+ super().unload()
399
+ self._tar.close()
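
With the format registered by default (see the base_models.py and document_converter.py hunks further below), a Google Books tarball converts like any other input. A minimal sketch with a hypothetical file name:

from docling.document_converter import DocumentConverter

result = DocumentConverter().convert("book_0001.tar.gz")  # hypothetical METS GBS export
print(result.document.export_to_markdown())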
docling/backend/pdf_backend.py CHANGED
@@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
 
             buf.seek(0)
             self.path_or_stream = buf
-        else:
+        elif self.input_format not in self.supported_formats():
             raise RuntimeError(
-                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
+                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
            )
 
     @abstractmethod
@@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
 
     @classmethod
     def supported_formats(cls) -> Set[InputFormat]:
-        return {InputFormat.PDF}
+        return {InputFormat.PDF, InputFormat.IMAGE}
 
     @classmethod
     def supports_pagination(cls) -> bool:
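
The effect of the two hunks above can be checked directly; the assertions restate what the diff adds:

from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat

# IMAGE now routes through PDF backends by default; subclasses may narrow
# this set, e.g. MetsGbsDocumentBackend returns {InputFormat.METS_GBS}.
assert InputFormat.PDF in PdfDocumentBackend.supported_formats()
assert InputFormat.IMAGE in PdfDocumentBackend.supported_formats()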
docling/cli/main.py CHANGED
@@ -26,6 +26,7 @@ from rich.console import Console
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -607,9 +608,18 @@ def convert( # noqa: C901
             backend=backend,  # pdf_backend
         )
 
+        # METS GBS options
+        mets_gbs_options = pipeline_options.model_copy()
+        mets_gbs_options.do_ocr = False
+        mets_gbs_format_option = PdfFormatOption(
+            pipeline_options=mets_gbs_options,
+            backend=MetsGbsDocumentBackend,
+        )
+
         format_options = {
             InputFormat.PDF: pdf_format_option,
             InputFormat.IMAGE: pdf_format_option,
+            InputFormat.METS_GBS: mets_gbs_format_option,
         }
 
     elif pipeline == ProcessingPipeline.VLM:
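
Outside the CLI, the same wiring looks like this (a sketch using public names from this diff; PdfPipelineOptions defaults are assumed):

from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

opts = PdfPipelineOptions()
opts.do_ocr = False  # pages already ship with hOCR text, so OCR is redundant
converter = DocumentConverter(
    format_options={
        InputFormat.METS_GBS: PdfFormatOption(
            pipeline_options=opts, backend=MetsGbsDocumentBackend
        )
    }
)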
docling/datamodel/base_models.py CHANGED
@@ -56,6 +56,7 @@ class InputFormat(str, Enum):
     XLSX = "xlsx"
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
+    METS_GBS = "mets_gbs"
     JSON_DOCLING = "json_docling"
     AUDIO = "audio"
 
@@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.CSV: ["csv"],
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
+    InputFormat.METS_GBS: ["tar.gz"],
     InputFormat.JSON_DOCLING: ["json"],
     InputFormat.AUDIO: ["wav", "mp3"],
 }
@@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
+    InputFormat.METS_GBS: ["application/mets+xml"],
     InputFormat.JSON_DOCLING: ["application/json"],
     InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }
docling/datamodel/document.py CHANGED
@@ -1,6 +1,7 @@
 import csv
 import logging
 import re
+import tarfile
 from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
@@ -314,6 +315,10 @@ class _DocumentConversionInput(BaseModel):
         elif objname.endswith(".pptx"):
             mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 
+        if mime is not None and mime.lower() == "application/gzip":
+            if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
+                mime = detected_mime
+
         mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
@@ -457,3 +462,24 @@ class _DocumentConversionInput(BaseModel):
                 return None
 
         return None
+
+    @staticmethod
+    def _detect_mets_gbs(
+        obj: Union[Path, DocumentStream],
+    ) -> Optional[Literal["application/mets+xml"]]:
+        content = obj if isinstance(obj, Path) else obj.stream
+        tar: tarfile.TarFile
+        member: tarfile.TarInfo
+        with tarfile.open(
+            name=content if isinstance(content, Path) else None,
+            fileobj=content if isinstance(content, BytesIO) else None,
+            mode="r:gz",
+        ) as tar:
+            for member in tar.getmembers():
+                if member.name.endswith(".xml"):
+                    file = tar.extractfile(member)
+                    if file is not None:
+                        content_str = file.read().decode(errors="ignore")
+                        if "http://www.loc.gov/METS/" in content_str:
+                            return "application/mets+xml"
+        return None
docling/datamodel/pipeline_options_vlm_model.py CHANGED
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional
 
 from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
@@ -10,11 +10,17 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
+    prompt: str
     scale: float = 2.0
     max_size: Optional[int] = None
     temperature: float = 0.0
 
+    def build_prompt(self, page: Optional[SegmentedPage]) -> str:
+        return self.prompt
+
+    def decode_response(self, text: str) -> str:
+        return text
+
 
 class ResponseFormat(str, Enum):
     DOCTAGS = "doctags"
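
Since prompt is now a plain str, per-page prompt logic moves into a subclass. A migration sketch (the subclass and its behavior are hypothetical):

from typing import Optional

from docling_core.types.doc.page import SegmentedPage

from docling.datamodel.pipeline_options_vlm_model import BaseVlmOptions

class FencedVlmOptions(BaseVlmOptions):
    kind: str = "fenced_vlm"  # hypothetical kind for illustration
    prompt: str = "Convert this page to markdown."

    def build_prompt(self, page: Optional[SegmentedPage]) -> str:
        # Replaces the old Callable[[Optional[SegmentedPage]], str] prompt;
        # `page` can be inspected here to tailor the prompt per page.
        return self.prompt

    def decode_response(self, text: str) -> str:
        # Strip markdown fences that some APIs wrap around their output.
        return text.removeprefix("```markdown\n").removesuffix("\n```")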
docling/document_converter.py CHANGED
@@ -20,6 +20,7 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.json.docling_json_backend import DoclingJSONBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
@@ -159,6 +160,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.XML_JATS: FormatOption(
             pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
         ),
+        InputFormat.METS_GBS: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
         ),
docling/models/api_vlm_model.py CHANGED
@@ -53,11 +53,7 @@ class ApiVlmModel(BasePageModel):
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
 
-                if callable(self.vlm_options.prompt):
-                    prompt = self.vlm_options.prompt(page.parsed_page)
-                else:
-                    prompt = self.vlm_options.prompt
-
+                prompt = self.vlm_options.build_prompt(page.parsed_page)
                 page_tags = api_image_request(
                     image=hi_res_image,
                     prompt=prompt,
@@ -67,6 +63,7 @@ class ApiVlmModel(BasePageModel):
                     **self.params,
                 )
 
+                page_tags = self.vlm_options.decode_response(page_tags)
                 page.predictions.vlm_response = VlmPrediction(text=page_tags)
 
                 return page
docling/models/vlm_models_inline/hf_transformers_model.py CHANGED
@@ -135,10 +135,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                     )
 
                     # Define prompt structure
-                    if callable(self.vlm_options.prompt):
-                        user_prompt = self.vlm_options.prompt(page.parsed_page)
-                    else:
-                        user_prompt = self.vlm_options.prompt
+                    user_prompt = self.vlm_options.build_prompt(page.parsed_page)
                     prompt = self.formulate_prompt(user_prompt)
 
                     inputs = self.processor(
@@ -166,6 +163,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                     _log.debug(
                         f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                     )
+                    generated_texts = self.vlm_options.decode_response(generated_texts)
                    page.predictions.vlm_response = VlmPrediction(
                         text=generated_texts,
                         generation_time=generation_time,
docling/models/vlm_models_inline/mlx_model.py CHANGED
@@ -84,10 +84,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")
 
-                    if callable(self.vlm_options.prompt):
-                        user_prompt = self.vlm_options.prompt(page.parsed_page)
-                    else:
-                        user_prompt = self.vlm_options.prompt
+                    user_prompt = self.vlm_options.build_prompt(page.parsed_page)
                     prompt = self.apply_chat_template(
                         self.processor, self.config, user_prompt, num_images=1
                     )
@@ -142,6 +139,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                     _log.debug(
                         f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
                     )
+                    page_tags = self.vlm_options.decode_response(page_tags)
                     page.predictions.vlm_response = VlmPrediction(
                         text=page_tags,
                         generation_time=generation_time,
docling/pipeline/base_pipeline.py CHANGED
@@ -8,7 +8,10 @@ from typing import Any, Callable, List
 
 from docling_core.types.doc import NodeItem
 
-from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+    PaginatedDocumentBackend,
+)
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
@@ -126,10 +129,10 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
                 yield from page_batch
 
     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
-        if not isinstance(conv_res.input._backend, PdfDocumentBackend):
+        if not isinstance(conv_res.input._backend, PaginatedDocumentBackend):
             raise RuntimeError(
-                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
-                f"Can not convert this with a PDF pipeline. "
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a paginated backend. "
+                f"Can not convert this with a paginated PDF pipeline. "
                 f"Please check your format configuration on DocumentConverter."
             )
         # conv_res.status = ConversionStatus.FAILURE
docling-2.44.0.dist-info/METADATA → docling-2.45.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.44.0
+Version: 2.45.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
docling-2.44.0.dist-info/RECORD → docling-2.45.0.dist-info/RECORD RENAMED
@@ -1,5 +1,5 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/document_converter.py,sha256=l4b9m9NcbnwzXNNvf777nszyXznQJiaTXyIl_WehkyQ,15724
+docling/document_converter.py,sha256=7lid_uhGNuurYICweaA1jqtSbnhf3hpuUYUNleHh-Ww,15924
 docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,13 +9,14 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
 docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
 docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
 docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
-docling/backend/html_backend.py,sha256=0_l-I9gBAs0HKU3yKLQ3OqyYgB3V48hInv42GudnSjA,22856
+docling/backend/html_backend.py,sha256=jTkpdJ-EKMmkbUfh88DONVG-gENE7m0_cnIhWpWSobI,34523
 docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
+docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
 docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
 docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
 docling/backend/msword_backend.py,sha256=DxMgPcq-Ao1vq7X2v8qqWeMs9MryPw_Jw3YRAAdXBtM,44904
 docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
-docling/backend/pdf_backend.py,sha256=sUBrCz1zvt6E7sVl4xHtrkpTBClOK0vBV2lLi_TRHNg,3237
+docling/backend/pdf_backend.py,sha256=Wcd1NSrAMjXK8VicTki5p-j-JLofklt07eF0kIG17_0,3361
 docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
 docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -28,22 +29,22 @@ docling/backend/xml/jats_backend.py,sha256=LPj33EFdi2MRCakkLWrRLlUAc-B-949f8zp5g
 docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=rXWR2QJFLeHLPWkMsLXvsVblX-KOXwbM8r0ku80KU5Q,29925
+docling/cli/main.py,sha256=-W_vdKvSm5gZUZyvRpFH0YMI_1iJrP5sJOZ5_1bLorw,30359
 docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
 docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
 docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF-xxHoVQ,2187
-docling/datamodel/base_models.py,sha256=9FslHkGUNmBp264LpLL_2JTfDAdaikldYs3SiQOHb5A,11828
-docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
+docling/datamodel/base_models.py,sha256=Ifd8PPHs4sW7ScwSqpa-y3rwgPbde_iw13Y2NUCPfU8,11944
+docling/datamodel/document.py,sha256=zsxFYXvo6GtwGNogSDoBB1TFvkm7IOrP_VnqXNqBhJs,17329
 docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
 docling/datamodel/pipeline_options.py,sha256=TaBmCBRjSxyoh79UkpEkPzokLYS8BA2QJam86g9pT5g,10544
 docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
-docling/datamodel/pipeline_options_vlm_model.py,sha256=z-pUqwRA8nJp6C3SEXZLem2zvSYdgavaAVYa8wkAIZY,2400
+docling/datamodel/pipeline_options_vlm_model.py,sha256=eH-Cj_8aic9FdX4xGlBcf5_R9e152JAL2LhtY8d0rhw,2498
 docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
 docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/api_vlm_model.py,sha256=foBvzaWeHFH1t-VdvRWLdiXiiofhvhjvHqRI0eNA_3w,2923
+docling/models/api_vlm_model.py,sha256=-zisU32pgDRbychyG6-neB0qweNbPaYnLXwiGT7SEdI,2859
 docling/models/base_model.py,sha256=NNjIapqCruAEAWR-CCdsNgXc2QkwiPYAcaQ_ZYe1W28,2978
 docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDWtR0,8193
 docling/models/code_formula_model.py,sha256=5uWh-eI-Ejmv3DujKJoKKgJBuvPLokt7AJ_ybt8VHEw,11373
@@ -70,11 +71,11 @@ docling/models/plugins/defaults.py,sha256=OAHWW2tCcUXSyDMFxV_lXVRjSBJ1n6z-Eb3R8c
 docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
 docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/vlm_models_inline/hf_transformers_model.py,sha256=LAnWFIHGblWln6DQMLtCQQW3-YUPDMbgeD2tjfM8vLM,8415
-docling/models/vlm_models_inline/mlx_model.py,sha256=tqbJ8tmf2VBDuMLYIv9s1Ysn3G831k2uE_PdOv0kCaE,5948
+docling/models/vlm_models_inline/hf_transformers_model.py,sha256=Rwdr7neDpn5ehtrp6n7G21fcPBK2m9Har_6BFNdyw-Q,8359
+docling/models/vlm_models_inline/mlx_model.py,sha256=YYYmopsITlX17JVS5KhLlb1IQSEVoSECNx_fXLHNpAc,5880
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
-docling/pipeline/base_pipeline.py,sha256=iwUqmttXF9D2myXyCAaIqFuGjBFhPkjAybcSAGpww-Q,9525
+docling/pipeline/base_pipeline.py,sha256=MOKZtx3jNYotfntgoJHoyb6UsvdvG6bQLyDl9Lxvc1w,9586
 docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
 docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
 docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=Rjdq1x2fRHBA0rMHJ6rqqHzxVVzgTEALBBj5d30oOZ8,26018
@@ -92,9 +93,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
 docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
-docling-2.44.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.44.0.dist-info/METADATA,sha256=SjD3EXlvgfyXIo8YoeldcAFX0r_nbJszp7VPoMLPFBk,10459
-docling-2.44.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-docling-2.44.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
-docling-2.44.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
-docling-2.44.0.dist-info/RECORD,,
+docling-2.45.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.45.0.dist-info/METADATA,sha256=-iB6xJ4H7DIStzPn-ruYcBa_Tq45Ijk52zfoM_6FkCE,10459
+docling-2.45.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling-2.45.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
+docling-2.45.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
+docling-2.45.0.dist-info/RECORD,,