docling 2.44.0__py3-none-any.whl → 2.46.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/backend/mets_gbs_backend.py ADDED
@@ -0,0 +1,399 @@
+ """Backend for GBS Google Books schema."""
2
+
3
+ import logging
4
+ import tarfile
5
+ from collections.abc import Iterable
6
+ from dataclasses import dataclass
7
+ from enum import Enum
8
+ from io import BytesIO
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
11
+
12
+ from docling_core.types.doc import BoundingBox, CoordOrigin, Size
13
+ from docling_core.types.doc.page import (
14
+ BoundingRectangle,
15
+ PdfPageBoundaryType,
16
+ PdfPageGeometry,
17
+ SegmentedPdfPage,
18
+ TextCell,
19
+ )
20
+ from lxml import etree
21
+ from PIL import Image
22
+ from PIL.Image import Image as PILImage
23
+
24
+ from docling.backend.abstract_backend import PaginatedDocumentBackend
25
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
26
+ from docling.datamodel.base_models import InputFormat
27
+
28
+ if TYPE_CHECKING:
29
+ from docling.datamodel.document import InputDocument
30
+
31
+ _log = logging.getLogger(__name__)
32
+
33
+
34
+ def _get_pdf_page_geometry(
35
+ size: Size,
36
+ ) -> PdfPageGeometry:
37
+ boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX
38
+
39
+ bbox_tuple = (0, 0, size.width, size.height)
40
+ bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.TOPLEFT)
41
+
42
+ return PdfPageGeometry(
43
+ angle=0.0,
44
+ rect=BoundingRectangle.from_bounding_box(bbox),
45
+ boundary_type=boundary_type,
46
+ art_bbox=bbox,
47
+ bleed_bbox=bbox,
48
+ crop_bbox=bbox,
49
+ media_bbox=bbox,
50
+ trim_bbox=bbox,
51
+ )
52
+
53
+
54
+class MetsGbsPageBackend(PdfPageBackend):
+    def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage):
+        self._im = page_im
+        self._dpage = parsed_page
+        self.valid = parsed_page is not None
+
+    def is_valid(self) -> bool:
+        return self.valid
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        # Find intersecting cells on the page
+        text_piece = ""
+        page_size = self.get_size()
+
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+
+        for i, cell in enumerate(self._dpage.textline_cells):
+            cell_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page_size.height)
+                .scaled(scale)
+            )
+
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
+
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += cell.text
+
+        return text_piece
+
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        return self._dpage
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._dpage.textline_cells
+
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+
+        images = self._dpage.bitmap_resources
+
+        for img in images:
+            cropbox = img.rect.to_bounding_box().to_top_left_origin(
+                self.get_size().height
+            )
+
+            if cropbox.area() > AREA_THRESHOLD:
+                cropbox = cropbox.scaled(scale=scale)
+
+                yield cropbox
+
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        page_size = self.get_size()
+        assert (
+            page_size.width == self._im.size[0] and page_size.height == self._im.size[1]
+        )
+
+        if not cropbox:
+            cropbox = BoundingBox(
+                l=0,
+                r=page_size.width,
+                t=0,
+                b=page_size.height,
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
+
+        image = self._im.resize(
+            size=(round(page_size.width * scale), round(page_size.height * scale))
+        ).crop(cropbox.scaled(scale=scale).as_tuple())
+        return image
+
+    def get_size(self) -> Size:
+        return Size(
+            width=self._dpage.dimension.width, height=self._dpage.dimension.height
+        )
+
+    def unload(self) -> None:
+        if hasattr(self, "_im"):
+            delattr(self, "_im")
+        if hasattr(self, "_dpage"):
+            delattr(self, "_dpage")
+
+
+class _UseType(str, Enum):
+    IMAGE = "image"
+    OCR = "OCR"
+    COORD_OCR = "coordOCR"
+
+
+@dataclass
+class _FileInfo:
+    file_id: str
+    mimetype: str
+    path: str
+    use: _UseType
+
+
+@dataclass
+class _PageFiles:
+    image: Optional[_FileInfo] = None
+    ocr: Optional[_FileInfo] = None
+    coordOCR: Optional[_FileInfo] = None
+
+
+def _extract_rect(title_str: str) -> Optional[BoundingRectangle]:
+    """
+    Extracts bbox from title string like 'bbox 279 177 306 214;x_wconf 97'
+    """
+    parts = title_str.split(";")
+    for part in parts:
+        part = part.strip()
+        if part.startswith("bbox "):
+            try:
+                coords = part.split()[1:]
+                rect = BoundingRectangle.from_bounding_box(
+                    bbox=BoundingBox.from_tuple(
+                        tuple(map(int, coords)), origin=CoordOrigin.TOPLEFT
+                    )
+                )
+                return rect
+            except Exception:
+                return None
+    return None
+
+
+def _extract_confidence(title_str) -> float:
+    """Extracts x_wconf (OCR confidence) value from title string."""
+    for part in title_str.split(";"):
+        part = part.strip()
+        if part.startswith("x_wconf"):
+            try:
+                return float(part.split()[1]) / 100.0
+            except Exception:
+                return 1
+    return 1
+
+
+class MetsGbsDocumentBackend(PdfDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
+        self._tar: tarfile.TarFile = (
+            tarfile.open(name=self.path_or_stream, mode="r:gz")
+            if isinstance(self.path_or_stream, Path)
+            else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
+        )
+        self.root_mets: Optional[etree._Element] = None
+        self.page_map: Dict[int, _PageFiles] = {}
+
+        for member in self._tar.getmembers():
+            if member.name.endswith(".xml"):
+                file = self._tar.extractfile(member)
+                if file is not None:
+                    content = file.read()
+                    self.root_mets = self._validate_mets_xml(content)
+                    if self.root_mets is not None:
+                        break
+
+        if self.root_mets is None:
+            raise RuntimeError(
+                f"METS GBS backend could not load document {self.document_hash}."
+            )
+
+        ns = {
+            "mets": "http://www.loc.gov/METS/",
+            "xlink": "http://www.w3.org/1999/xlink",
+            "xsi": "http://www.w3.org/2001/XMLSchema-instance",
+            "gbs": "http://books.google.com/gbs",
+            "premis": "info:lc/xmlns/premis-v2",
+            "marc": "http://www.loc.gov/MARC21/slim",
+        }
+
+        file_info_by_id: Dict[str, _FileInfo] = {}
+
+        for filegrp in self.root_mets.xpath(".//mets:fileGrp", namespaces=ns):
+            use_raw = filegrp.get("USE")
+            try:
+                use = _UseType(use_raw)
+            except ValueError:
+                continue  # Ignore unknown USE types
+
+            for file_elem in filegrp.xpath("./mets:file", namespaces=ns):
+                file_id = file_elem.get("ID")
+                mimetype = file_elem.get("MIMETYPE")
+                flocat_elem = file_elem.find("mets:FLocat", namespaces=ns)
+                href = (
+                    flocat_elem.get("{http://www.w3.org/1999/xlink}href")
+                    if flocat_elem is not None
+                    else None
+                )
+                if href is None:
+                    continue
+
+                file_info_by_id[file_id] = _FileInfo(
+                    file_id=file_id, mimetype=mimetype, path=href, use=use
+                )
+
+        USE_TO_ATTR = {
+            _UseType.IMAGE: "image",
+            _UseType.OCR: "ocr",
+            _UseType.COORD_OCR: "coordOCR",
+        }
+
+        for div in self.root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns):
+            order_str = div.get("ORDER")
+            if not order_str:
+                continue
+            try:
+                page_no = int(order_str) - 1  # make 0-index pages
+            except ValueError:
+                continue
+
+            page_files = _PageFiles()
+
+            for fptr in div.xpath("./mets:fptr", namespaces=ns):
+                file_id = fptr.get("FILEID")
+                file_info = file_info_by_id.get(file_id)
+
+                if file_info:
+                    attr = USE_TO_ATTR.get(file_info.use)
+                    if attr:
+                        setattr(page_files, attr, file_info)
+
+            self.page_map[page_no] = page_files
+
+    def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]:
+        root: etree._Element = etree.fromstring(xml_string)
+        if (
+            root.tag == "{http://www.loc.gov/METS/}mets"
+            and root.get("PROFILE") == "gbs"
+        ):
+            return root
+
+        _log.warning(f"The root element is not <mets:mets> with PROFILE='gbs': {root}")
+        return None
+
+    def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
+        # TODO: use better fallbacks...
+        image_info = self.page_map[page_no].image
+        assert image_info is not None
+        ocr_info = self.page_map[page_no].coordOCR
+        assert ocr_info is not None
+
+        image_file = self._tar.extractfile(image_info.path)
+        assert image_file is not None
+        buf = BytesIO(image_file.read())
+        im: PILImage = Image.open(buf)
+        ocr_file = self._tar.extractfile(ocr_info.path)
+        assert ocr_file is not None
+        ocr_content = ocr_file.read()
+        parser = etree.HTMLParser()
+        ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)
+
+        line_cells: List[TextCell] = []
+        word_cells: List[TextCell] = []
+
+        page_div = ocr_root.xpath("//div[@class='ocr_page']")
+
+        size = Size(width=im.size[0], height=im.size[1])
+        if page_div:
+            title = page_div[0].attrib.get("title", "")
+            rect = _extract_rect(title)
+            if rect:
+                size = Size(width=rect.width, height=rect.height)
+        else:
+            _log.error(f"Could not find ocr_page for page {page_no}")
+
+        im = im.resize(size=(round(size.width), round(size.height)))
+        im = im.convert("RGB")
+
+        # Extract all ocrx_word spans
+        for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
+            text = "".join(word.itertext()).strip()
+            title = word.attrib.get("title", "")
+            rect = _extract_rect(title)
+            conf = _extract_confidence(title)
+            if rect:
+                word_cells.append(
+                    TextCell(
+                        index=ix,
+                        text=text,
+                        orig=text,
+                        rect=rect,
+                        from_ocr=True,
+                        confidence=conf,
+                    )
+                )
+
+        # Extract all ocr_line spans
+        # line: etree._Element
+        for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
+            text = "".join(line.itertext()).strip()
+            title = line.attrib.get("title", "")
+            rect = _extract_rect(title)
+            conf = _extract_confidence(title)
+            if rect:
+                line_cells.append(
+                    TextCell(
+                        index=ix,
+                        text=text,
+                        orig=text,
+                        rect=rect,
+                        from_ocr=True,
+                        confidence=conf,
+                    )
+                )
+
+        page = SegmentedPdfPage(
+            dimension=_get_pdf_page_geometry(size),
+            textline_cells=line_cells,
+            char_cells=[],
+            word_cells=word_cells,
+            has_textlines=True,
+            has_words=True,
+            has_chars=False,
+        )
+        return page, im
+
+    def page_count(self) -> int:
+        return len(self.page_map)
+
+    def load_page(self, page_no: int) -> MetsGbsPageBackend:
+        # TODO: is this thread-safe?
+        page, im = self._parse_page(page_no)
+        return MetsGbsPageBackend(parsed_page=page, page_im=im)
+
+    def is_valid(self) -> bool:
+        return self.root_mets is not None and self.page_count() > 0
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.METS_GBS}
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return True
+
+    def unload(self) -> None:
+        super().unload()
+        self._tar.close()
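
The new backend plugs into docling's standard converter API. Below is a minimal usage sketch (not part of the diff) based on the format options registered later in this diff; the archive path sample_gbs.tar.gz is a placeholder.

```python
# Minimal sketch, not part of the diff: convert a Google Books METS/GBS
# tarball (METS XML + hOCR + page images) with the new backend.
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption

converter = DocumentConverter(
    format_options={
        # Explicit mapping; _get_default_option (see the document_converter.py
        # hunk below) registers the same backend as the default for METS_GBS.
        InputFormat.METS_GBS: PdfFormatOption(backend=MetsGbsDocumentBackend),
    }
)

result = converter.convert("sample_gbs.tar.gz")  # placeholder path
print(result.document.export_to_markdown())
```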
docling/backend/pdf_backend.py CHANGED
@@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
 
             buf.seek(0)
             self.path_or_stream = buf
-        else:
+        elif self.input_format not in self.supported_formats():
             raise RuntimeError(
-                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
+                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
             )
 
     @abstractmethod
@@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
 
     @classmethod
     def supported_formats(cls) -> Set[InputFormat]:
-        return {InputFormat.PDF}
+        return {InputFormat.PDF, InputFormat.IMAGE}
 
     @classmethod
     def supports_pagination(cls) -> bool:
docling/cli/main.py CHANGED
@@ -26,6 +26,7 @@ from rich.console import Console
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -607,9 +608,18 @@ def convert(  # noqa: C901
             backend=backend,  # pdf_backend
         )
 
+        # METS GBS options
+        mets_gbs_options = pipeline_options.model_copy()
+        mets_gbs_options.do_ocr = False
+        mets_gbs_format_option = PdfFormatOption(
+            pipeline_options=mets_gbs_options,
+            backend=MetsGbsDocumentBackend,
+        )
+
         format_options = {
             InputFormat.PDF: pdf_format_option,
             InputFormat.IMAGE: pdf_format_option,
+            InputFormat.METS_GBS: mets_gbs_format_option,
         }
 
     elif pipeline == ProcessingPipeline.VLM:
docling/datamodel/base_models.py CHANGED
@@ -56,6 +56,7 @@ class InputFormat(str, Enum):
     XLSX = "xlsx"
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
+    METS_GBS = "mets_gbs"
     JSON_DOCLING = "json_docling"
     AUDIO = "audio"
 
@@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.CSV: ["csv"],
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
+    InputFormat.METS_GBS: ["tar.gz"],
     InputFormat.JSON_DOCLING: ["json"],
     InputFormat.AUDIO: ["wav", "mp3"],
 }
@@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
+    InputFormat.METS_GBS: ["application/mets+xml"],
     InputFormat.JSON_DOCLING: ["application/json"],
     InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }
docling/datamodel/document.py CHANGED
@@ -1,6 +1,7 @@
 import csv
 import logging
 import re
+import tarfile
 from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
@@ -314,6 +315,10 @@ class _DocumentConversionInput(BaseModel):
         elif objname.endswith(".pptx"):
             mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 
+        if mime is not None and mime.lower() == "application/gzip":
+            if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
+                mime = detected_mime
+
         mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
@@ -457,3 +462,24 @@ class _DocumentConversionInput(BaseModel):
             return None
 
         return None
+
+    @staticmethod
+    def _detect_mets_gbs(
+        obj: Union[Path, DocumentStream],
+    ) -> Optional[Literal["application/mets+xml"]]:
+        content = obj if isinstance(obj, Path) else obj.stream
+        tar: tarfile.TarFile
+        member: tarfile.TarInfo
+        with tarfile.open(
+            name=content if isinstance(content, Path) else None,
+            fileobj=content if isinstance(content, BytesIO) else None,
+            mode="r:gz",
+        ) as tar:
+            for member in tar.getmembers():
+                if member.name.endswith(".xml"):
+                    file = tar.extractfile(member)
+                    if file is not None:
+                        content_str = file.read().decode(errors="ignore")
+                        if "http://www.loc.gov/METS/" in content_str:
+                            return "application/mets+xml"
+        return None
docling/datamodel/pipeline_options.py CHANGED
@@ -323,9 +323,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         ),
     )
 
-    generate_parsed_pages: Literal[True] = (
-        True  # Always True since parsed_page is now mandatory
-    )
+    generate_parsed_pages: bool = False
 
 
 class ProcessingPipeline(str, Enum):
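
Since generate_parsed_pages is now an ordinary boolean defaulting to False, callers that still need the parsed pages kept on the result have to opt in explicitly. A brief sketch (an assumption based on the option shown above, not part of the diff):

```python
# Sketch: opt back in to parsed-page generation now that the default is False.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(generate_parsed_pages=True)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
```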
docling/datamodel/pipeline_options_vlm_model.py CHANGED
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional
 
 from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
@@ -10,11 +10,17 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
+    prompt: str
     scale: float = 2.0
     max_size: Optional[int] = None
     temperature: float = 0.0
 
+    def build_prompt(self, page: Optional[SegmentedPage]) -> str:
+        return self.prompt
+
+    def decode_response(self, text: str) -> str:
+        return text
+
 
 class ResponseFormat(str, Enum):
     DOCTAGS = "doctags"
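
With prompt reduced to a plain string, dynamic prompting and output post-processing move into the new overridable hooks. A hedged sketch of a custom options class (MyVlmOptions and its behavior are hypothetical, not part of the diff):

```python
# Sketch, not from the diff: customizing the new BaseVlmOptions hooks.
from typing import Optional

from docling_core.types.doc.page import SegmentedPage

from docling.datamodel.pipeline_options_vlm_model import BaseVlmOptions


class MyVlmOptions(BaseVlmOptions):  # hypothetical subclass
    kind: str = "my_vlm_options"

    def build_prompt(self, page: Optional[SegmentedPage]) -> str:
        # Augment the static prompt when a parsed page is available.
        note = " A parsed page layout is available." if page is not None else ""
        return self.prompt + note

    def decode_response(self, text: str) -> str:
        # Trim whitespace and a hypothetical "Answer:" prefix from the reply.
        return text.strip().removeprefix("Answer:").strip()


options = MyVlmOptions(prompt="Convert this page to Markdown.")
```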
docling/document_converter.py CHANGED
@@ -20,6 +20,7 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.json.docling_json_backend import DoclingJSONBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
@@ -159,6 +160,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.XML_JATS: FormatOption(
             pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
         ),
+        InputFormat.METS_GBS: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
         ),
docling/models/api_vlm_model.py CHANGED
@@ -53,11 +53,7 @@ class ApiVlmModel(BasePageModel):
                    if hi_res_image.mode != "RGB":
                        hi_res_image = hi_res_image.convert("RGB")
 
-                   if callable(self.vlm_options.prompt):
-                       prompt = self.vlm_options.prompt(page.parsed_page)
-                   else:
-                       prompt = self.vlm_options.prompt
-
+                   prompt = self.vlm_options.build_prompt(page.parsed_page)
                    page_tags = api_image_request(
                        image=hi_res_image,
                        prompt=prompt,
@@ -67,6 +63,7 @@
                        **self.params,
                    )
 
+                   page_tags = self.vlm_options.decode_response(page_tags)
                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
 
            return page