docling 2.44.0__py3-none-any.whl → 2.46.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_v4_backend.py +61 -27
- docling/backend/html_backend.py +356 -80
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/pdf_backend.py +3 -3
- docling/cli/main.py +10 -0
- docling/datamodel/base_models.py +3 -0
- docling/datamodel/document.py +26 -0
- docling/datamodel/pipeline_options.py +1 -3
- docling/datamodel/pipeline_options_vlm_model.py +8 -2
- docling/document_converter.py +4 -0
- docling/models/api_vlm_model.py +2 -5
- docling/models/code_formula_model.py +87 -76
- docling/models/tesseract_ocr_cli_model.py +4 -2
- docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
- docling/models/vlm_models_inline/mlx_model.py +2 -4
- docling/pipeline/base_pipeline.py +14 -5
- docling/pipeline/threaded_standard_pdf_pipeline.py +6 -4
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/METADATA +2 -2
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/RECORD +23 -22
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/WHEEL +0 -0
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/entry_points.txt +0 -0
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/top_level.txt +0 -0
docling/backend/mets_gbs_backend.py
ADDED
@@ -0,0 +1,399 @@
"""Backend for GBS Google Books schema."""

import logging
import tarfile
from collections.abc import Iterable
from dataclasses import dataclass
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union

from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import (
    BoundingRectangle,
    PdfPageBoundaryType,
    PdfPageGeometry,
    SegmentedPdfPage,
    TextCell,
)
from lxml import etree
from PIL import Image
from PIL.Image import Image as PILImage

from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import InputFormat

if TYPE_CHECKING:
    from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)


def _get_pdf_page_geometry(
    size: Size,
) -> PdfPageGeometry:
    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX

    bbox_tuple = (0, 0, size.width, size.height)
    bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.TOPLEFT)

    return PdfPageGeometry(
        angle=0.0,
        rect=BoundingRectangle.from_bounding_box(bbox),
        boundary_type=boundary_type,
        art_bbox=bbox,
        bleed_bbox=bbox,
        crop_bbox=bbox,
        media_bbox=bbox,
        trim_bbox=bbox,
    )


class MetsGbsPageBackend(PdfPageBackend):
    def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage):
        self._im = page_im
        self._dpage = parsed_page
        self.valid = parsed_page is not None

    def is_valid(self) -> bool:
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        for i, cell in enumerate(self._dpage.textline_cells):
            cell_bbox = (
                cell.rect.to_bounding_box()
                .to_top_left_origin(page_height=page_size.height)
                .scaled(scale)
            )

            overlap_frac = cell_bbox.intersection_over_self(bbox)

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell.text

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 0  # 32 * 32

        images = self._dpage.bitmap_resources

        for img in images:
            cropbox = img.rect.to_bounding_box().to_top_left_origin(
                self.get_size().height
            )

            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        page_size = self.get_size()
        assert (
            page_size.width == self._im.size[0] and page_size.height == self._im.size[1]
        )

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )

        image = self._im.resize(
            size=(round(page_size.width * scale), round(page_size.height * scale))
        ).crop(cropbox.scaled(scale=scale).as_tuple())
        return image

    def get_size(self) -> Size:
        return Size(
            width=self._dpage.dimension.width, height=self._dpage.dimension.height
        )

    def unload(self) -> None:
        if hasattr(self, "_im"):
            delattr(self, "_im")
        if hasattr(self, "_dpage"):
            delattr(self, "_dpage")


class _UseType(str, Enum):
    IMAGE = "image"
    OCR = "OCR"
    COORD_OCR = "coordOCR"


@dataclass
class _FileInfo:
    file_id: str
    mimetype: str
    path: str
    use: _UseType


@dataclass
class _PageFiles:
    image: Optional[_FileInfo] = None
    ocr: Optional[_FileInfo] = None
    coordOCR: Optional[_FileInfo] = None


def _extract_rect(title_str: str) -> Optional[BoundingRectangle]:
    """
    Extracts bbox from title string like 'bbox 279 177 306 214;x_wconf 97'
    """
    parts = title_str.split(";")
    for part in parts:
        part = part.strip()
        if part.startswith("bbox "):
            try:
                coords = part.split()[1:]
                rect = BoundingRectangle.from_bounding_box(
                    bbox=BoundingBox.from_tuple(
                        tuple(map(int, coords)), origin=CoordOrigin.TOPLEFT
                    )
                )
                return rect
            except Exception:
                return None
    return None


def _extract_confidence(title_str) -> float:
    """Extracts x_wconf (OCR confidence) value from title string."""
    for part in title_str.split(";"):
        part = part.strip()
        if part.startswith("x_wconf"):
            try:
                return float(part.split()[1]) / 100.0
            except Exception:
                return 1
    return 1


class MetsGbsDocumentBackend(PdfDocumentBackend):
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

        self._tar: tarfile.TarFile = (
            tarfile.open(name=self.path_or_stream, mode="r:gz")
            if isinstance(self.path_or_stream, Path)
            else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
        )
        self.root_mets: Optional[etree._Element] = None
        self.page_map: Dict[int, _PageFiles] = {}

        for member in self._tar.getmembers():
            if member.name.endswith(".xml"):
                file = self._tar.extractfile(member)
                if file is not None:
                    content = file.read()
                    self.root_mets = self._validate_mets_xml(content)
                    if self.root_mets is not None:
                        break

        if self.root_mets is None:
            raise RuntimeError(
                f"METS GBS backend could not load document {self.document_hash}."
            )

        ns = {
            "mets": "http://www.loc.gov/METS/",
            "xlink": "http://www.w3.org/1999/xlink",
            "xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "gbs": "http://books.google.com/gbs",
            "premis": "info:lc/xmlns/premis-v2",
            "marc": "http://www.loc.gov/MARC21/slim",
        }

        file_info_by_id: Dict[str, _FileInfo] = {}

        for filegrp in self.root_mets.xpath(".//mets:fileGrp", namespaces=ns):
            use_raw = filegrp.get("USE")
            try:
                use = _UseType(use_raw)
            except ValueError:
                continue  # Ignore unknown USE types

            for file_elem in filegrp.xpath("./mets:file", namespaces=ns):
                file_id = file_elem.get("ID")
                mimetype = file_elem.get("MIMETYPE")
                flocat_elem = file_elem.find("mets:FLocat", namespaces=ns)
                href = (
                    flocat_elem.get("{http://www.w3.org/1999/xlink}href")
                    if flocat_elem is not None
                    else None
                )
                if href is None:
                    continue

                file_info_by_id[file_id] = _FileInfo(
                    file_id=file_id, mimetype=mimetype, path=href, use=use
                )

        USE_TO_ATTR = {
            _UseType.IMAGE: "image",
            _UseType.OCR: "ocr",
            _UseType.COORD_OCR: "coordOCR",
        }

        for div in self.root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns):
            order_str = div.get("ORDER")
            if not order_str:
                continue
            try:
                page_no = int(order_str) - 1  # make 0-index pages
            except ValueError:
                continue

            page_files = _PageFiles()

            for fptr in div.xpath("./mets:fptr", namespaces=ns):
                file_id = fptr.get("FILEID")
                file_info = file_info_by_id.get(file_id)

                if file_info:
                    attr = USE_TO_ATTR.get(file_info.use)
                    if attr:
                        setattr(page_files, attr, file_info)

            self.page_map[page_no] = page_files

    def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]:
        root: etree._Element = etree.fromstring(xml_string)
        if (
            root.tag == "{http://www.loc.gov/METS/}mets"
            and root.get("PROFILE") == "gbs"
        ):
            return root

        _log.warning(f"The root element is not <mets:mets> with PROFILE='gbs': {root}")
        return None

    def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
        # TODO: use better fallbacks...
        image_info = self.page_map[page_no].image
        assert image_info is not None
        ocr_info = self.page_map[page_no].coordOCR
        assert ocr_info is not None

        image_file = self._tar.extractfile(image_info.path)
        assert image_file is not None
        buf = BytesIO(image_file.read())
        im: PILImage = Image.open(buf)
        ocr_file = self._tar.extractfile(ocr_info.path)
        assert ocr_file is not None
        ocr_content = ocr_file.read()
        parser = etree.HTMLParser()
        ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)

        line_cells: List[TextCell] = []
        word_cells: List[TextCell] = []

        page_div = ocr_root.xpath("//div[@class='ocr_page']")

        size = Size(width=im.size[0], height=im.size[1])
        if page_div:
            title = page_div[0].attrib.get("title", "")
            rect = _extract_rect(title)
            if rect:
                size = Size(width=rect.width, height=rect.height)
        else:
            _log.error(f"Could not find ocr_page for page {page_no}")

        im = im.resize(size=(round(size.width), round(size.height)))
        im = im.convert("RGB")

        # Extract all ocrx_word spans
        for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
            text = "".join(word.itertext()).strip()
            title = word.attrib.get("title", "")
            rect = _extract_rect(title)
            conf = _extract_confidence(title)
            if rect:
                word_cells.append(
                    TextCell(
                        index=ix,
                        text=text,
                        orig=text,
                        rect=rect,
                        from_ocr=True,
                        confidence=conf,
                    )
                )

        # Extract all ocr_line spans
        # line: etree._Element
        for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
            text = "".join(line.itertext()).strip()
            title = line.attrib.get("title", "")
            rect = _extract_rect(title)
            conf = _extract_confidence(title)
            if rect:
                line_cells.append(
                    TextCell(
                        index=ix,
                        text=text,
                        orig=text,
                        rect=rect,
                        from_ocr=True,
                        confidence=conf,
                    )
                )

        page = SegmentedPdfPage(
            dimension=_get_pdf_page_geometry(size),
            textline_cells=line_cells,
            char_cells=[],
            word_cells=word_cells,
            has_textlines=True,
            has_words=True,
            has_chars=False,
        )
        return page, im

    def page_count(self) -> int:
        return len(self.page_map)

    def load_page(self, page_no: int) -> MetsGbsPageBackend:
        # TODO: is this thread-safe?
        page, im = self._parse_page(page_no)
        return MetsGbsPageBackend(parsed_page=page, page_im=im)

    def is_valid(self) -> bool:
        return self.root_mets is not None and self.page_count() > 0

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.METS_GBS}

    @classmethod
    def supports_pagination(cls) -> bool:
        return True

    def unload(self) -> None:
        super().unload()
        self._tar.close()
docling/backend/pdf_backend.py
CHANGED
@@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
 
             buf.seek(0)
             self.path_or_stream = buf
-
+        elif self.input_format not in self.supported_formats():
             raise RuntimeError(
-                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
+                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
             )
 
     @abstractmethod
@@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
 
     @classmethod
     def supported_formats(cls) -> Set[InputFormat]:
-        return {InputFormat.PDF}
+        return {InputFormat.PDF, InputFormat.IMAGE}
 
     @classmethod
     def supports_pagination(cls) -> bool:
docling/cli/main.py
CHANGED
@@ -26,6 +26,7 @@ from rich.console import Console
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -607,9 +608,18 @@
             backend=backend,  # pdf_backend
         )
 
+        # METS GBS options
+        mets_gbs_options = pipeline_options.model_copy()
+        mets_gbs_options.do_ocr = False
+        mets_gbs_format_option = PdfFormatOption(
+            pipeline_options=mets_gbs_options,
+            backend=MetsGbsDocumentBackend,
+        )
+
         format_options = {
             InputFormat.PDF: pdf_format_option,
             InputFormat.IMAGE: pdf_format_option,
+            InputFormat.METS_GBS: mets_gbs_format_option,
         }
 
     elif pipeline == ProcessingPipeline.VLM:
docling/datamodel/base_models.py
CHANGED
@@ -56,6 +56,7 @@ class InputFormat(str, Enum):
     XLSX = "xlsx"
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
+    METS_GBS = "mets_gbs"
     JSON_DOCLING = "json_docling"
     AUDIO = "audio"
 
@@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.CSV: ["csv"],
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
+    InputFormat.METS_GBS: ["tar.gz"],
     InputFormat.JSON_DOCLING: ["json"],
     InputFormat.AUDIO: ["wav", "mp3"],
 }
@@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
+    InputFormat.METS_GBS: ["application/mets+xml"],
     InputFormat.JSON_DOCLING: ["application/json"],
     InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }
docling/datamodel/document.py
CHANGED
@@ -1,6 +1,7 @@
 import csv
 import logging
 import re
+import tarfile
 from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
@@ -314,6 +315,10 @@ class _DocumentConversionInput(BaseModel):
         elif objname.endswith(".pptx"):
             mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 
+        if mime is not None and mime.lower() == "application/gzip":
+            if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
+                mime = detected_mime
+
         mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
@@ -457,3 +462,24 @@ class _DocumentConversionInput(BaseModel):
         return None
 
         return None
+
+    @staticmethod
+    def _detect_mets_gbs(
+        obj: Union[Path, DocumentStream],
+    ) -> Optional[Literal["application/mets+xml"]]:
+        content = obj if isinstance(obj, Path) else obj.stream
+        tar: tarfile.TarFile
+        member: tarfile.TarInfo
+        with tarfile.open(
+            name=content if isinstance(content, Path) else None,
+            fileobj=content if isinstance(content, BytesIO) else None,
+            mode="r:gz",
+        ) as tar:
+            for member in tar.getmembers():
+                if member.name.endswith(".xml"):
+                    file = tar.extractfile(member)
+                    if file is not None:
+                        content_str = file.read().decode(errors="ignore")
+                        if "http://www.loc.gov/METS/" in content_str:
+                            return "application/mets+xml"
+        return None
docling/datamodel/pipeline_options.py
CHANGED
@@ -323,9 +323,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         ),
     )
 
-    generate_parsed_pages:
-        True  # Always True since parsed_page is now mandatory
-    )
+    generate_parsed_pages: bool = False
 
 
 class ProcessingPipeline(str, Enum):
docling/datamodel/pipeline_options_vlm_model.py
CHANGED
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any,
+from typing import Any, Dict, List, Literal, Optional
 
 from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
@@ -10,11 +10,17 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt:
+    prompt: str
     scale: float = 2.0
     max_size: Optional[int] = None
     temperature: float = 0.0
 
+    def build_prompt(self, page: Optional[SegmentedPage]) -> str:
+        return self.prompt
+
+    def decode_response(self, text: str) -> str:
+        return text
+
 
 class ResponseFormat(str, Enum):
     DOCTAGS = "doctags"
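
The two new hooks let callers customize prompting and response post-processing by subclassing the options model instead of passing a callable prompt. A minimal sketch, assuming only the BaseVlmOptions fields shown above (the subclass name, kind label, and fence-stripping logic are illustrative, not part of the release):

    from typing import Optional

    from docling_core.types.doc.page import SegmentedPage

    from docling.datamodel.pipeline_options_vlm_model import BaseVlmOptions


    class MarkdownVlmOptions(BaseVlmOptions):
        kind: str = "custom_markdown"  # hypothetical kind label
        prompt: str = "Convert this page to markdown."

        def build_prompt(self, page: Optional[SegmentedPage]) -> str:
            # The parsed page (when available) could be inspected here;
            # keep the static prompt otherwise.
            return self.prompt

        def decode_response(self, text: str) -> str:
            # Strip a ``` fence some hosted models wrap around their answer.
            text = text.strip()
            if text.startswith("```"):
                text = text.strip("`").removeprefix("markdown").strip()
            return text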
docling/document_converter.py
CHANGED
@@ -20,6 +20,7 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.json.docling_json_backend import DoclingJSONBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
@@ -159,6 +160,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.XML_JATS: FormatOption(
             pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
         ),
+        InputFormat.METS_GBS: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
         ),
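
With a default format option registered here, a Google Books tarball converts like any other supported input. A minimal usage sketch; the archive name is hypothetical:

    from docling.document_converter import DocumentConverter

    # "my_book.tar.gz" stands in for a Google Books export (METS XML + hOCR + page images).
    converter = DocumentConverter()
    result = converter.convert("my_book.tar.gz")
    print(result.document.export_to_markdown())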
docling/models/api_vlm_model.py
CHANGED
@@ -53,11 +53,7 @@ class ApiVlmModel(BasePageModel):
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
 
-
-                    prompt = self.vlm_options.prompt(page.parsed_page)
-                else:
-                    prompt = self.vlm_options.prompt
-
+                prompt = self.vlm_options.build_prompt(page.parsed_page)
                 page_tags = api_image_request(
                     image=hi_res_image,
                     prompt=prompt,
@@ -67,6 +63,7 @@ class ApiVlmModel(BasePageModel):
                     **self.params,
                 )
 
+                page_tags = self.vlm_options.decode_response(page_tags)
                 page.predictions.vlm_response = VlmPrediction(text=page_tags)
 
                 return page