docling 2.43.0__py3-none-any.whl → 2.45.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +406 -69
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/pdf_backend.py +3 -3
- docling/cli/main.py +16 -0
- docling/datamodel/base_models.py +3 -0
- docling/datamodel/document.py +26 -0
- docling/datamodel/pipeline_options_vlm_model.py +8 -2
- docling/document_converter.py +34 -0
- docling/models/api_vlm_model.py +2 -5
- docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
- docling/models/vlm_models_inline/mlx_model.py +4 -6
- docling/pipeline/base_pipeline.py +7 -4
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/METADATA +2 -2
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/RECORD +18 -17
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/WHEEL +0 -0
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/entry_points.txt +0 -0
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.43.0.dist-info → docling-2.45.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,399 @@
|
|
1
|
+
"""Backend for GBS Google Books schema."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import tarfile
|
5
|
+
from collections.abc import Iterable
|
6
|
+
from dataclasses import dataclass
|
7
|
+
from enum import Enum
|
8
|
+
from io import BytesIO
|
9
|
+
from pathlib import Path
|
10
|
+
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
|
11
|
+
|
12
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
13
|
+
from docling_core.types.doc.page import (
|
14
|
+
BoundingRectangle,
|
15
|
+
PdfPageBoundaryType,
|
16
|
+
PdfPageGeometry,
|
17
|
+
SegmentedPdfPage,
|
18
|
+
TextCell,
|
19
|
+
)
|
20
|
+
from lxml import etree
|
21
|
+
from PIL import Image
|
22
|
+
from PIL.Image import Image as PILImage
|
23
|
+
|
24
|
+
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
25
|
+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
26
|
+
from docling.datamodel.base_models import InputFormat
|
27
|
+
|
28
|
+
if TYPE_CHECKING:
|
29
|
+
from docling.datamodel.document import InputDocument
|
30
|
+
|
31
|
+
_log = logging.getLogger(__name__)
|
32
|
+
|
33
|
+
|
34
|
+
def _get_pdf_page_geometry(
    size: Size,
) -> PdfPageGeometry:
    """Build an unrotated page geometry covering the whole page.

    Every box (art, bleed, crop, media, trim) is set to the same
    top-left-origin rectangle ``(0, 0, size.width, size.height)``.
    """
    full_page = BoundingBox.from_tuple(
        (0, 0, size.width, size.height), CoordOrigin.TOPLEFT
    )

    return PdfPageGeometry(
        angle=0.0,
        rect=BoundingRectangle.from_bounding_box(full_page),
        boundary_type=PdfPageBoundaryType.CROP_BOX,
        art_bbox=full_page,
        bleed_bbox=full_page,
        crop_bbox=full_page,
        media_bbox=full_page,
        trim_bbox=full_page,
    )
|
52
|
+
|
53
|
+
|
54
|
+
class MetsGbsPageBackend(PdfPageBackend):
    """Page backend wrapping one already-parsed page and its raster image."""

    def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage):
        self._im = page_im
        self._dpage = parsed_page
        # The page counts as valid as soon as a parsed page object exists.
        self.valid = parsed_page is not None

    def is_valid(self) -> bool:
        """Return whether a parsed page was supplied at construction."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Join the text of all text lines lying mostly inside ``bbox``.

        A line is included when more than half of its own area overlaps
        ``bbox``; included lines are separated by single spaces.
        """
        page_height = self.get_size().height

        # FIX - Replace with param in get_text_in_rect across backends (optional)
        scale = 1

        text_piece = ""
        for cell in self._dpage.textline_cells:
            cell_bbox = cell.rect.to_bounding_box()
            cell_bbox = cell_bbox.to_top_left_origin(page_height=page_height)
            cell_bbox = cell_bbox.scaled(scale)

            if cell_bbox.intersection_over_self(bbox) > 0.5:
                separator = " " if len(text_piece) > 0 else ""
                text_piece = text_piece + separator + cell.text

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return the parsed page handed to the constructor."""
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the text-line cells of the parsed page."""
        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield scaled top-left-origin boxes of the page's bitmap resources.

        Only boxes whose area exceeds ``AREA_THRESHOLD`` are yielded.
        """
        AREA_THRESHOLD = 0  # 32 * 32

        for img in self._dpage.bitmap_resources:
            cropbox = img.rect.to_bounding_box().to_top_left_origin(
                self.get_size().height
            )
            if cropbox.area() > AREA_THRESHOLD:
                yield cropbox.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return the page image resized by ``scale`` and cropped to ``cropbox``.

        Without a ``cropbox`` the whole page is returned. The stored image
        must have exactly the page dimensions.
        """
        page_size = self.get_size()
        im_width, im_height = self._im.size
        assert page_size.width == im_width and page_size.height == im_height

        if not cropbox:
            # Default to the full page in top-left coordinates.
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )

        target_size = (
            round(page_size.width * scale),
            round(page_size.height * scale),
        )
        resized = self._im.resize(size=target_size)
        return resized.crop(cropbox.scaled(scale=scale).as_tuple())

    def get_size(self) -> Size:
        """Return the page size taken from the parsed page's dimension."""
        dimension = self._dpage.dimension
        return Size(width=dimension.width, height=dimension.height)

    def unload(self) -> None:
        """Drop the references to the image and the parsed page, if present."""
        for attr_name in ("_im", "_dpage"):
            if hasattr(self, attr_name):
                delattr(self, attr_name)
|
141
|
+
|
142
|
+
|
143
|
+
class _UseType(str, Enum):
|
144
|
+
IMAGE = "image"
|
145
|
+
OCR = "OCR"
|
146
|
+
COORD_OCR = "coordOCR"
|
147
|
+
|
148
|
+
|
149
|
+
@dataclass
class _FileInfo:
    """One file entry of the METS manifest."""

    # Value of the file element's ID attribute.
    file_id: str
    # Value of the file element's MIMETYPE attribute.
    mimetype: str
    # xlink:href of the file's FLocat element; used as the member name
    # when the file is extracted from the archive.
    path: str
    # USE value of the file group the entry belongs to.
    use: _UseType
|
155
|
+
|
156
|
+
|
157
|
+
@dataclass
class _PageFiles:
    """Files referenced by a single page, one slot per recognized USE type."""

    # File from the "image" file group, if the page references one.
    image: Optional[_FileInfo] = None
    # File from the "OCR" file group, if the page references one.
    ocr: Optional[_FileInfo] = None
    # File from the "coordOCR" file group, if the page references one.
    coordOCR: Optional[_FileInfo] = None
|
162
|
+
|
163
|
+
|
164
|
+
def _extract_rect(title_str: str) -> Optional[BoundingRectangle]:
    """Parse the first ``bbox`` entry of a ``;``-separated title string.

    Example input: 'bbox 279 177 306 214;x_wconf 97'. The coordinates are
    read as integers in top-left origin. Returns ``None`` when no entry
    starts with ``bbox `` or when the first such entry cannot be converted.
    """
    for entry in (chunk.strip() for chunk in title_str.split(";")):
        if not entry.startswith("bbox "):
            continue
        try:
            values = tuple(int(token) for token in entry.split()[1:])
            return BoundingRectangle.from_bounding_box(
                bbox=BoundingBox.from_tuple(values, origin=CoordOrigin.TOPLEFT)
            )
        except Exception:
            # Best effort: a malformed bbox is treated as missing.
            return None
    return None
|
183
|
+
|
184
|
+
|
185
|
+
def _extract_confidence(title_str) -> float:
|
186
|
+
"""Extracts x_wconf (OCR confidence) value from title string."""
|
187
|
+
for part in title_str.split(";"):
|
188
|
+
part = part.strip()
|
189
|
+
if part.startswith("x_wconf"):
|
190
|
+
try:
|
191
|
+
return float(part.split()[1]) / 100.0
|
192
|
+
except Exception:
|
193
|
+
return 1
|
194
|
+
return 1
|
195
|
+
|
196
|
+
|
197
|
+
class MetsGbsDocumentBackend(PdfDocumentBackend):
    """Paginated backend for gzip-compressed tar archives with a GBS METS manifest.

    On construction the archive is opened, the first ``.xml`` member whose
    root is ``<mets:mets PROFILE="gbs">`` is taken as the manifest, and a
    map from 0-based page number to the page's image / OCR files is built.
    Pages are parsed lazily by :meth:`load_page`.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the archive and index its manifest.

        Raises:
            RuntimeError: If no member of the archive is a GBS METS manifest.
        """
        super().__init__(in_doc, path_or_stream)

        self._tar: tarfile.TarFile = (
            tarfile.open(name=self.path_or_stream, mode="r:gz")
            if isinstance(self.path_or_stream, Path)
            else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
        )
        self.root_mets: Optional[etree._Element] = None
        self.page_map: Dict[int, _PageFiles] = {}

        try:
            self._load_mets_root()
            self._build_page_map()
        except Exception:
            # unload() is never reached for a backend that failed to
            # construct, so release the archive handle here.
            self._tar.close()
            raise

    def _load_mets_root(self) -> None:
        """Set ``root_mets`` to the first valid manifest among the ``.xml`` members."""
        for member in self._tar.getmembers():
            if member.name.endswith(".xml"):
                file = self._tar.extractfile(member)
                if file is not None:
                    content = file.read()
                    self.root_mets = self._validate_mets_xml(content)
                    if self.root_mets is not None:
                        break

        if self.root_mets is None:
            raise RuntimeError(
                f"METS GBS backend could not load document {self.document_hash}."
            )

    def _build_page_map(self) -> None:
        """Fill ``page_map`` from the manifest's file groups and page divs."""
        assert self.root_mets is not None

        ns = {
            "mets": "http://www.loc.gov/METS/",
            "xlink": "http://www.w3.org/1999/xlink",
            "xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "gbs": "http://books.google.com/gbs",
            "premis": "info:lc/xmlns/premis-v2",
            "marc": "http://www.loc.gov/MARC21/slim",
        }

        # First pass: every file with a location, keyed by its ID.
        file_info_by_id: Dict[str, _FileInfo] = {}

        for filegrp in self.root_mets.xpath(".//mets:fileGrp", namespaces=ns):
            use_raw = filegrp.get("USE")
            try:
                use = _UseType(use_raw)
            except ValueError:
                continue  # Ignore unknown USE types

            for file_elem in filegrp.xpath("./mets:file", namespaces=ns):
                file_id = file_elem.get("ID")
                mimetype = file_elem.get("MIMETYPE")
                flocat_elem = file_elem.find("mets:FLocat", namespaces=ns)
                href = (
                    flocat_elem.get("{http://www.w3.org/1999/xlink}href")
                    if flocat_elem is not None
                    else None
                )
                if href is None:
                    continue

                file_info_by_id[file_id] = _FileInfo(
                    file_id=file_id, mimetype=mimetype, path=href, use=use
                )

        USE_TO_ATTR = {
            _UseType.IMAGE: "image",
            _UseType.OCR: "ocr",
            _UseType.COORD_OCR: "coordOCR",
        }

        # Second pass: attach the referenced files to each page div.
        for div in self.root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns):
            order_str = div.get("ORDER")
            if not order_str:
                continue
            try:
                page_no = int(order_str) - 1  # make 0-index pages
            except ValueError:
                continue

            page_files = _PageFiles()

            for fptr in div.xpath("./mets:fptr", namespaces=ns):
                file_id = fptr.get("FILEID")
                file_info = file_info_by_id.get(file_id)

                if file_info:
                    attr = USE_TO_ATTR.get(file_info.use)
                    if attr:
                        setattr(page_files, attr, file_info)

            self.page_map[page_no] = page_files

    def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]:
        """Parse ``xml_string`` and return its root if it is a GBS METS manifest.

        Returns ``None`` (after logging a warning) when the content is not
        well-formed XML or when the root is not ``<mets:mets>`` with
        ``PROFILE="gbs"``.
        """
        # NOTE(review): parsed with lxml's default parser settings; confirm
        # these are acceptable for archives from untrusted sources.
        try:
            root: etree._Element = etree.fromstring(xml_string)
        except etree.XMLSyntaxError as exc:
            # A broken XML member must not stop the search for the manifest.
            _log.warning(f"Skipping XML member that could not be parsed: {exc}")
            return None

        if (
            root.tag == "{http://www.loc.gov/METS/}mets"
            and root.get("PROFILE") == "gbs"
        ):
            return root

        _log.warning(f"The root element is not <mets:mets> with PROFILE='gbs': {root}")
        return None

    def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
        """Read the image and coordinate-OCR file of a page from the archive.

        The OCR file is parsed as HTML. Word cells come from
        ``span.ocrx_word`` elements and line cells from ``span.ocr_line``
        elements; elements without a parsable ``bbox`` are skipped. The
        page size is the ``bbox`` of the ``div.ocr_page`` element when
        available, otherwise the image size, and the image is resized to
        that size and converted to RGB.
        """
        # TODO: use better fallbacks...
        image_info = self.page_map[page_no].image
        assert image_info is not None
        ocr_info = self.page_map[page_no].coordOCR
        assert ocr_info is not None

        image_file = self._tar.extractfile(image_info.path)
        assert image_file is not None
        buf = BytesIO(image_file.read())
        im: PILImage = Image.open(buf)
        ocr_file = self._tar.extractfile(ocr_info.path)
        assert ocr_file is not None
        ocr_content = ocr_file.read()
        parser = etree.HTMLParser()
        ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)

        line_cells: List[TextCell] = []
        word_cells: List[TextCell] = []

        page_div = ocr_root.xpath("//div[@class='ocr_page']")

        size = Size(width=im.size[0], height=im.size[1])
        if page_div:
            title = page_div[0].attrib.get("title", "")
            rect = _extract_rect(title)
            if rect:
                size = Size(width=rect.width, height=rect.height)
        else:
            _log.error(f"Could not find ocr_page for page {page_no}")

        im = im.resize(size=(round(size.width), round(size.height)))
        im = im.convert("RGB")

        # Extract all ocrx_word spans
        for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
            text = "".join(word.itertext()).strip()
            title = word.attrib.get("title", "")
            rect = _extract_rect(title)
            conf = _extract_confidence(title)
            if rect:
                word_cells.append(
                    TextCell(
                        index=ix,
                        text=text,
                        orig=text,
                        rect=rect,
                        from_ocr=True,
                        confidence=conf,
                    )
                )

        # Extract all ocr_line spans
        # line: etree._Element
        for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
            text = "".join(line.itertext()).strip()
            title = line.attrib.get("title", "")
            rect = _extract_rect(title)
            conf = _extract_confidence(title)
            if rect:
                line_cells.append(
                    TextCell(
                        index=ix,
                        text=text,
                        orig=text,
                        rect=rect,
                        from_ocr=True,
                        confidence=conf,
                    )
                )

        page = SegmentedPdfPage(
            dimension=_get_pdf_page_geometry(size),
            textline_cells=line_cells,
            char_cells=[],
            word_cells=word_cells,
            has_textlines=True,
            has_words=True,
            has_chars=False,
        )
        return page, im

    def page_count(self) -> int:
        """Return the number of pages indexed from the manifest."""
        return len(self.page_map)

    def load_page(self, page_no: int) -> MetsGbsPageBackend:
        """Parse page ``page_no`` (0-based) and wrap it in a page backend."""
        # TODO: is this thread-safe?
        page, im = self._parse_page(page_no)
        return MetsGbsPageBackend(parsed_page=page, page_im=im)

    def is_valid(self) -> bool:
        """Return whether a manifest was found and it lists at least one page."""
        return self.root_mets is not None and self.page_count() > 0

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the input formats handled by this backend."""
        return {InputFormat.METS_GBS}

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return ``True``: the backend serves documents page by page."""
        return True

    def unload(self) -> None:
        """Run the parent's unload, then close the archive."""
        super().unload()
        self._tar.close()
|
docling/backend/pdf_backend.py
CHANGED
@@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
|
84
84
|
|
85
85
|
buf.seek(0)
|
86
86
|
self.path_or_stream = buf
|
87
|
-
|
87
|
+
elif self.input_format not in self.supported_formats():
|
88
88
|
raise RuntimeError(
|
89
|
-
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
|
89
|
+
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
|
90
90
|
)
|
91
91
|
|
92
92
|
@abstractmethod
|
@@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
|
99
99
|
|
100
100
|
@classmethod
|
101
101
|
def supported_formats(cls) -> Set[InputFormat]:
|
102
|
-
return {InputFormat.PDF}
|
102
|
+
return {InputFormat.PDF, InputFormat.IMAGE}
|
103
103
|
|
104
104
|
@classmethod
|
105
105
|
def supports_pagination(cls) -> bool:
|
docling/cli/main.py
CHANGED
@@ -26,6 +26,7 @@ from rich.console import Console
|
|
26
26
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
27
27
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
28
28
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
29
|
+
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
|
29
30
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
30
31
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
31
32
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
@@ -262,6 +263,12 @@ def export_documents(
|
|
262
263
|
|
263
264
|
else:
|
264
265
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
266
|
+
if _log.isEnabledFor(logging.INFO):
|
267
|
+
for err in conv_res.errors:
|
268
|
+
_log.info(
|
269
|
+
f" [Failure Detail] Component: {err.component_type}, "
|
270
|
+
f"Module: {err.module_name}, Message: {err.error_message}"
|
271
|
+
)
|
265
272
|
failure_count += 1
|
266
273
|
|
267
274
|
_log.info(
|
@@ -601,9 +608,18 @@ def convert( # noqa: C901
|
|
601
608
|
backend=backend, # pdf_backend
|
602
609
|
)
|
603
610
|
|
611
|
+
# METS GBS options
|
612
|
+
mets_gbs_options = pipeline_options.model_copy()
|
613
|
+
mets_gbs_options.do_ocr = False
|
614
|
+
mets_gbs_format_option = PdfFormatOption(
|
615
|
+
pipeline_options=mets_gbs_options,
|
616
|
+
backend=MetsGbsDocumentBackend,
|
617
|
+
)
|
618
|
+
|
604
619
|
format_options = {
|
605
620
|
InputFormat.PDF: pdf_format_option,
|
606
621
|
InputFormat.IMAGE: pdf_format_option,
|
622
|
+
InputFormat.METS_GBS: mets_gbs_format_option,
|
607
623
|
}
|
608
624
|
|
609
625
|
elif pipeline == ProcessingPipeline.VLM:
|
docling/datamodel/base_models.py
CHANGED
@@ -56,6 +56,7 @@ class InputFormat(str, Enum):
|
|
56
56
|
XLSX = "xlsx"
|
57
57
|
XML_USPTO = "xml_uspto"
|
58
58
|
XML_JATS = "xml_jats"
|
59
|
+
METS_GBS = "mets_gbs"
|
59
60
|
JSON_DOCLING = "json_docling"
|
60
61
|
AUDIO = "audio"
|
61
62
|
|
@@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
81
82
|
InputFormat.CSV: ["csv"],
|
82
83
|
InputFormat.XLSX: ["xlsx", "xlsm"],
|
83
84
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
85
|
+
InputFormat.METS_GBS: ["tar.gz"],
|
84
86
|
InputFormat.JSON_DOCLING: ["json"],
|
85
87
|
InputFormat.AUDIO: ["wav", "mp3"],
|
86
88
|
}
|
@@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
113
115
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
114
116
|
],
|
115
117
|
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
118
|
+
InputFormat.METS_GBS: ["application/mets+xml"],
|
116
119
|
InputFormat.JSON_DOCLING: ["application/json"],
|
117
120
|
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
|
118
121
|
}
|
docling/datamodel/document.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import csv
|
2
2
|
import logging
|
3
3
|
import re
|
4
|
+
import tarfile
|
4
5
|
from collections.abc import Iterable
|
5
6
|
from enum import Enum
|
6
7
|
from io import BytesIO
|
@@ -314,6 +315,10 @@ class _DocumentConversionInput(BaseModel):
|
|
314
315
|
elif objname.endswith(".pptx"):
|
315
316
|
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
316
317
|
|
318
|
+
if mime is not None and mime.lower() == "application/gzip":
|
319
|
+
if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
|
320
|
+
mime = detected_mime
|
321
|
+
|
317
322
|
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
318
323
|
mime = mime or _DocumentConversionInput._detect_csv(content)
|
319
324
|
mime = mime or "text/plain"
|
@@ -457,3 +462,24 @@ class _DocumentConversionInput(BaseModel):
|
|
457
462
|
return None
|
458
463
|
|
459
464
|
return None
|
465
|
+
|
466
|
+
@staticmethod
def _detect_mets_gbs(
    obj: Union[Path, DocumentStream],
) -> Optional[Literal["application/mets+xml"]]:
    """Detect a METS archive inside a gzip-compressed tarball.

    Returns ``"application/mets+xml"`` when some ``.xml`` member of the
    archive contains the METS namespace URI, otherwise ``None``. Input
    that cannot be read as a gzip-compressed tar archive also yields
    ``None`` so that the caller keeps its previously detected MIME type.
    For stream input the stream position is restored before returning.
    """
    content = obj if isinstance(obj, Path) else obj.stream
    # Remember where the stream was so later readers are not affected.
    stream_pos = content.tell() if isinstance(content, BytesIO) else None
    tar: tarfile.TarFile
    member: tarfile.TarInfo
    try:
        with tarfile.open(
            name=content if isinstance(content, Path) else None,
            fileobj=content if isinstance(content, BytesIO) else None,
            mode="r:gz",
        ) as tar:
            for member in tar.getmembers():
                if member.name.endswith(".xml"):
                    file = tar.extractfile(member)
                    if file is not None:
                        content_str = file.read().decode(errors="ignore")
                        if "http://www.loc.gov/METS/" in content_str:
                            return "application/mets+xml"
    except (tarfile.TarError, OSError, EOFError):
        # Plain or truncated gzip data is not a METS archive.
        return None
    finally:
        if stream_pos is not None:
            content.seek(stream_pos)
    return None
|
@@ -1,5 +1,5 @@
|
|
1
1
|
from enum import Enum
|
2
|
-
from typing import Any,
|
2
|
+
from typing import Any, Dict, List, Literal, Optional
|
3
3
|
|
4
4
|
from docling_core.types.doc.page import SegmentedPage
|
5
5
|
from pydantic import AnyUrl, BaseModel
|
@@ -10,11 +10,17 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
10
10
|
|
11
11
|
class BaseVlmOptions(BaseModel):
|
12
12
|
kind: str
|
13
|
-
prompt:
|
13
|
+
prompt: str
|
14
14
|
scale: float = 2.0
|
15
15
|
max_size: Optional[int] = None
|
16
16
|
temperature: float = 0.0
|
17
17
|
|
18
|
+
def build_prompt(self, page: Optional[SegmentedPage]) -> str:
|
19
|
+
return self.prompt
|
20
|
+
|
21
|
+
def decode_response(self, text: str) -> str:
|
22
|
+
return text
|
23
|
+
|
18
24
|
|
19
25
|
class ResponseFormat(str, Enum):
|
20
26
|
DOCTAGS = "doctags"
|
docling/document_converter.py
CHANGED
@@ -5,7 +5,9 @@ import threading
|
|
5
5
|
import time
|
6
6
|
from collections.abc import Iterable, Iterator
|
7
7
|
from concurrent.futures import ThreadPoolExecutor
|
8
|
+
from datetime import datetime
|
8
9
|
from functools import partial
|
10
|
+
from io import BytesIO
|
9
11
|
from pathlib import Path
|
10
12
|
from typing import Dict, List, Optional, Tuple, Type, Union
|
11
13
|
|
@@ -18,6 +20,7 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
|
|
18
20
|
from docling.backend.html_backend import HTMLDocumentBackend
|
19
21
|
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
20
22
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
23
|
+
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
|
21
24
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
22
25
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
23
26
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
@@ -157,6 +160,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
157
160
|
InputFormat.XML_JATS: FormatOption(
|
158
161
|
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
|
159
162
|
),
|
163
|
+
InputFormat.METS_GBS: FormatOption(
|
164
|
+
pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
|
165
|
+
),
|
160
166
|
InputFormat.IMAGE: FormatOption(
|
161
167
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
|
162
168
|
),
|
@@ -275,6 +281,34 @@ class DocumentConverter:
|
|
275
281
|
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
276
282
|
)
|
277
283
|
|
284
|
+
@validate_call(config=ConfigDict(strict=True))
def convert_string(
    self,
    content: str,
    format: InputFormat,
    name: Optional[str] = None,
) -> ConversionResult:
    """Convert in-memory text by wrapping it in a named document stream.

    Args:
        content: The document text; it is encoded as UTF-8.
        format: Format of ``content``. Only Markdown and HTML are accepted.
        name: Name for the stream. When empty or omitted, a timestamp is
            used. The extension matching ``format`` is appended if missing.

    Returns:
        The result of ``self.convert`` on the generated stream.

    Raises:
        ValueError: If ``format`` is neither Markdown nor HTML.
    """
    suffix_by_format = {
        InputFormat.MD: ".md",
        InputFormat.HTML: ".html",
    }
    suffix = suffix_by_format.get(format)
    if suffix is None:
        raise ValueError(f"format {format} is not supported in `convert_string`")

    name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    if not name.endswith(suffix):
        name += suffix

    buff = BytesIO(content.encode("utf-8"))
    doc_stream = DocumentStream(name=name, stream=buff)

    return self.convert(doc_stream)
|
311
|
+
|
278
312
|
def _convert(
|
279
313
|
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
280
314
|
) -> Iterator[ConversionResult]:
|
docling/models/api_vlm_model.py
CHANGED
@@ -53,11 +53,7 @@ class ApiVlmModel(BasePageModel):
|
|
53
53
|
if hi_res_image.mode != "RGB":
|
54
54
|
hi_res_image = hi_res_image.convert("RGB")
|
55
55
|
|
56
|
-
|
57
|
-
prompt = self.vlm_options.prompt(page.parsed_page)
|
58
|
-
else:
|
59
|
-
prompt = self.vlm_options.prompt
|
60
|
-
|
56
|
+
prompt = self.vlm_options.build_prompt(page.parsed_page)
|
61
57
|
page_tags = api_image_request(
|
62
58
|
image=hi_res_image,
|
63
59
|
prompt=prompt,
|
@@ -67,6 +63,7 @@ class ApiVlmModel(BasePageModel):
|
|
67
63
|
**self.params,
|
68
64
|
)
|
69
65
|
|
66
|
+
page_tags = self.vlm_options.decode_response(page_tags)
|
70
67
|
page.predictions.vlm_response = VlmPrediction(text=page_tags)
|
71
68
|
|
72
69
|
return page
|
@@ -135,10 +135,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
135
135
|
)
|
136
136
|
|
137
137
|
# Define prompt structure
|
138
|
-
|
139
|
-
user_prompt = self.vlm_options.prompt(page.parsed_page)
|
140
|
-
else:
|
141
|
-
user_prompt = self.vlm_options.prompt
|
138
|
+
user_prompt = self.vlm_options.build_prompt(page.parsed_page)
|
142
139
|
prompt = self.formulate_prompt(user_prompt)
|
143
140
|
|
144
141
|
inputs = self.processor(
|
@@ -166,6 +163,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
166
163
|
_log.debug(
|
167
164
|
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
|
168
165
|
)
|
166
|
+
generated_texts = self.vlm_options.decode_response(generated_texts)
|
169
167
|
page.predictions.vlm_response = VlmPrediction(
|
170
168
|
text=generated_texts,
|
171
169
|
generation_time=generation_time,
|
@@ -35,9 +35,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
35
35
|
|
36
36
|
if self.enabled:
|
37
37
|
try:
|
38
|
-
from mlx_vlm import generate, load # type: ignore
|
38
|
+
from mlx_vlm import generate, load, stream_generate # type: ignore
|
39
39
|
from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
|
40
|
-
from mlx_vlm.utils import load_config
|
40
|
+
from mlx_vlm.utils import load_config # type: ignore
|
41
41
|
except ImportError:
|
42
42
|
raise ImportError(
|
43
43
|
"mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
|
@@ -84,10 +84,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
84
84
|
if hi_res_image.mode != "RGB":
|
85
85
|
hi_res_image = hi_res_image.convert("RGB")
|
86
86
|
|
87
|
-
|
88
|
-
user_prompt = self.vlm_options.prompt(page.parsed_page)
|
89
|
-
else:
|
90
|
-
user_prompt = self.vlm_options.prompt
|
87
|
+
user_prompt = self.vlm_options.build_prompt(page.parsed_page)
|
91
88
|
prompt = self.apply_chat_template(
|
92
89
|
self.processor, self.config, user_prompt, num_images=1
|
93
90
|
)
|
@@ -142,6 +139,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
142
139
|
_log.debug(
|
143
140
|
f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
|
144
141
|
)
|
142
|
+
page_tags = self.vlm_options.decode_response(page_tags)
|
145
143
|
page.predictions.vlm_response = VlmPrediction(
|
146
144
|
text=page_tags,
|
147
145
|
generation_time=generation_time,
|