docling-2.69.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
docling/backend/mets_gbs_backend.py
@@ -0,0 +1,399 @@
"""Backend for GBS Google Books schema."""

import logging
import tarfile
from collections.abc import Iterable
from dataclasses import dataclass
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union

from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import (
    BoundingRectangle,
    PdfPageBoundaryType,
    PdfPageGeometry,
    SegmentedPdfPage,
    TextCell,
)
from lxml import etree
from PIL import Image
from PIL.Image import Image as PILImage

from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import InputFormat

if TYPE_CHECKING:
    from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)


def _get_pdf_page_geometry(
    size: Size,
) -> PdfPageGeometry:
    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX

    bbox_tuple = (0, 0, size.width, size.height)
    bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.TOPLEFT)

    return PdfPageGeometry(
        angle=0.0,
        rect=BoundingRectangle.from_bounding_box(bbox),
        boundary_type=boundary_type,
        art_bbox=bbox,
        bleed_bbox=bbox,
        crop_bbox=bbox,
        media_bbox=bbox,
        trim_bbox=bbox,
    )


class MetsGbsPageBackend(PdfPageBackend):
    def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage):
        self._im = page_im
        self._dpage = parsed_page
        self.valid = parsed_page is not None

    def is_valid(self) -> bool:
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        for i, cell in enumerate(self._dpage.textline_cells):
            cell_bbox = (
                cell.rect.to_bounding_box()
                .to_top_left_origin(page_height=page_size.height)
                .scaled(scale)
            )

            overlap_frac = cell_bbox.intersection_over_self(bbox)

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell.text

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 0  # 32 * 32

        images = self._dpage.bitmap_resources

        for img in images:
            cropbox = img.rect.to_bounding_box().to_top_left_origin(
                self.get_size().height
            )

            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        page_size = self.get_size()
        assert (
            page_size.width == self._im.size[0] and page_size.height == self._im.size[1]
        )

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )

        image = self._im.resize(
            size=(round(page_size.width * scale), round(page_size.height * scale))
        ).crop(cropbox.scaled(scale=scale).as_tuple())
        return image

    def get_size(self) -> Size:
        return Size(
            width=self._dpage.dimension.width, height=self._dpage.dimension.height
        )

    def unload(self) -> None:
        if hasattr(self, "_im"):
            delattr(self, "_im")
        if hasattr(self, "_dpage"):
            delattr(self, "_dpage")


class _UseType(str, Enum):
    IMAGE = "image"
    OCR = "OCR"
    COORD_OCR = "coordOCR"


@dataclass
class _FileInfo:
    file_id: str
    mimetype: str
    path: str
    use: _UseType


@dataclass
class _PageFiles:
    image: Optional[_FileInfo] = None
    ocr: Optional[_FileInfo] = None
    coordOCR: Optional[_FileInfo] = None


def _extract_rect(title_str: str) -> Optional[BoundingRectangle]:
    """
    Extracts bbox from title string like 'bbox 279 177 306 214;x_wconf 97'
    """
    parts = title_str.split(";")
    for part in parts:
        part = part.strip()
        if part.startswith("bbox "):
            try:
                coords = part.split()[1:]
                rect = BoundingRectangle.from_bounding_box(
                    bbox=BoundingBox.from_tuple(
                        tuple(map(int, coords)), origin=CoordOrigin.TOPLEFT
                    )
                )
                return rect
            except Exception:
                return None
    return None


def _extract_confidence(title_str) -> float:
    """Extracts x_wconf (OCR confidence) value from title string."""
    for part in title_str.split(";"):
        part = part.strip()
        if part.startswith("x_wconf"):
            try:
                return float(part.split()[1]) / 100.0
            except Exception:
                return 1
    return 1


class MetsGbsDocumentBackend(PdfDocumentBackend):
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

        self._tar: tarfile.TarFile = (
            tarfile.open(name=self.path_or_stream, mode="r:gz")
            if isinstance(self.path_or_stream, Path)
            else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
        )
        self.root_mets: Optional[etree._Element] = None
        self.page_map: Dict[int, _PageFiles] = {}

        for member in self._tar.getmembers():
            if member.name.endswith(".xml"):
                file = self._tar.extractfile(member)
                if file is not None:
                    content = file.read()
                    self.root_mets = self._validate_mets_xml(content)
                    if self.root_mets is not None:
                        break

        if self.root_mets is None:
            raise RuntimeError(
                f"METS GBS backend could not load document {self.document_hash}."
            )

        ns = {
            "mets": "http://www.loc.gov/METS/",
            "xlink": "http://www.w3.org/1999/xlink",
            "xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "gbs": "http://books.google.com/gbs",
            "premis": "info:lc/xmlns/premis-v2",
            "marc": "http://www.loc.gov/MARC21/slim",
        }

        file_info_by_id: Dict[str, _FileInfo] = {}

        for filegrp in self.root_mets.xpath(".//mets:fileGrp", namespaces=ns):
            use_raw = filegrp.get("USE")
            try:
                use = _UseType(use_raw)
            except ValueError:
                continue  # Ignore unknown USE types

            for file_elem in filegrp.xpath("./mets:file", namespaces=ns):
                file_id = file_elem.get("ID")
                mimetype = file_elem.get("MIMETYPE")
                flocat_elem = file_elem.find("mets:FLocat", namespaces=ns)
                href = (
                    flocat_elem.get("{http://www.w3.org/1999/xlink}href")
                    if flocat_elem is not None
                    else None
                )
                if href is None:
                    continue

                file_info_by_id[file_id] = _FileInfo(
                    file_id=file_id, mimetype=mimetype, path=href, use=use
                )

        USE_TO_ATTR = {
            _UseType.IMAGE: "image",
            _UseType.OCR: "ocr",
            _UseType.COORD_OCR: "coordOCR",
        }

        for div in self.root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns):
            order_str = div.get("ORDER")
            if not order_str:
                continue
            try:
                page_no = int(order_str) - 1  # make 0-index pages
            except ValueError:
                continue

            page_files = _PageFiles()

            for fptr in div.xpath("./mets:fptr", namespaces=ns):
                file_id = fptr.get("FILEID")
                file_info = file_info_by_id.get(file_id)

                if file_info:
                    attr = USE_TO_ATTR.get(file_info.use)
                    if attr:
                        setattr(page_files, attr, file_info)

            self.page_map[page_no] = page_files

    def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]:
        root: etree._Element = etree.fromstring(xml_string)
        if (
            root.tag == "{http://www.loc.gov/METS/}mets"
            and root.get("PROFILE") == "gbs"
        ):
            return root

        _log.warning(f"The root element is not <mets:mets> with PROFILE='gbs': {root}")
        return None

    def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
        # TODO: use better fallbacks...
        image_info = self.page_map[page_no].image
        assert image_info is not None
        ocr_info = self.page_map[page_no].coordOCR
        assert ocr_info is not None

        image_file = self._tar.extractfile(image_info.path)
        assert image_file is not None
        buf = BytesIO(image_file.read())
        im: PILImage = Image.open(buf)
        ocr_file = self._tar.extractfile(ocr_info.path)
        assert ocr_file is not None
        ocr_content = ocr_file.read()
        parser = etree.HTMLParser()
        ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)

        line_cells: List[TextCell] = []
        word_cells: List[TextCell] = []

        page_div = ocr_root.xpath("//div[@class='ocr_page']")

        size = Size(width=im.size[0], height=im.size[1])
        if page_div:
            title = page_div[0].attrib.get("title", "")
            rect = _extract_rect(title)
            if rect:
                size = Size(width=rect.width, height=rect.height)
        else:
            _log.error(f"Could not find ocr_page for page {page_no}")

        im = im.resize(size=(round(size.width), round(size.height)))
        im = im.convert("RGB")

        # Extract all ocrx_word spans
        for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
            text = "".join(word.itertext()).strip()
            title = word.attrib.get("title", "")
            rect = _extract_rect(title)
            conf = _extract_confidence(title)
            if rect:
                word_cells.append(
                    TextCell(
                        index=ix,
                        text=text,
                        orig=text,
                        rect=rect,
                        from_ocr=True,
                        confidence=conf,
                    )
                )

        # Extract all ocr_line spans
        # line: etree._Element
        for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
            text = "".join(line.itertext()).strip()
            title = line.attrib.get("title", "")
            rect = _extract_rect(title)
            conf = _extract_confidence(title)
            if rect:
                line_cells.append(
                    TextCell(
                        index=ix,
                        text=text,
                        orig=text,
                        rect=rect,
                        from_ocr=True,
                        confidence=conf,
                    )
                )

        page = SegmentedPdfPage(
            dimension=_get_pdf_page_geometry(size),
            textline_cells=line_cells,
            char_cells=[],
            word_cells=word_cells,
            has_textlines=True,
            has_words=True,
            has_chars=False,
        )
        return page, im

    def page_count(self) -> int:
        return len(self.page_map)

    def load_page(self, page_no: int) -> MetsGbsPageBackend:
        # TODO: is this thread-safe?
        page, im = self._parse_page(page_no)
        return MetsGbsPageBackend(parsed_page=page, page_im=im)

    def is_valid(self) -> bool:
        return self.root_mets is not None and self.page_count() > 0

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.METS_GBS}

    @classmethod
    def supports_pagination(cls) -> bool:
        return True

    def unload(self) -> None:
        super().unload()
        self._tar.close()