docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import random
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import List, Optional, Union
|
|
7
|
+
|
|
8
|
+
import pypdfium2 as pdfium
|
|
9
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
|
10
|
+
from docling_core.types.doc.page import (
|
|
11
|
+
BoundingRectangle,
|
|
12
|
+
SegmentedPdfPage,
|
|
13
|
+
TextCell,
|
|
14
|
+
)
|
|
15
|
+
from docling_parse.pdf_parsers import pdf_parser_v1
|
|
16
|
+
from PIL import Image, ImageDraw
|
|
17
|
+
from pypdfium2 import PdfPage
|
|
18
|
+
|
|
19
|
+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
20
|
+
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
|
|
21
|
+
from docling.datamodel.document import InputDocument
|
|
22
|
+
|
|
23
|
+
# Module-level logger, named after this module via __name__.
_log = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DoclingParsePageBackend(PdfPageBackend):
    """Page backend pairing docling-parse (v1) page data with a pypdfium2 page.

    Text cells and bitmap rectangles are read from the docling-parse result
    for the page; page size and rendering are delegated to the pypdfium2
    ``PdfPage`` object.
    """

    def __init__(
        self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        """Parse one page through ``parser`` and keep the pypdfium2 page handle.

        Args:
            parser: docling-parse parser that already holds the document.
            document_hash: Key under which the document is known to ``parser``.
            page_no: Page number handed to the parser.
            page_obj: pypdfium2 page used for size queries and rendering.
        """
        self._ppage = page_obj
        # Always defined, so accessors on an invalid page never hit a missing
        # attribute (they check ``self.valid`` before reading it).
        self._dpage = None
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        # The page counts as valid only if the parser returned a "pages" entry.
        self.valid = "pages" in parsed_page
        if self.valid:
            self._dpage = parsed_page["pages"][0]
        else:
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        """Return whether docling-parse produced data for this page."""
        return self.valid

    def _compute_text_cells(self) -> List[TextCell]:
        """Compute text cells from docling-parse data.

        Returns:
            One ``TextCell`` per parsed cell, in parser order. Rectangles are
            rescaled from the parser's page dimensions to the pypdfium2 page
            size and converted to a top-left origin. Empty for invalid pages.
        """
        cells: List[TextCell] = []

        if not self.valid:
            return cells

        page_size = self.get_size()

        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        for index, cell in enumerate(self._dpage["cells"]):
            x0, y0, x1, y1 = cell["box"]["device"]

            # Reorder flipped coordinates so that x0 <= x1 and y0 <= y1.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = cell["content"]["rnormalized"]
            cells.append(
                TextCell(
                    index=index,
                    text=text_piece,
                    orig=text_piece,
                    from_ocr=False,
                    rect=BoundingRectangle.from_bounding_box(
                        BoundingBox(
                            l=x0 * page_size.width / parser_width,
                            b=y0 * page_size.height / parser_height,
                            r=x1 * page_size.width / parser_width,
                            t=y1 * page_size.height / parser_height,
                            coord_origin=CoordOrigin.BOTTOMLEFT,
                        )
                    ).to_top_left_origin(page_size.height),
                )
            )

        return cells

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the space-joined text of the cells lying mostly inside ``bbox``.

        A cell contributes when more than half of its own area overlaps
        ``bbox`` (``intersection_over_self > 0.5``). Returns ``""`` for
        invalid pages.
        """
        if not self.valid:
            return ""
        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        for cell in self._dpage["cells"]:
            x0, y0, x1, y1 = cell["box"]["device"]
            cell_bbox = BoundingBox(
                l=x0 * scale * page_size.width / parser_width,
                b=y0 * scale * page_size.height / parser_height,
                r=x1 * scale * page_size.width / parser_width,
                t=y1 * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            overlap_frac = cell_bbox.intersection_over_self(bbox)

            if overlap_frac > 0.5:
                # Separator is added only once some text has been collected.
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell["content"]["rnormalized"]

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Build a ``SegmentedPdfPage`` holding only text-line cells.

        Returns:
            ``None`` for invalid pages; otherwise a page whose
            ``textline_cells`` come from ``_compute_text_cells`` and whose
            char/word cell lists are empty.
        """
        if not self.valid:
            return None

        text_cells = self._compute_text_cells()

        # Get the PDF page geometry from pypdfium2
        dimension = get_pdf_page_geometry(self._ppage)

        # Create SegmentedPdfPage
        return SegmentedPdfPage(
            dimension=dimension,
            textline_cells=text_cells,
            char_cells=[],
            word_cells=[],
            has_lines=len(text_cells) > 0,
            has_words=False,
            has_chars=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the text cells of the page (empty for invalid pages)."""
        return self._compute_text_cells()

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the bounding boxes of the page's bitmap images.

        Boxes are converted to a top-left origin and multiplied by ``scale``.
        Nothing is yielded for invalid pages, which carry no parser data.
        """
        if not self.valid:
            return

        AREA_THRESHOLD = 0  # 32 * 32

        # The page height does not change while iterating; query it once.
        page_height = self.get_size().height

        for bitmap in self._dpage["images"]:
            cropbox = BoundingBox.from_tuple(
                bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(page_height)

            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or the ``cropbox`` region of it, as a PIL image.

        Args:
            scale: Output scale relative to the page size.
            cropbox: Region to render; the whole page when omitted.

        Returns:
            An image of ``round(cropbox.width * scale)`` by
            ``round(cropbox.height * scale)`` pixels.
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Turn the crop region into distances from the page's right and
            # top edges, which is what is passed on as ``crop`` below.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page width and height as reported by pypdfium2."""
        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the pypdfium2 page and the parsed page data."""
        self._ppage = None
        self._dpage = None
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend that opens a PDF with both pypdfium2 and docling-parse (v1)."""

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the document in pypdfium2 and load it into a docling-parse parser.

        Args:
            in_doc: Input document descriptor, forwarded to the base class.
            path_or_stream: PDF source, either a ``BytesIO`` or a ``Path``.

        Raises:
            RuntimeError: If docling-parse could not load the document.
        """
        super().__init__(in_doc, path_or_stream)

        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v1()

        success = False
        if isinstance(self.path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
                self.document_hash, self.path_or_stream
            )
        elif isinstance(self.path_or_stream, Path):
            success = self.parser.load_document(
                self.document_hash, str(self.path_or_stream)
            )

        if not success:
            # Release the pdfium handle opened above; otherwise it stays open
            # on a backend object the caller never receives.
            self._pdoc.close()
            self._pdoc = None
            raise RuntimeError(
                f"docling-parse could not load document with hash {self.document_hash}."
            )

    def page_count(self) -> int:
        """Return the number of pages reported by pypdfium2."""
        return len(self._pdoc)  # To be replaced with docling-parse API

    def load_page(self, page_no: int) -> DoclingParsePageBackend:
        """Create the page backend for page ``page_no``."""
        return DoclingParsePageBackend(
            self.parser, self.document_hash, page_no, self._pdoc[page_no]
        )

    def is_valid(self) -> bool:
        """Return whether the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Unload the document from the parser and close the pypdfium2 document."""
        super().unload()
        self.parser.unload_document(self.document_hash)
        # Guarded so a repeated unload does not call close() on None.
        if self._pdoc is not None:
            self._pdoc.close()
            self._pdoc = None
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import random
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
|
7
|
+
|
|
8
|
+
import pypdfium2 as pdfium
|
|
9
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
10
|
+
from docling_core.types.doc.page import (
|
|
11
|
+
BoundingRectangle,
|
|
12
|
+
PdfPageBoundaryType,
|
|
13
|
+
PdfPageGeometry,
|
|
14
|
+
SegmentedPdfPage,
|
|
15
|
+
TextCell,
|
|
16
|
+
)
|
|
17
|
+
from docling_parse.pdf_parsers import pdf_parser_v2
|
|
18
|
+
from PIL import Image, ImageDraw
|
|
19
|
+
from pypdfium2 import PdfPage
|
|
20
|
+
|
|
21
|
+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
22
|
+
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
|
|
23
|
+
from docling.datamodel.base_models import Size
|
|
24
|
+
from docling.utils.locks import pypdfium2_lock
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from docling.datamodel.document import InputDocument
|
|
28
|
+
|
|
29
|
+
# Module-level logger, named after this module via __name__.
_log = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class DoclingParseV2PageBackend(PdfPageBackend):
    """Page backend pairing docling-parse v2 page data with a pypdfium2 page.

    Text cells and bitmap rectangles are read from the ``"sanitized"`` section
    of the docling-parse result, where rows are lists addressed through a
    ``"header"`` list of column names. Page size and rendering are delegated
    to the pypdfium2 ``PdfPage`` object under ``pypdfium2_lock``.
    """

    def __init__(
        self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        """Parse one page through ``parser`` and keep the pypdfium2 page handle.

        Args:
            parser: docling-parse v2 parser that already holds the document.
            document_hash: Key under which the document is known to ``parser``.
            page_no: Page number handed to the parser.
            page_obj: pypdfium2 page used for size queries and rendering.
        """
        self._ppage = page_obj
        # Always defined, so accessors on an invalid page never hit a missing
        # attribute (they check ``self.valid`` before reading it).
        self._dpage = None
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        # Valid only when the parser returned exactly one page entry.
        self.valid = "pages" in parsed_page and len(parsed_page["pages"]) == 1
        if self.valid:
            self._dpage = parsed_page["pages"][0]
        else:
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        """Return whether docling-parse produced data for this page."""
        return self.valid

    def _compute_text_cells(self) -> List[TextCell]:
        """Compute text cells from docling-parse v2 data.

        Returns:
            One ``TextCell`` per parsed cell row, in parser order. Rectangles
            are rescaled from the parser's page dimensions to the pypdfium2
            page size and converted to a top-left origin. Empty for invalid
            pages.
        """
        cells: List[TextCell] = []

        if not self.valid:
            return cells

        page_size = self.get_size()

        parser_width = self._dpage["sanitized"]["dimension"]["width"]
        parser_height = self._dpage["sanitized"]["dimension"]["height"]

        cells_data = self._dpage["sanitized"]["cells"]["data"]
        cells_header = self._dpage["sanitized"]["cells"]["header"]

        # With no rows the header is never consulted.
        if not cells_data:
            return cells

        # Column positions are the same for every row; resolve them once.
        x0_col = cells_header.index("x0")
        y0_col = cells_header.index("y0")
        x1_col = cells_header.index("x1")
        y1_col = cells_header.index("y1")
        text_col = cells_header.index("text")

        for index, cell_data in enumerate(cells_data):
            x0 = cell_data[x0_col]
            y0 = cell_data[y0_col]
            x1 = cell_data[x1_col]
            y1 = cell_data[y1_col]

            # Reorder flipped coordinates so that x0 <= x1 and y0 <= y1.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = cell_data[text_col]
            cells.append(
                TextCell(
                    index=index,
                    text=text_piece,
                    orig=text_piece,
                    from_ocr=False,
                    rect=BoundingRectangle.from_bounding_box(
                        BoundingBox(
                            l=x0 * page_size.width / parser_width,
                            b=y0 * page_size.height / parser_height,
                            r=x1 * page_size.width / parser_width,
                            t=y1 * page_size.height / parser_height,
                            coord_origin=CoordOrigin.BOTTOMLEFT,
                        )
                    ).to_top_left_origin(page_size.height),
                )
            )

        return cells

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the space-joined text of the cells lying mostly inside ``bbox``.

        A cell contributes when more than half of its own area overlaps
        ``bbox`` (``intersection_over_self > 0.5``). Returns ``""`` for
        invalid pages.
        """
        if not self.valid:
            return ""
        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()

        parser_width = self._dpage["sanitized"]["dimension"]["width"]
        parser_height = self._dpage["sanitized"]["dimension"]["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        cells_data = self._dpage["sanitized"]["cells"]["data"]
        cells_header = self._dpage["sanitized"]["cells"]["header"]

        # With no rows the header is never consulted.
        if not cells_data:
            return text_piece

        # Column positions are the same for every row; resolve them once.
        x0_col = cells_header.index("x0")
        y0_col = cells_header.index("y0")
        x1_col = cells_header.index("x1")
        y1_col = cells_header.index("y1")

        for cell_data in cells_data:
            x0 = cell_data[x0_col]
            y0 = cell_data[y0_col]
            x1 = cell_data[x1_col]
            y1 = cell_data[y1_col]

            cell_bbox = BoundingBox(
                l=x0 * scale * page_size.width / parser_width,
                b=y0 * scale * page_size.height / parser_height,
                r=x1 * scale * page_size.width / parser_width,
                t=y1 * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            overlap_frac = cell_bbox.intersection_over_self(bbox)

            if overlap_frac > 0.5:
                # Separator is added only once some text has been collected.
                if len(text_piece) > 0:
                    text_piece += " "
                # The "text" column is looked up only for matching cells.
                text_piece += cell_data[cells_header.index("text")]

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Build a ``SegmentedPdfPage`` holding only text-line cells.

        Returns:
            ``None`` for invalid pages; otherwise a page whose
            ``textline_cells`` come from ``_compute_text_cells`` and whose
            char/word cell lists are empty.
        """
        if not self.valid:
            return None

        text_cells = self._compute_text_cells()

        # Get the PDF page geometry from pypdfium2
        dimension = get_pdf_page_geometry(self._ppage)

        # Create SegmentedPdfPage
        return SegmentedPdfPage(
            dimension=dimension,
            textline_cells=text_cells,
            char_cells=[],
            word_cells=[],
            # The flag is named ``has_lines`` (as the v1 backend passes it);
            # ``has_textlines`` is not a model field, so it set nothing.
            has_lines=len(text_cells) > 0,
            has_words=False,
            has_chars=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the text cells of the page (empty for invalid pages)."""
        return self._compute_text_cells()

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the bounding boxes of the page's bitmap images.

        Boxes are converted to a top-left origin and multiplied by ``scale``.
        Nothing is yielded for invalid pages, which carry no parser data.
        """
        if not self.valid:
            return

        AREA_THRESHOLD = 0  # 32 * 32

        images = self._dpage["sanitized"]["images"]["data"]
        images_header = self._dpage["sanitized"]["images"]["header"]

        # With no rows the header is never consulted.
        if not images:
            return

        # Column positions are the same for every row; resolve them once.
        x0_col = images_header.index("x0")
        y0_col = images_header.index("y0")
        x1_col = images_header.index("x1")
        y1_col = images_header.index("y1")

        # The page height does not change while iterating; query it once.
        page_height = self.get_size().height

        for row in images:
            cropbox = BoundingBox.from_tuple(
                (row[x0_col], row[y0_col], row[x1_col], row[y1_col]),
                origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height)

            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or the ``cropbox`` region of it, as a PIL image.

        Args:
            scale: Output scale relative to the page size.
            cropbox: Region to render; the whole page when omitted.

        Returns:
            An image of ``round(cropbox.width * scale)`` by
            ``round(cropbox.height * scale)`` pixels.
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Turn the crop region into distances from the page's right and
            # top edges, which is what is passed on as ``crop`` below.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        with pypdfium2_lock:
            image = (
                self._ppage.render(
                    scale=scale * 1.5,
                    rotation=0,  # no additional rotation
                    crop=padbox.as_tuple(),
                )
                .to_pil()
                .resize(
                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
                )
            )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page width and height as reported by pypdfium2."""
        with pypdfium2_lock:
            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the pypdfium2 page and the parsed page data."""
        self._ppage = None
        self._dpage = None
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
class DoclingParseV2DocumentBackend(PdfDocumentBackend):
    """Document backend that opens a PDF with both pypdfium2 and docling-parse v2."""

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the document in pypdfium2 and load it into a docling-parse v2 parser.

        Args:
            in_doc: Input document descriptor, forwarded to the base class.
            path_or_stream: PDF source, either a ``BytesIO`` or a ``Path``.

        Raises:
            RuntimeError: If docling-parse v2 could not load the document.
        """
        super().__init__(in_doc, path_or_stream)

        with pypdfium2_lock:
            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v2("fatal")

        success = False
        if isinstance(self.path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
                self.document_hash, self.path_or_stream
            )
        elif isinstance(self.path_or_stream, Path):
            success = self.parser.load_document(
                self.document_hash, str(self.path_or_stream)
            )

        if not success:
            # Release the pdfium handle opened above; otherwise it stays open
            # on a backend object the caller never receives.
            with pypdfium2_lock:
                self._pdoc.close()
            self._pdoc = None
            raise RuntimeError(
                f"docling-parse v2 could not load document {self.document_hash}."
            )

    def page_count(self) -> int:
        """Return the page count reported by docling-parse.

        The pypdfium2 count is read as well, and a mismatch between the two
        is logged as an error.
        """
        # return len(self._pdoc)  # To be replaced with docling-parse API

        len_1 = len(self._pdoc)
        len_2 = self.parser.number_of_pages(self.document_hash)

        if len_1 != len_2:
            _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")

        return len_2

    def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
        """Create the page backend for page ``page_no`` while holding ``pypdfium2_lock``."""
        with pypdfium2_lock:
            return DoclingParseV2PageBackend(
                self.parser, self.document_hash, page_no, self._pdoc[page_no]
            )

    def is_valid(self) -> bool:
        """Return whether the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Unload the document from the parser and close the pypdfium2 document."""
        super().unload()
        self.parser.unload_document(self.document_hash)
        # Guarded so a repeated unload does not call close() on None.
        if self._pdoc is not None:
            with pypdfium2_lock:
                self._pdoc.close()
            self._pdoc = None
|