docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Set, Union
|
|
5
|
+
|
|
6
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
7
|
+
from docling.datamodel.base_models import InputFormat
|
|
8
|
+
from docling.datamodel.document import InputDocument
|
|
9
|
+
|
|
10
|
+
_log = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class NoOpBackend(AbstractDocumentBackend):
    """
    A no-op backend that only validates input existence.
    Used e.g. for audio files where actual processing is handled by the ASR pipeline.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Initialise the backend and record whether the input looks usable.

        Args:
            in_doc: Input document, forwarded to the base-class initialiser.
            path_or_stream: In-memory stream or filesystem path, forwarded to
                the base-class initialiser.

        After construction ``self.valid`` is ``True`` when the input is a
        non-empty ``BytesIO`` or a ``Path`` that exists, and ``False`` for any
        other input or when the check itself raises.
        """
        super().__init__(in_doc, path_or_stream)

        _log.debug(f"NoOpBackend initialized for: {path_or_stream}")

        # Validate input
        try:
            if isinstance(self.path_or_stream, BytesIO):
                # Measure the stream through a zero-copy view. getvalue()
                # would duplicate the whole payload just to take its length,
                # and the original did so twice (once for the check, once
                # for the log line).
                with self.path_or_stream.getbuffer() as view:
                    stream_length = view.nbytes
                self.valid = stream_length > 0
                _log.debug(f"BytesIO stream length: {stream_length}")
            elif isinstance(self.path_or_stream, Path):
                # Check if file exists
                self.valid = self.path_or_stream.exists()
                _log.debug(f"File exists: {self.valid}")
            else:
                self.valid = False
        except Exception as e:
            # Deliberately best-effort: any failure while probing the input
            # (e.g. a closed stream) marks the backend invalid, not a crash.
            _log.error(f"NoOpBackend validation failed: {e}")
            self.valid = False

    def is_valid(self) -> bool:
        """Return the validity flag computed in ``__init__``."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """This backend never paginates."""
        return False

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return every member of ``InputFormat``."""
        return set(InputFormat)
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
from io import BytesIO
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Set, Union
|
|
6
|
+
|
|
7
|
+
from docling_core.types.doc import BoundingBox, Size
|
|
8
|
+
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
|
9
|
+
from PIL import Image
|
|
10
|
+
|
|
11
|
+
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
|
12
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
13
|
+
from docling.datamodel.base_models import InputFormat
|
|
14
|
+
from docling.datamodel.document import InputDocument
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PdfPageBackend(ABC):
    """Abstract interface for one page of a PDF document.

    This class only declares signatures; all behaviour is supplied by the
    concrete page backends that subclass it.
    """

    @abstractmethod
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text associated with ``bbox`` on this page."""

    @abstractmethod
    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return the page as a ``SegmentedPdfPage``, or ``None``."""

    @abstractmethod
    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the text cells of this page."""

    @abstractmethod
    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Return bounding boxes for the bitmaps on this page.

        Args:
            scale: Scale factor for the returned boxes. The parameter is named
                and typed ``scale: float`` to match the concrete
                implementations; it was previously declared as ``float: int``,
                which shadowed the builtin.
        """

    @abstractmethod
    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return an image of the page.

        Args:
            scale: Scale factor for the produced image.
            cropbox: Optional region to restrict the image to.
        """

    @abstractmethod
    def get_size(self) -> Size:
        """Return the page size."""

    @abstractmethod
    def is_valid(self) -> bool:
        """Return whether this page backend is usable."""

    @abstractmethod
    def unload(self):
        """Release whatever the page backend holds on to."""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class PdfDocumentBackend(PaginatedDocumentBackend):
    """Abstract paginated backend that accepts PDF input only."""

    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
        options: Optional[PdfBackendOptions] = None,
    ):
        """Initialise the backend and reject non-PDF input formats.

        Args:
            in_doc: Input document, forwarded to the base-class initialiser.
            path_or_stream: In-memory stream or filesystem path, forwarded to
                the base-class initialiser.
            options: PDF backend options. When omitted, a fresh
                ``PdfBackendOptions()`` is created for this instance.

        Raises:
            RuntimeError: If ``self.input_format`` is not one of
                ``supported_formats()``.
        """
        # Build the default per call: a default evaluated in the signature is
        # created once and would be shared by every backend constructed
        # without explicit options.
        if options is None:
            options = PdfBackendOptions()

        super().__init__(in_doc, path_or_stream, options)
        # Annotation only: narrows the attribute type for type checkers.
        self.options: PdfBackendOptions

        if self.input_format not in self.supported_formats():
            raise RuntimeError(
                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
            )

    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
        """Return the page backend for page ``page_no``."""

    @abstractmethod
    def page_count(self) -> int:
        """Return the number of pages in the document."""

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the accepted input formats: PDF only."""
        return {InputFormat.PDF}

    @classmethod
    def supports_pagination(cls) -> bool:
        """PDF backends always paginate."""
        return True
|
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import random
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from importlib.metadata import version
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
|
8
|
+
|
|
9
|
+
import pypdfium2 as pdfium
|
|
10
|
+
import pypdfium2.raw as pdfium_c
|
|
11
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
|
12
|
+
from docling_core.types.doc.page import (
|
|
13
|
+
BoundingRectangle,
|
|
14
|
+
PdfPageBoundaryType,
|
|
15
|
+
PdfPageGeometry,
|
|
16
|
+
SegmentedPdfPage,
|
|
17
|
+
TextCell,
|
|
18
|
+
)
|
|
19
|
+
from PIL import Image, ImageDraw
|
|
20
|
+
from pypdfium2 import PdfTextPage
|
|
21
|
+
from pypdfium2._helpers.misc import PdfiumError
|
|
22
|
+
|
|
23
|
+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
24
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
25
|
+
from docling.utils.locks import pypdfium2_lock
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_pdf_page_geometry(
    ppage: pdfium.PdfPage,
    angle: float = 0.0,
    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
) -> PdfPageGeometry:
    """
    Build a PdfPageGeometry describing a pypdfium2 page.

    Args:
        ppage: pypdfium2 PdfPage object
        angle: Page rotation angle in degrees (default: 0.0)
        boundary_type: The boundary type for the page (default: CROP_BOX)

    Returns:
        PdfPageGeometry carrying the page rectangle plus the media, crop,
        art, bleed and trim boxes
    """
    with pypdfium2_lock:
        # Main bounding box (intersection of crop_box and media_box); it also
        # serves as the fallback for any box the page does not define.
        page_bbox = BoundingBox.from_tuple(ppage.get_bbox(), CoordOrigin.BOTTOMLEFT)

        # Raw (x0, y0, x1, y1) tuples from pypdfium2, bottom-left origin,
        # keyed by the PdfPageGeometry field each one populates.
        raw_boxes = {
            "media_bbox": ppage.get_mediabox(),
            "crop_bbox": ppage.get_cropbox(),
            "art_bbox": ppage.get_artbox(),
            "bleed_bbox": ppage.get_bleedbox(),
            "trim_bbox": ppage.get_trimbox(),
        }

        # Convert each defined box; substitute the main bbox for missing ones.
        resolved_boxes = {
            field: (
                BoundingBox.from_tuple(raw, CoordOrigin.BOTTOMLEFT)
                if raw
                else page_bbox
            )
            for field, raw in raw_boxes.items()
        }

        return PdfPageGeometry(
            angle=angle,
            rect=BoundingRectangle.from_bounding_box(page_bbox),
            boundary_type=boundary_type,
            **resolved_boxes,
        )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
if TYPE_CHECKING:
|
|
98
|
+
from docling.datamodel.document import InputDocument
|
|
99
|
+
|
|
100
|
+
_log = logging.getLogger(__name__)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# Resolve pypdfium2 major version
|
|
104
|
+
# pypdfium2 5.x renamed PdfObject.get_pos() -> get_bounds()
|
|
105
|
+
_PYPDFIUM2_MAJOR_VERSION = int(version("pypdfium2").split(".")[0])
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class PyPdfiumPageBackend(PdfPageBackend):
    """Page backend implemented on top of a pypdfium2 ``PdfPage``.

    Calls into pypdfium2 are made while holding ``pypdfium2_lock``. Methods
    that call ``get_size()`` do so outside their own locked sections because
    ``get_size()`` acquires the lock itself.
    """

    def __init__(
        self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
    ):
        """Load page ``page_no`` from ``pdfium_doc``.

        Args:
            pdfium_doc: Open pypdfium2 document to take the page from.
            document_hash: Identifier used only in the log message on failure.
            page_no: Index passed to ``pdfium_doc[...]``.

        A ``PdfiumError`` while loading is logged and leaves the backend
        marked invalid instead of propagating.
        """
        # Note: lock applied by the caller
        self.valid = True  # No better way to tell from pypdfium.
        try:
            self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
        except PdfiumError:
            _log.info(
                f"An exception occurred when loading page {page_no} of document {document_hash}.",
                exc_info=True,
            )
            self.valid = False
        # Text page is created lazily on first text access.
        self.text_page: Optional[PdfTextPage] = None

    def is_valid(self) -> bool:
        """Return whether the page was loaded without a ``PdfiumError``."""
        return self.valid

    def _compute_text_cells(self) -> List[TextCell]:
        """Compute text cells from pypdfium.

        One cell is created per text rectangle reported by the text page
        (converted to top-left origin), then the cells are grouped into rows
        and horizontally adjacent cells of a row are merged. The returned
        cells are re-indexed starting at 1.
        """
        with pypdfium2_lock:
            if not self.text_page:
                self.text_page = self._ppage.get_textpage()

        cells = []

        page_size = self.get_size()

        with pypdfium2_lock:
            # The rect index doubles as the provisional cell index.
            for i in range(self.text_page.count_rects()):
                rect = self.text_page.get_rect(i)
                text_piece = self.text_page.get_text_bounded(*rect)
                x0, y0, x1, y1 = rect
                cells.append(
                    TextCell(
                        index=i,
                        text=text_piece,
                        orig=text_piece,
                        from_ocr=False,
                        rect=BoundingRectangle.from_bounding_box(
                            BoundingBox(
                                l=x0,
                                b=y0,
                                r=x1,
                                t=y1,
                                coord_origin=CoordOrigin.BOTTOMLEFT,
                            )
                        ).to_top_left_origin(page_size.height),
                    )
                )

        # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
        # The cell merging code below is to clean this up.
        def merge_horizontal_cells(
            cells: List[TextCell],
            horizontal_threshold_factor: float = 1.0,
            vertical_threshold_factor: float = 0.5,
        ) -> List[TextCell]:
            """Merge neighbouring cells that sit on the same row.

            Args:
                cells: Cells in the order they were extracted.
                horizontal_threshold_factor: Maximum horizontal gap between
                    two cells, as a multiple of their average height, for
                    them to be merged.
                vertical_threshold_factor: Maximum difference of top and of
                    bottom edges, as a multiple of the current row height,
                    for a cell to join the current row.
            """
            if not cells:
                return []

            def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
                """Split consecutive cells into rows by vertical alignment."""
                rows = []
                first_box = cells[0].rect.to_bounding_box()
                current_row = [cells[0]]
                row_top = first_box.t
                row_bottom = first_box.b
                row_height = first_box.height

                for cell in cells[1:]:
                    cell_box = cell.rect.to_bounding_box()
                    vertical_threshold = row_height * vertical_threshold_factor
                    if (
                        abs(cell_box.t - row_top) <= vertical_threshold
                        and abs(cell_box.b - row_bottom) <= vertical_threshold
                    ):
                        # Same row: grow the row extent to include the cell.
                        current_row.append(cell)
                        row_top = min(row_top, cell_box.t)
                        row_bottom = max(row_bottom, cell_box.b)
                        row_height = row_bottom - row_top
                    else:
                        # New row: restart the extent from this cell.
                        rows.append(current_row)
                        current_row = [cell]
                        row_top = cell_box.t
                        row_bottom = cell_box.b
                        row_height = cell_box.height

                if current_row:
                    rows.append(current_row)

                return rows

            def merge_row(row: List[TextCell]) -> List[TextCell]:
                """Merge runs of horizontally close cells within one row."""
                merged = []
                current_group = [row[0]]

                for cell in row[1:]:
                    prev_cell = current_group[-1]
                    avg_height = (
                        prev_cell.rect.height + cell.rect.to_bounding_box().height
                    ) / 2
                    if (
                        cell.rect.to_bounding_box().l
                        - prev_cell.rect.to_bounding_box().r
                        <= avg_height * horizontal_threshold_factor
                    ):
                        current_group.append(cell)
                    else:
                        merged.append(merge_group(current_group))
                        current_group = [cell]

                if current_group:
                    merged.append(merge_group(current_group))

                return merged

            def merge_group(group: List[TextCell]) -> TextCell:
                """Collapse a group into one cell spanning all its members."""
                if len(group) == 1:
                    return group[0]

                merged_bbox = BoundingBox(
                    l=min(cell.rect.to_bounding_box().l for cell in group),
                    t=min(cell.rect.to_bounding_box().t for cell in group),
                    r=max(cell.rect.to_bounding_box().r for cell in group),
                    b=max(cell.rect.to_bounding_box().b for cell in group),
                )

                assert self.text_page is not None
                # The text is re-read from the text page over the merged box
                # rather than concatenated from the member cells.
                bbox = merged_bbox.to_bottom_left_origin(page_size.height)
                with pypdfium2_lock:
                    merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())

                return TextCell(
                    index=group[0].index,
                    text=merged_text,
                    orig=merged_text,
                    rect=BoundingRectangle.from_bounding_box(merged_bbox),
                    from_ocr=False,
                )

            rows = group_rows(cells)
            merged_cells = [cell for row in rows for cell in merge_row(row)]

            # Renumber after merging so indices are contiguous from 1.
            for i, cell in enumerate(merged_cells, 1):
                cell.index = i

            return merged_cells

        return merge_horizontal_cells(cells)

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the bounding boxes of the image objects on this page.

        Boxes are adjusted for the page rotation reported by pypdfium2,
        converted to top-left origin, and scaled by ``scale``. Boxes whose
        area is not greater than ``AREA_THRESHOLD`` are skipped.

        Args:
            scale: Factor applied to each box via ``BoundingBox.scaled``.
        """
        AREA_THRESHOLD = 0  # 32 * 32
        page_size = self.get_size()

        # Collect every box while holding the lock and yield only after it is
        # released. Yielding inside the ``with`` block would keep the shared
        # lock held for as long as the consumer leaves the generator
        # suspended (or abandons it).
        bitmap_boxes: List[BoundingBox] = []
        with pypdfium2_lock:
            rotation = self._ppage.get_rotation()
            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
                if _PYPDFIUM2_MAJOR_VERSION >= 5:
                    pos = obj.get_bounds()  # pypdfium2 >= 5.x
                else:
                    pos = obj.get_pos()  # pypdfium2 <= 4.x
                if rotation == 90:
                    pos = (
                        pos[1],
                        page_size.height - pos[2],
                        pos[3],
                        page_size.height - pos[0],
                    )
                elif rotation == 180:
                    pos = (
                        page_size.width - pos[2],
                        page_size.height - pos[3],
                        page_size.width - pos[0],
                        page_size.height - pos[1],
                    )
                elif rotation == 270:
                    pos = (
                        page_size.width - pos[3],
                        pos[0],
                        page_size.width - pos[1],
                        pos[2],
                    )

                cropbox = BoundingBox.from_tuple(
                    pos, origin=CoordOrigin.BOTTOMLEFT
                ).to_top_left_origin(page_height=page_size.height)
                if cropbox.area() > AREA_THRESHOLD:
                    bitmap_boxes.append(cropbox.scaled(scale=scale))

        yield from bitmap_boxes

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text pypdfium2 reports inside ``bbox``.

        Args:
            bbox: Region to read; converted to bottom-left origin first when
                it uses a different coordinate origin.
        """
        with pypdfium2_lock:
            if not self.text_page:
                self.text_page = self._ppage.get_textpage()

        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
            bbox = bbox.to_bottom_left_origin(self.get_size().height)

        with pypdfium2_lock:
            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return a ``SegmentedPdfPage`` with text-line cells only.

        Returns:
            ``None`` when the page is not valid; otherwise a page whose
            ``textline_cells`` come from ``_compute_text_cells()`` and whose
            char and word cell lists are empty.
        """
        if not self.valid:
            return None

        text_cells = self._compute_text_cells()

        # Get the PDF page geometry from pypdfium2
        dimension = get_pdf_page_geometry(self._ppage)

        # Create SegmentedPdfPage
        return SegmentedPdfPage(
            dimension=dimension,
            textline_cells=text_cells,
            char_cells=[],
            word_cells=[],
            has_textlines=len(text_cells) > 0,
            has_words=False,
            has_chars=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the merged text cells of this page."""
        return self._compute_text_cells()

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or a region of it, to a PIL image.

        Args:
            scale: Output scale; the result is ``cropbox`` size times
                ``scale`` (rounded).
            cropbox: Region to render. When omitted, the full page is used.

        Returns:
            The rendered image, resized to the requested output size.
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # ``padbox`` is passed as ``crop`` to render(): its r and t are
            # rewritten as distances from the right and top page edges.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        with pypdfium2_lock:
            image = (
                self._ppage.render(
                    scale=scale * 1.5,
                    rotation=0,  # no additional rotation
                    crop=padbox.as_tuple(),
                )
                .to_pil()
                .resize(
                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
                )
            )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page width and height as reported by pypdfium2."""
        with pypdfium2_lock:
            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the page and its text page."""
        self._ppage = None
        self.text_page = None
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
class PyPdfiumDocumentBackend(PdfDocumentBackend):
    """PDF document backend that opens the document with pypdfium2."""

    def __init__(
        self,
        in_doc: "InputDocument",
        path_or_stream: Union[BytesIO, Path],
        options: Optional[PdfBackendOptions] = None,
    ):
        """Open the document with pypdfium2.

        Args:
            in_doc: Input document, forwarded to the base-class initialiser.
            path_or_stream: In-memory stream or filesystem path of the PDF.
            options: PDF backend options; its ``password``, when set, is used
                to open the document. When omitted, a fresh
                ``PdfBackendOptions()`` is created for this instance.

        Raises:
            RuntimeError: If pypdfium2 raises ``PdfiumError`` while opening
                the document (chained from the original error).
        """
        # Build the default per call so that backends constructed without
        # explicit options do not share one options object.
        if options is None:
            options = PdfBackendOptions()

        super().__init__(in_doc, path_or_stream, options)

        password = (
            self.options.password.get_secret_value() if self.options.password else None
        )
        try:
            with pypdfium2_lock:
                self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
        except PdfiumError as e:
            raise RuntimeError(
                f"pypdfium could not load document with hash {self.document_hash}"
            ) from e

    def page_count(self) -> int:
        """Return the number of pages in the opened document."""
        with pypdfium2_lock:
            return len(self._pdoc)

    def load_page(self, page_no: int) -> PyPdfiumPageBackend:
        """Return a page backend for page ``page_no``.

        The page is constructed while holding ``pypdfium2_lock``.
        """
        with pypdfium2_lock:
            return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)

    def is_valid(self) -> bool:
        """Return ``True`` when the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Run the base-class unload, then close the pypdfium2 document."""
        super().unload()
        with pypdfium2_lock:
            # Guard against a repeated unload(): the handle is already None
            # after the first call and must not be closed again.
            if self._pdoc is not None:
                self._pdoc.close()
                self._pdoc = None
|