docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,51 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Set, Union
5
+
6
+ from docling.backend.abstract_backend import AbstractDocumentBackend
7
+ from docling.datamodel.base_models import InputFormat
8
+ from docling.datamodel.document import InputDocument
9
+
10
+ _log = logging.getLogger(__name__)
11
+
12
+
13
class NoOpBackend(AbstractDocumentBackend):
    """
    A no-op backend that only validates input existence.
    Used e.g. for audio files where actual processing is handled by the ASR pipeline.

    The validation outcome is stored in ``self.valid`` and reported by
    ``is_valid()``; no document content is parsed here.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Record whether the given input looks usable.

        Args:
            in_doc: The input document descriptor, forwarded to the base class.
            path_or_stream: Either an in-memory stream or a filesystem path.
                A stream is valid when it holds at least one byte, a path is
                valid when it exists; any other type is marked invalid.
        """
        super().__init__(in_doc, path_or_stream)

        _log.debug(f"NoOpBackend initialized for: {path_or_stream}")

        # Validate input
        try:
            if isinstance(self.path_or_stream, BytesIO):
                # Measure the stream once and reuse the result for both the
                # validity flag and the debug message, instead of calling
                # getvalue() a second time just for logging.
                stream_length = len(self.path_or_stream.getvalue())
                self.valid = stream_length > 0
                _log.debug(f"BytesIO stream length: {stream_length}")
            elif isinstance(self.path_or_stream, Path):
                # Check if file exists
                self.valid = self.path_or_stream.exists()
                _log.debug(f"File exists: {self.valid}")
            else:
                self.valid = False
        except Exception as e:
            # Deliberately best-effort: any failure while probing the input
            # (e.g. a closed stream) marks the backend invalid rather than
            # propagating.
            _log.error(f"NoOpBackend validation failed: {e}")
            self.valid = False

    def is_valid(self) -> bool:
        """Return the validation result computed in ``__init__``."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not expose pages."""
        return False

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return every member of ``InputFormat``."""
        return set(InputFormat)
@@ -0,0 +1,82 @@
1
+ from abc import ABC, abstractmethod
2
+ from collections.abc import Iterable
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import Optional, Set, Union
6
+
7
+ from docling_core.types.doc import BoundingBox, Size
8
+ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
9
+ from PIL import Image
10
+
11
+ from docling.backend.abstract_backend import PaginatedDocumentBackend
12
+ from docling.datamodel.backend_options import PdfBackendOptions
13
+ from docling.datamodel.base_models import InputFormat
14
+ from docling.datamodel.document import InputDocument
15
+
16
+
17
class PdfPageBackend(ABC):
    """Abstract interface that every single-page PDF backend implements."""

    @abstractmethod
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the page text associated with the rectangle ``bbox``."""
        pass

    @abstractmethod
    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return the segmented representation of the page, or None."""
        pass

    @abstractmethod
    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the text cells of the page."""
        pass

    @abstractmethod
    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Return bounding boxes of bitmap regions on the page.

        Args:
            scale: Scale factor for the returned boxes. The parameter was
                previously declared as ``float: int``, which shadowed the
                builtin and did not match the ``scale`` keyword used by the
                concrete pypdfium2 implementation.
        """
        pass

    @abstractmethod
    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return an image of the page.

        Args:
            scale: Scale factor for the rendered image.
            cropbox: Optional region to render instead of the full page.
        """
        pass

    @abstractmethod
    def get_size(self) -> Size:
        """Return the page size."""
        pass

    @abstractmethod
    def is_valid(self) -> bool:
        """Return whether the page backend is usable."""
        pass

    @abstractmethod
    def unload(self):
        """Release the resources held for this page."""
        pass
51
+
52
+
53
class PdfDocumentBackend(PaginatedDocumentBackend):
    """Abstract base for paginated backends that only accept PDF input."""

    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
        options: Optional[PdfBackendOptions] = None,
    ):
        """Initialise the backend and reject non-PDF input.

        Args:
            in_doc: The input document descriptor.
            path_or_stream: In-memory stream or filesystem path of the PDF.
            options: Backend options. When omitted, a fresh
                ``PdfBackendOptions()`` is created for this instance.

        Raises:
            RuntimeError: If ``self.input_format`` is not one of
                ``supported_formats()``.
        """
        # A default written as ``PdfBackendOptions()`` in the signature is
        # evaluated once and shared by every instance; build it per call so
        # one backend can never observe another backend's option changes.
        if options is None:
            options = PdfBackendOptions()

        super().__init__(in_doc, path_or_stream, options)
        self.options: PdfBackendOptions

        if self.input_format not in self.supported_formats():
            raise RuntimeError(
                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
            )

    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
        """Return the page backend for page ``page_no``."""
        pass

    @abstractmethod
    def page_count(self) -> int:
        """Return the number of pages in the document."""
        pass

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the accepted input formats: only ``InputFormat.PDF``."""
        return {InputFormat.PDF}

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return True: PDF backends expose pages."""
        return True
@@ -0,0 +1,417 @@
1
+ import logging
2
+ import random
3
+ from collections.abc import Iterable
4
+ from importlib.metadata import version
5
+ from io import BytesIO
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, List, Optional, Union
8
+
9
+ import pypdfium2 as pdfium
10
+ import pypdfium2.raw as pdfium_c
11
+ from docling_core.types.doc import BoundingBox, CoordOrigin, Size
12
+ from docling_core.types.doc.page import (
13
+ BoundingRectangle,
14
+ PdfPageBoundaryType,
15
+ PdfPageGeometry,
16
+ SegmentedPdfPage,
17
+ TextCell,
18
+ )
19
+ from PIL import Image, ImageDraw
20
+ from pypdfium2 import PdfTextPage
21
+ from pypdfium2._helpers.misc import PdfiumError
22
+
23
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
24
+ from docling.datamodel.backend_options import PdfBackendOptions
25
+ from docling.utils.locks import pypdfium2_lock
26
+
27
+
28
def get_pdf_page_geometry(
    ppage: pdfium.PdfPage,
    angle: float = 0.0,
    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
) -> PdfPageGeometry:
    """
    Build a PdfPageGeometry describing a pypdfium2 page.

    Args:
        ppage: pypdfium2 PdfPage object
        angle: Page rotation angle in degrees (default: 0.0)
        boundary_type: The boundary type for the page (default: CROP_BOX)

    Returns:
        PdfPageGeometry whose media/crop/art/bleed/trim boxes are taken from
        the page, each falling back to the page's main bounding box when
        pypdfium2 reports no value for it.
    """

    def _box_or_fallback(raw_box, fallback: BoundingBox) -> BoundingBox:
        # pypdfium2 hands back (x0, y0, x1, y1) with a bottom-left origin;
        # a falsy result means that box is not defined on the page.
        if raw_box:
            return BoundingBox.from_tuple(raw_box, CoordOrigin.BOTTOMLEFT)
        return fallback

    with pypdfium2_lock:
        # Main bounding box of the page, used as the fallback for the rest.
        page_bbox = BoundingBox.from_tuple(ppage.get_bbox(), CoordOrigin.BOTTOMLEFT)

        # Query every page box while the pdfium lock is held.
        raw_media = ppage.get_mediabox()
        raw_crop = ppage.get_cropbox()
        raw_art = ppage.get_artbox()
        raw_bleed = ppage.get_bleedbox()
        raw_trim = ppage.get_trimbox()

    media_bbox = _box_or_fallback(raw_media, page_bbox)
    crop_bbox = _box_or_fallback(raw_crop, page_bbox)
    art_bbox = _box_or_fallback(raw_art, page_bbox)
    bleed_bbox = _box_or_fallback(raw_bleed, page_bbox)
    trim_bbox = _box_or_fallback(raw_trim, page_bbox)

    geometry = PdfPageGeometry(
        angle=angle,
        rect=BoundingRectangle.from_bounding_box(page_bbox),
        boundary_type=boundary_type,
        art_bbox=art_bbox,
        bleed_bbox=bleed_bbox,
        crop_bbox=crop_bbox,
        media_bbox=media_bbox,
        trim_bbox=trim_bbox,
    )
    return geometry
95
+
96
+
97
+ if TYPE_CHECKING:
98
+ from docling.datamodel.document import InputDocument
99
+
100
_log = logging.getLogger(__name__)


# Major version of the installed pypdfium2 distribution.
# Needed because pypdfium2 5.x renamed PdfObject.get_pos() -> get_bounds(),
# so callers must pick the method name that matches the installed release.
_PYPDFIUM2_MAJOR_VERSION = int(version("pypdfium2").partition(".")[0])
106
+
107
+
108
class PyPdfiumPageBackend(PdfPageBackend):
    """Page backend that reads text, images and geometry through pypdfium2.

    Every call into pypdfium2 is made while holding ``pypdfium2_lock``. The
    methods take care never to call another locking method while the lock is
    held.
    """

    def __init__(
        self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
    ):
        """Load page ``page_no`` of ``pdfium_doc``.

        A ``PdfiumError`` while loading is logged and marks the page invalid
        instead of propagating. ``document_hash`` is only used in that log
        message.
        """
        # Note: lock applied by the caller
        self.valid = True  # No better way to tell from pypdfium.
        try:
            self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
        except PdfiumError:
            _log.info(
                f"An exception occurred when loading page {page_no} of document {document_hash}.",
                exc_info=True,
            )
            self.valid = False
            # Keep the attribute defined on the failure path so the object is
            # in the same state as after unload().
            self._ppage = None
        self.text_page: Optional[PdfTextPage] = None

    def is_valid(self) -> bool:
        """Return False when the page could not be loaded in ``__init__``."""
        return self.valid

    def _compute_text_cells(self) -> List[TextCell]:
        """Compute text cells from pypdfium.

        The raw text rectangles reported by pypdfium2 are converted to
        top-left-origin cells and then merged horizontally into larger cells.
        The text page is created on first use and kept in ``self.text_page``.
        """
        with pypdfium2_lock:
            if not self.text_page:
                self.text_page = self._ppage.get_textpage()

        cells = []

        # get_size() takes the lock itself, so it is called outside of it.
        page_size = self.get_size()

        with pypdfium2_lock:
            for i in range(self.text_page.count_rects()):
                rect = self.text_page.get_rect(i)
                text_piece = self.text_page.get_text_bounded(*rect)
                x0, y0, x1, y1 = rect
                cells.append(
                    TextCell(
                        index=i,
                        text=text_piece,
                        orig=text_piece,
                        from_ocr=False,
                        rect=BoundingRectangle.from_bounding_box(
                            BoundingBox(
                                l=x0,
                                b=y0,
                                r=x1,
                                t=y1,
                                coord_origin=CoordOrigin.BOTTOMLEFT,
                            )
                        ).to_top_left_origin(page_size.height),
                    )
                )

        # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
        # The cell merging code below is to clean this up.
        def merge_horizontal_cells(
            cells: List[TextCell],
            horizontal_threshold_factor: float = 1.0,
            vertical_threshold_factor: float = 0.5,
        ) -> List[TextCell]:
            """Group cells into rows, then merge horizontally close cells."""
            if not cells:
                return []

            def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
                # Consecutive cells stay in the same row while both their top
                # and bottom edges are within a fraction of the current row
                # height of the row's top and bottom.
                rows = []
                current_row = [cells[0]]
                row_top = cells[0].rect.to_bounding_box().t
                row_bottom = cells[0].rect.to_bounding_box().b
                row_height = cells[0].rect.to_bounding_box().height

                for cell in cells[1:]:
                    vertical_threshold = row_height * vertical_threshold_factor
                    if (
                        abs(cell.rect.to_bounding_box().t - row_top)
                        <= vertical_threshold
                        and abs(cell.rect.to_bounding_box().b - row_bottom)
                        <= vertical_threshold
                    ):
                        current_row.append(cell)
                        row_top = min(row_top, cell.rect.to_bounding_box().t)
                        row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
                        row_height = row_bottom - row_top
                    else:
                        rows.append(current_row)
                        current_row = [cell]
                        row_top = cell.rect.to_bounding_box().t
                        row_bottom = cell.rect.to_bounding_box().b
                        row_height = cell.rect.to_bounding_box().height

                if current_row:
                    rows.append(current_row)

                return rows

            def merge_row(row: List[TextCell]) -> List[TextCell]:
                # A cell joins the current group when the horizontal gap to
                # the previous cell is at most the average height of the two
                # cells times horizontal_threshold_factor.
                merged = []
                current_group = [row[0]]

                for cell in row[1:]:
                    prev_cell = current_group[-1]
                    avg_height = (
                        prev_cell.rect.height + cell.rect.to_bounding_box().height
                    ) / 2
                    if (
                        cell.rect.to_bounding_box().l
                        - prev_cell.rect.to_bounding_box().r
                        <= avg_height * horizontal_threshold_factor
                    ):
                        current_group.append(cell)
                    else:
                        merged.append(merge_group(current_group))
                        current_group = [cell]

                if current_group:
                    merged.append(merge_group(current_group))

                return merged

            def merge_group(group: List[TextCell]) -> TextCell:
                # A single cell is returned untouched; several cells become
                # one cell spanning their union, with the text re-read from
                # pypdfium2 for the merged rectangle.
                if len(group) == 1:
                    return group[0]

                merged_bbox = BoundingBox(
                    l=min(cell.rect.to_bounding_box().l for cell in group),
                    t=min(cell.rect.to_bounding_box().t for cell in group),
                    r=max(cell.rect.to_bounding_box().r for cell in group),
                    b=max(cell.rect.to_bounding_box().b for cell in group),
                )

                assert self.text_page is not None
                bbox = merged_bbox.to_bottom_left_origin(page_size.height)
                with pypdfium2_lock:
                    merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())

                return TextCell(
                    index=group[0].index,
                    text=merged_text,
                    orig=merged_text,
                    rect=BoundingRectangle.from_bounding_box(merged_bbox),
                    from_ocr=False,
                )

            rows = group_rows(cells)
            merged_cells = [cell for row in rows for cell in merge_row(row)]

            # Renumber the surviving cells, starting at 1.
            for i, cell in enumerate(merged_cells, 1):
                cell.index = i

            return merged_cells

        return merge_horizontal_cells(cells)

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the bounding boxes of the image objects on the page.

        Boxes are corrected for a page rotation of 90, 180 or 270 degrees,
        converted to top-left origin and scaled by ``scale``. Boxes whose
        area is not greater than ``AREA_THRESHOLD`` are skipped.

        The image bounds are collected while the pdfium lock is held and only
        yielded after it has been released. Yielding from inside the ``with``
        block would keep the lock held between items, so a consumer calling
        any other locking method of this backend during iteration could
        block, and an abandoned generator would keep the lock.
        """
        AREA_THRESHOLD = 0  # 32 * 32
        page_size = self.get_size()

        with pypdfium2_lock:
            rotation = self._ppage.get_rotation()
            positions = []
            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
                if _PYPDFIUM2_MAJOR_VERSION >= 5:
                    positions.append(obj.get_bounds())  # pypdfium2 >= 5.x
                else:
                    positions.append(obj.get_pos())  # pypdfium2 <= 4.x

        for pos in positions:
            if rotation == 90:
                pos = (
                    pos[1],
                    page_size.height - pos[2],
                    pos[3],
                    page_size.height - pos[0],
                )
            elif rotation == 180:
                pos = (
                    page_size.width - pos[2],
                    page_size.height - pos[3],
                    page_size.width - pos[0],
                    page_size.height - pos[1],
                )
            elif rotation == 270:
                pos = (
                    page_size.width - pos[3],
                    pos[0],
                    page_size.width - pos[1],
                    pos[2],
                )

            cropbox = BoundingBox.from_tuple(
                pos, origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(page_height=page_size.height)
            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)
                yield cropbox

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text pypdfium2 reports inside ``bbox``.

        ``bbox`` is converted to bottom-left origin first when it uses a
        different origin.
        """
        with pypdfium2_lock:
            if not self.text_page:
                self.text_page = self._ppage.get_textpage()

        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
            bbox = bbox.to_bottom_left_origin(self.get_size().height)

        with pypdfium2_lock:
            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return a SegmentedPdfPage with text-line cells only.

        Returns None when the page is invalid. Character and word cells are
        left empty.
        """
        if not self.valid:
            return None

        text_cells = self._compute_text_cells()

        # Get the PDF page geometry from pypdfium2
        dimension = get_pdf_page_geometry(self._ppage)

        # Create SegmentedPdfPage
        return SegmentedPdfPage(
            dimension=dimension,
            textline_cells=text_cells,
            char_cells=[],
            word_cells=[],
            has_textlines=len(text_cells) > 0,
            has_words=False,
            has_chars=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the merged text cells of the page."""
        return self._compute_text_cells()

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or the ``cropbox`` region of it, as a PIL image.

        The returned image measures ``cropbox.width * scale`` by
        ``cropbox.height * scale`` pixels (rounded). Without a ``cropbox``
        the full page is rendered.
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # The render call receives per-edge amounts in `crop`, so the
            # right and top values are turned into distances from the right
            # and top page edges.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        with pypdfium2_lock:
            image = (
                self._ppage.render(
                    scale=scale * 1.5,
                    rotation=0,  # no additional rotation
                    crop=padbox.as_tuple(),
                )
                .to_pil()
                .resize(
                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
                )
            )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page width and height as reported by pypdfium2."""
        with pypdfium2_lock:
            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the pypdfium2 page and text page."""
        self._ppage = None
        self.text_page = None
380
+
381
+
382
class PyPdfiumDocumentBackend(PdfDocumentBackend):
    """PDF document backend that opens and pages a document with pypdfium2."""

    def __init__(
        self,
        in_doc: "InputDocument",
        path_or_stream: Union[BytesIO, Path],
        options: Optional[PdfBackendOptions] = None,
    ):
        """Open the document with pypdfium2.

        Args:
            in_doc: The input document descriptor.
            path_or_stream: In-memory stream or filesystem path of the PDF.
            options: Backend options; its ``password``, when set, is used to
                open the document. When omitted, a fresh
                ``PdfBackendOptions()`` is created for this instance.

        Raises:
            RuntimeError: If pypdfium2 raises ``PdfiumError`` while loading.
        """
        # Build the default per call: a ``PdfBackendOptions()`` default in the
        # signature would be one object shared by every backend instance.
        if options is None:
            options = PdfBackendOptions()

        super().__init__(in_doc, path_or_stream, options)

        password = (
            self.options.password.get_secret_value() if self.options.password else None
        )
        try:
            with pypdfium2_lock:
                self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
        except PdfiumError as e:
            raise RuntimeError(
                f"pypdfium could not load document with hash {self.document_hash}"
            ) from e

    def page_count(self) -> int:
        """Return the number of pages in the opened document."""
        with pypdfium2_lock:
            return len(self._pdoc)

    def load_page(self, page_no: int) -> PyPdfiumPageBackend:
        """Return a page backend for page ``page_no``.

        The lock is held here because ``PyPdfiumPageBackend.__init__``
        expects its caller to hold it.
        """
        with pypdfium2_lock:
            return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)

    def is_valid(self) -> bool:
        """Return True when the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Close the pypdfium2 document and drop the reference to it.

        Safe to call more than once, and after a failed ``__init__``: the
        document is only closed when one is still held.
        """
        super().unload()
        with pypdfium2_lock:
            pdoc = getattr(self, "_pdoc", None)
            if pdoc is not None:
                pdoc.close()
            self._pdoc = None