docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138)
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,237 @@
1
+ import logging
2
+ import random
3
+ from collections.abc import Iterable
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from typing import List, Optional, Union
7
+
8
+ import pypdfium2 as pdfium
9
+ from docling_core.types.doc import BoundingBox, CoordOrigin, Size
10
+ from docling_core.types.doc.page import (
11
+ BoundingRectangle,
12
+ SegmentedPdfPage,
13
+ TextCell,
14
+ )
15
+ from docling_parse.pdf_parsers import pdf_parser_v1
16
+ from PIL import Image, ImageDraw
17
+ from pypdfium2 import PdfPage
18
+
19
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
20
+ from docling.backend.pypdfium2_backend import get_pdf_page_geometry
21
+ from docling.datamodel.document import InputDocument
22
+
23
+ _log = logging.getLogger(__name__)
24
+
25
+
26
class DoclingParsePageBackend(PdfPageBackend):
    """Single-page backend backed by docling-parse (v1) and pypdfium2.

    Text cells and bitmap boxes are read from the dictionary returned by
    ``parser.parse_pdf_from_key_on_page``; page size and rendering are
    delegated to the pypdfium2 page object handed to the constructor.
    """

    def __init__(
        self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        """Parse page ``page_no`` of the document registered under ``document_hash``.

        Args:
            parser: docling-parse v1 parser that already holds the document.
            document_hash: Key under which the document was loaded in ``parser``.
            page_no: Page number forwarded to the parser.
            page_obj: pypdfium2 page used for geometry and rendering.
        """
        self._ppage = page_obj
        # Define the attribute up front so that an invalid page never leaves
        # the instance without ``_dpage`` (previously an AttributeError risk).
        self._dpage = None
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        self.valid = "pages" in parsed_page
        if self.valid:
            self._dpage = parsed_page["pages"][0]
        else:
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        """Return True when the parser produced data for this page."""
        return self.valid

    def _compute_text_cells(self) -> List[TextCell]:
        """Compute text cells from docling-parse data.

        Returns:
            One ``TextCell`` per parser cell, with its box rescaled from the
            parser's page dimensions to the pypdfium2 page size and converted
            to a top-left origin. Empty when the page is invalid.
        """
        cells: List[TextCell] = []
        if not self.valid:
            return cells

        page_size = self.get_size()
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        for index, cell in enumerate(self._dpage["cells"]):
            x0, y0, x1, y1 = cell["box"]["device"]

            # Normalise the box so that (x0, y0) is the lower-left corner.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = cell["content"]["rnormalized"]
            bottom_left_box = BoundingBox(
                l=x0 * page_size.width / parser_width,
                b=y0 * page_size.height / parser_height,
                r=x1 * page_size.width / parser_width,
                t=y1 * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )
            cells.append(
                TextCell(
                    index=index,
                    text=text_piece,
                    orig=text_piece,
                    from_ocr=False,
                    rect=BoundingRectangle.from_bounding_box(
                        bottom_left_box
                    ).to_top_left_origin(page_size.height),
                )
            )

        return cells

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the space-joined text of all cells mostly inside ``bbox``.

        A cell contributes when more than half of its own area overlaps
        ``bbox`` (``intersection_over_self > 0.5``). Returns ``""`` for an
        invalid page.
        """
        if not self.valid:
            return ""

        text_piece = ""
        page_size = self.get_size()
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        # Find intersecting cells on the page
        for cell in self._dpage["cells"]:
            x0, y0, x1, y1 = cell["box"]["device"]
            cell_bbox = BoundingBox(
                l=x0 * scale * page_size.width / parser_width,
                b=y0 * scale * page_size.height / parser_height,
                r=x1 * scale * page_size.width / parser_width,
                t=y1 * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            if cell_bbox.intersection_over_self(bbox) > 0.5:
                # Separator only once something has been collected already.
                if text_piece:
                    text_piece += " "
                text_piece += cell["content"]["rnormalized"]

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Build a ``SegmentedPdfPage`` holding only text-line cells.

        Returns:
            ``None`` for an invalid page; otherwise a page whose geometry comes
            from pypdfium2 and whose char/word cell lists are empty.
        """
        if not self.valid:
            return None

        text_cells = self._compute_text_cells()

        # Get the PDF page geometry from pypdfium2
        dimension = get_pdf_page_geometry(self._ppage)

        # NOTE(review): DoclingParseV2PageBackend passes ``has_textlines`` for
        # the same purpose - confirm which keyword SegmentedPdfPage expects.
        return SegmentedPdfPage(
            dimension=dimension,
            textline_cells=text_cells,
            char_cells=[],
            word_cells=[],
            has_lines=len(text_cells) > 0,
            has_words=False,
            has_chars=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the text cells of this page (empty for an invalid page)."""
        return self._compute_text_cells()

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the bounding boxes of the page's bitmap images.

        Boxes are converted to a top-left origin and multiplied by ``scale``.
        Only boxes whose area exceeds ``AREA_THRESHOLD`` are yielded. Nothing
        is yielded for an invalid page.
        """
        AREA_THRESHOLD = 0  # 32 * 32

        # Same guard as the other accessors: no parsed data, nothing to report.
        if not self.valid:
            return

        # The page height does not change between images; query it once.
        page_height = self.get_size().height

        for bitmap in self._dpage["images"]:
            cropbox = BoundingBox.from_tuple(
                bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(page_height)

            if cropbox.area() > AREA_THRESHOLD:
                yield cropbox.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or the ``cropbox`` region of it, as a PIL image.

        Args:
            scale: Output scale; the result measures ``cropbox`` size times
                ``scale`` (rounded).
            cropbox: Region to render. The whole page is rendered when omitted.

        Returns:
            The rendered image, rasterised at 1.5x ``scale`` and resized down.
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Presumably the per-side margins to trim, as taken by the
            # ``crop`` argument of pypdfium2's render - confirm.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        target_size = (round(cropbox.width * scale), round(cropbox.height * scale))
        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=target_size)
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page width and height as reported by pypdfium2."""
        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the pypdfium2 page and the parsed page data."""
        self._ppage = None
        self._dpage = None
199
+
200
class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend pairing a pypdfium2 document with a docling-parse v1 parser."""

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the PDF with pypdfium2 and load it into docling-parse.

        Args:
            in_doc: Input document descriptor, forwarded to the base class.
            path_or_stream: Filesystem path or in-memory stream of the PDF.

        Raises:
            RuntimeError: If docling-parse reports that it could not load the
                document (also the case when ``path_or_stream`` is neither a
                ``BytesIO`` nor a ``Path``).
        """
        super().__init__(in_doc, path_or_stream)

        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v1()

        success = False
        if isinstance(self.path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
                self.document_hash, self.path_or_stream
            )
        elif isinstance(self.path_or_stream, Path):
            success = self.parser.load_document(
                self.document_hash, str(self.path_or_stream)
            )

        if not success:
            # Construction fails here, so unload() will never run for this
            # instance; release the pypdfium2 handle now instead of leaking it.
            self._pdoc.close()
            self._pdoc = None
            raise RuntimeError(
                f"docling-parse could not load document with hash {self.document_hash}."
            )

    def page_count(self) -> int:
        """Return the number of pages according to pypdfium2."""
        return len(self._pdoc)  # To be replaced with docling-parse API

    def load_page(self, page_no: int) -> DoclingParsePageBackend:
        """Create the page backend for ``page_no``, used as an index into the pypdfium2 document."""
        return DoclingParsePageBackend(
            self.parser, self.document_hash, page_no, self._pdoc[page_no]
        )

    def is_valid(self) -> bool:
        """Return True when the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Release the base-class resources, the parser's copy and the pypdfium2 handle."""
        super().unload()
        self.parser.unload_document(self.document_hash)
        # Guard so that the pypdfium2 handle is not closed twice on repeated calls.
        if self._pdoc is not None:
            self._pdoc.close()
            self._pdoc = None
@@ -0,0 +1,276 @@
1
+ import logging
2
+ import random
3
+ from collections.abc import Iterable
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, List, Optional, Union
7
+
8
+ import pypdfium2 as pdfium
9
+ from docling_core.types.doc import BoundingBox, CoordOrigin
10
+ from docling_core.types.doc.page import (
11
+ BoundingRectangle,
12
+ PdfPageBoundaryType,
13
+ PdfPageGeometry,
14
+ SegmentedPdfPage,
15
+ TextCell,
16
+ )
17
+ from docling_parse.pdf_parsers import pdf_parser_v2
18
+ from PIL import Image, ImageDraw
19
+ from pypdfium2 import PdfPage
20
+
21
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
22
+ from docling.backend.pypdfium2_backend import get_pdf_page_geometry
23
+ from docling.datamodel.base_models import Size
24
+ from docling.utils.locks import pypdfium2_lock
25
+
26
+ if TYPE_CHECKING:
27
+ from docling.datamodel.document import InputDocument
28
+
29
+ _log = logging.getLogger(__name__)
30
+
31
+
32
class DoclingParseV2PageBackend(PdfPageBackend):
    """Single-page backend backed by docling-parse (v2) and pypdfium2.

    Text cells and image boxes are read from the ``"sanitized"`` section of
    the parser output, which stores rows under ``"data"`` with column names
    under ``"header"``. Page size and rendering go through the pypdfium2 page
    object, always while holding ``pypdfium2_lock``.
    """

    def __init__(
        self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        """Parse page ``page_no`` of the document registered under ``document_hash``.

        Args:
            parser: docling-parse v2 parser that already holds the document.
            document_hash: Key under which the document was loaded in ``parser``.
            page_no: Page number forwarded to the parser.
            page_obj: pypdfium2 page used for geometry and rendering.
        """
        self._ppage = page_obj
        # Define the attribute up front so that an invalid page never leaves
        # the instance without ``_dpage`` (previously an AttributeError risk).
        self._dpage = None
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        # Exactly one page entry is expected for a single-page request.
        self.valid = "pages" in parsed_page and len(parsed_page["pages"]) == 1
        if self.valid:
            self._dpage = parsed_page["pages"][0]
        else:
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        """Return True when the parser produced exactly one page entry."""
        return self.valid

    def _compute_text_cells(self) -> List[TextCell]:
        """Compute text cells from docling-parse v2 data.

        Returns:
            One ``TextCell`` per parser row, with its box rescaled from the
            parser's page dimensions to the pypdfium2 page size and converted
            to a top-left origin. Empty when the page is invalid.
        """
        cells: List[TextCell] = []
        if not self.valid:
            return cells

        page_size = self.get_size()

        sanitized = self._dpage["sanitized"]
        parser_width = sanitized["dimension"]["width"]
        parser_height = sanitized["dimension"]["height"]

        cells_data = sanitized["cells"]["data"]
        cells_header = sanitized["cells"]["header"]

        # With no rows the header is never consulted (as before).
        if not cells_data:
            return cells

        # Column positions are the same for every row: resolve them once
        # instead of scanning the header five times per cell.
        x0_col = cells_header.index("x0")
        y0_col = cells_header.index("y0")
        x1_col = cells_header.index("x1")
        y1_col = cells_header.index("y1")
        text_col = cells_header.index("text")

        for index, row in enumerate(cells_data):
            x0, y0, x1, y1 = row[x0_col], row[y0_col], row[x1_col], row[y1_col]

            # Normalise the box so that (x0, y0) is the lower-left corner.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = row[text_col]
            bottom_left_box = BoundingBox(
                l=x0 * page_size.width / parser_width,
                b=y0 * page_size.height / parser_height,
                r=x1 * page_size.width / parser_width,
                t=y1 * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )
            cells.append(
                TextCell(
                    index=index,
                    text=text_piece,
                    orig=text_piece,
                    from_ocr=False,
                    rect=BoundingRectangle.from_bounding_box(
                        bottom_left_box
                    ).to_top_left_origin(page_size.height),
                )
            )

        return cells

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the space-joined text of all cells mostly inside ``bbox``.

        A cell contributes when more than half of its own area overlaps
        ``bbox`` (``intersection_over_self > 0.5``). Returns ``""`` for an
        invalid page.
        """
        if not self.valid:
            return ""

        text_piece = ""
        page_size = self.get_size()

        sanitized = self._dpage["sanitized"]
        parser_width = sanitized["dimension"]["width"]
        parser_height = sanitized["dimension"]["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        cells_data = sanitized["cells"]["data"]
        cells_header = sanitized["cells"]["header"]

        # With no rows the header is never consulted (as before).
        if not cells_data:
            return text_piece

        # Resolve the column positions once rather than once per row.
        x0_col = cells_header.index("x0")
        y0_col = cells_header.index("y0")
        x1_col = cells_header.index("x1")
        y1_col = cells_header.index("y1")
        text_col = cells_header.index("text")

        # Find intersecting cells on the page
        for row in cells_data:
            cell_bbox = BoundingBox(
                l=row[x0_col] * scale * page_size.width / parser_width,
                b=row[y0_col] * scale * page_size.height / parser_height,
                r=row[x1_col] * scale * page_size.width / parser_width,
                t=row[y1_col] * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            if cell_bbox.intersection_over_self(bbox) > 0.5:
                # Separator only once something has been collected already.
                if text_piece:
                    text_piece += " "
                text_piece += row[text_col]

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Build a ``SegmentedPdfPage`` holding only text-line cells.

        Returns:
            ``None`` for an invalid page; otherwise a page whose geometry comes
            from pypdfium2 and whose char/word cell lists are empty.
        """
        if not self.valid:
            return None

        text_cells = self._compute_text_cells()

        # Get the PDF page geometry from pypdfium2
        dimension = get_pdf_page_geometry(self._ppage)

        # NOTE(review): DoclingParsePageBackend (v1) passes ``has_lines`` for
        # the same purpose - confirm which keyword SegmentedPdfPage expects.
        return SegmentedPdfPage(
            dimension=dimension,
            textline_cells=text_cells,
            char_cells=[],
            word_cells=[],
            has_textlines=len(text_cells) > 0,
            has_words=False,
            has_chars=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the text cells of this page (empty for an invalid page)."""
        return self._compute_text_cells()

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the bounding boxes of the page's images.

        Boxes are converted to a top-left origin and multiplied by ``scale``.
        Only boxes whose area exceeds ``AREA_THRESHOLD`` are yielded. Nothing
        is yielded for an invalid page.
        """
        AREA_THRESHOLD = 0  # 32 * 32

        # Same guard as the other accessors: no parsed data, nothing to report.
        if not self.valid:
            return

        images = self._dpage["sanitized"]["images"]["data"]
        images_header = self._dpage["sanitized"]["images"]["header"]

        # With no rows the header is never consulted (as before).
        if not images:
            return

        # Resolve the column positions once rather than once per row.
        x0_col = images_header.index("x0")
        y0_col = images_header.index("y0")
        x1_col = images_header.index("x1")
        y1_col = images_header.index("y1")

        # get_size() takes pypdfium2_lock; query the page height a single time.
        page_height = self.get_size().height

        for row in images:
            cropbox = BoundingBox.from_tuple(
                (row[x0_col], row[y0_col], row[x1_col], row[y1_col]),
                origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height)

            if cropbox.area() > AREA_THRESHOLD:
                yield cropbox.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or the ``cropbox`` region of it, as a PIL image.

        Args:
            scale: Output scale; the result measures ``cropbox`` size times
                ``scale`` (rounded).
            cropbox: Region to render. The whole page is rendered when omitted.

        Returns:
            The rendered image, rasterised at 1.5x ``scale`` and resized down.
        """
        # Must happen before taking the lock below: get_size() acquires
        # pypdfium2_lock itself.
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Presumably the per-side margins to trim, as taken by the
            # ``crop`` argument of pypdfium2's render - confirm.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        target_size = (round(cropbox.width * scale), round(cropbox.height * scale))
        with pypdfium2_lock:
            image = (
                self._ppage.render(
                    scale=scale * 1.5,
                    rotation=0,  # no additional rotation
                    crop=padbox.as_tuple(),
                )
                .to_pil()
                .resize(size=target_size)
            )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page width and height as reported by pypdfium2 (takes the lock)."""
        with pypdfium2_lock:
            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the pypdfium2 page and the parsed page data."""
        self._ppage = None
        self._dpage = None
+
227
+
228
class DoclingParseV2DocumentBackend(PdfDocumentBackend):
    """Document backend pairing a pypdfium2 document with a docling-parse v2 parser.

    Every access to the pypdfium2 document is wrapped in ``pypdfium2_lock``.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the PDF with pypdfium2 and load it into docling-parse v2.

        Args:
            in_doc: Input document descriptor, forwarded to the base class.
            path_or_stream: Filesystem path or in-memory stream of the PDF.

        Raises:
            RuntimeError: If docling-parse reports that it could not load the
                document (also the case when ``path_or_stream`` is neither a
                ``BytesIO`` nor a ``Path``).
        """
        super().__init__(in_doc, path_or_stream)

        with pypdfium2_lock:
            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v2("fatal")

        success = False
        if isinstance(self.path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
                self.document_hash, self.path_or_stream
            )
        elif isinstance(self.path_or_stream, Path):
            success = self.parser.load_document(
                self.document_hash, str(self.path_or_stream)
            )

        if not success:
            # Construction fails here, so unload() will never run for this
            # instance; release the pypdfium2 handle now instead of leaking it.
            with pypdfium2_lock:
                self._pdoc.close()
                self._pdoc = None
            raise RuntimeError(
                f"docling-parse v2 could not load document {self.document_hash}."
            )

    def page_count(self) -> int:
        """Return the page count reported by docling-parse.

        The pypdfium2 count is compared against it and a mismatch is logged
        as an error; the docling-parse value is returned either way.
        """
        # return len(self._pdoc)  # To be replaced with docling-parse API

        pdfium_count = len(self._pdoc)
        parser_count = self.parser.number_of_pages(self.document_hash)

        if pdfium_count != parser_count:
            _log.error(f"Inconsistent number of pages: {pdfium_count}!={parser_count}")

        return parser_count

    def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
        """Create the page backend for ``page_no``, used as an index into the pypdfium2 document."""
        with pypdfium2_lock:
            return DoclingParseV2PageBackend(
                self.parser, self.document_hash, page_no, self._pdoc[page_no]
            )

    def is_valid(self) -> bool:
        """Return True when the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Release the base-class resources, the parser's copy and the pypdfium2 handle."""
        super().unload()
        self.parser.unload_document(self.document_hash)
        with pypdfium2_lock:
            # Guard so that the pypdfium2 handle is not closed twice on repeated calls.
            if self._pdoc is not None:
                self._pdoc.close()
                self._pdoc = None