docling 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/__init__.py ADDED
File without changes
File without changes
@@ -0,0 +1,59 @@
1
+ from abc import ABC, abstractmethod
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Any, Iterable, Optional, Union
5
+
6
+ from PIL import Image
7
+
8
+
9
class PdfPageBackend(ABC):
    """Abstract interface for one loaded PDF page.

    Concrete backends wrap a backend-specific page object and provide text
    extraction, bitmap discovery, rendering and size queries for it.
    """

    def __init__(self, page_obj: Any) -> None:
        """Accept the backend-specific page object.

        The base implementation stores nothing; subclasses keep their own
        reference to ``page_obj``.
        """

    @abstractmethod
    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
        """Return the page text associated with the region ``bbox``."""

    @abstractmethod
    def get_text_cells(self) -> Iterable["Cell"]:
        """Return the text cells found on the page."""

    @abstractmethod
    def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
        """Return bounding boxes of bitmap images on the page, scaled by ``scale``."""

    @abstractmethod
    def get_page_image(
        self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
    ) -> Image.Image:
        """Render the page, or only ``cropbox`` when given, as a PIL image."""

    @abstractmethod
    def get_size(self) -> "PageSize":
        """Return the page dimensions."""

    @abstractmethod
    def unload(self):
        """Drop the references this backend holds to the underlying page."""
38
+
39
+
40
class PdfDocumentBackend(ABC):
    """Abstract interface for an opened PDF document.

    Every member, including the constructor, is abstract: a concrete backend
    has to define how a document is opened, queried for pages and released.
    """

    @abstractmethod
    def __init__(self, path_or_stream: Union[BytesIO, Path]):
        """Open the document from a filesystem path or an in-memory stream."""

    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
        """Return a page backend for the page at index ``page_no``."""

    @abstractmethod
    def page_count(self) -> int:
        """Return how many pages the document has."""

    @abstractmethod
    def is_valid(self) -> bool:
        """Tell whether the opened document can be processed."""

    @abstractmethod
    def unload(self):
        """Release whatever resources the backend holds for the document."""
@@ -0,0 +1,207 @@
1
+ import logging
2
+ import random
3
+ import time
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from typing import Iterable, Optional, Union
7
+
8
+ import pypdfium2 as pdfium
9
+ from docling_parse.docling_parse import pdf_parser
10
+ from PIL import Image, ImageDraw
11
+ from pypdfium2 import PdfPage
12
+
13
+ from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
14
+ from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
15
+
16
# Module-level logger, named after this module through __name__.
_log = logging.getLogger(__name__)
17
+
18
+
19
class DoclingParsePageBackend(PdfPageBackend):
    """Page backend that reads text from docling-parse and renders via pypdfium2.

    Text cells and bitmap boxes come from the docling-parse page dictionary;
    the page size and rendered images come from the pypdfium2 page object.
    """

    def __init__(self, page_obj: PdfPage, docling_page_obj):
        """Keep references to the pypdfium2 page and the docling-parse page dict."""
        super().__init__(page_obj)
        self._ppage = page_obj
        self._dpage = docling_page_obj
        # Not populated anywhere in this class; only reset again in unload().
        self.text_page = None

    def _cell_bbox(self, cell, page_size: PageSize) -> BoundingBox:
        """Convert one docling-parse cell box into a top-left-origin page box.

        The parser box is rescaled from the parser's page dimensions
        (``self._dpage["width"]`` / ``["height"]``) to ``page_size``.
        """
        x0, y0, x1, y1 = cell["box"]["device"]

        # Boxes may arrive with flipped corners; normalize so l <= r and b <= t.
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0

        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        return BoundingBox(
            l=x0 * page_size.width / parser_width,
            b=y0 * page_size.height / parser_height,
            r=x1 * page_size.width / parser_width,
            t=y1 * page_size.height / parser_height,
            coord_origin=CoordOrigin.BOTTOMLEFT,
        ).to_top_left_origin(page_height=page_size.height)

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text of all cells that lie mostly inside ``bbox``.

        A cell contributes when more than half of its own area intersects
        ``bbox``. Contributions are joined with single spaces in cell order.
        Cells with no area are skipped instead of triggering a division by
        zero.
        """
        # TODO: a scale parameter for get_text_in_rect across backends is still
        # open (the original code used a fixed scale of 1 here).
        page_size = self.get_size()
        text_piece = ""

        for cell in self._dpage["cells"]:
            cell_bbox = self._cell_bbox(cell, page_size)

            cell_area = cell_bbox.area()
            if cell_area <= 0:
                # Degenerate box: the overlap fraction is undefined.
                continue

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area
            if overlap_frac > 0.5:
                if text_piece:
                    text_piece += " "
                text_piece += cell["content"]["rnormalized"]

        return text_piece

    def get_text_cells(self) -> Iterable[Cell]:
        """Return one ``Cell`` per docling-parse cell, ids counted from 0.

        Each bounding box is rescaled to the pypdfium2 page size and
        expressed with a top-left origin.
        """
        page_size = self.get_size()

        return [
            Cell(
                id=cell_id,
                text=cell["content"]["rnormalized"],
                bbox=self._cell_bbox(cell, page_size),
            )
            for cell_id, cell in enumerate(self._dpage["cells"])
        ]

    def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
        """Yield top-left-origin boxes of the page's larger bitmap images.

        Only boxes whose area exceeds ``AREA_THRESHOLD`` are yielded, each
        scaled by ``scale``.
        """
        AREA_THRESHOLD = 32 * 32

        images = self._dpage["images"]
        page_height = self.get_size().height  # invariant across the loop

        for bitmap in images:
            cropbox = BoundingBox.from_tuple(
                bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(page_height)

            if cropbox.area() > AREA_THRESHOLD:
                yield cropbox.scaled(scale=scale)

    def get_page_image(
        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or the region ``cropbox``, as a PIL image.

        The page is rendered at 1.5x the requested scale and then resized
        down to ``cropbox`` size times ``scale``.
        """
        page_size = self.get_size()

        full_page = cropbox is None
        if full_page:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )

        # Computed before padbox is modified below.
        # NOTE(review): to_bottom_left_origin may hand back the same object for
        # a box that already has a bottom-left origin - confirm; in that case
        # the edits to padbox would otherwise distort this size.
        target_size = (round(cropbox.width * scale), round(cropbox.height * scale))

        if full_page:
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Turn the crop box into per-edge margins measured from the page
            # borders; the result is passed to render() as crop.
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=target_size)
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> PageSize:
        """Return the page size as reported by the pypdfium2 page."""
        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the page objects held by this backend."""
        self._ppage = None
        self._dpage = None
        self.text_page = None
172
+
173
+
174
class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend that opens a PDF with pypdfium2 and docling-parse."""

    def __init__(self, path_or_stream: Union[BytesIO, Path]):
        """Open the document and parse its cells with docling-parse.

        Args:
            path_or_stream: Filesystem path or in-memory stream of the PDF.
        """
        super().__init__(path_or_stream)
        self._pdoc = pdfium.PdfDocument(path_or_stream)
        # Parsing cells with docling_parser call
        parser = pdf_parser()

        start_pb_time = time.time()

        if isinstance(path_or_stream, BytesIO):
            self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
        else:
            self._parser_doc = parser.find_cells(str(path_or_stream))

        end_pb_time = time.time() - start_pb_time

        # A plain BytesIO has no ``name`` attribute, so fall back to a
        # placeholder instead of failing while building the log message.
        source_name = getattr(path_or_stream, "name", "<stream>")
        _log.info(
            "Time to parse %s with docling-parse: time=%.3f",
            source_name,
            end_pb_time,
        )

    def page_count(self) -> int:
        """Return the number of pages in the docling-parse result."""
        return len(self._parser_doc["pages"])

    def load_page(self, page_no: int) -> DoclingParsePageBackend:
        """Return a page backend pairing both representations of page ``page_no``."""
        return DoclingParsePageBackend(
            self._pdoc[page_no], self._parser_doc["pages"][page_no]
        )

    def is_valid(self) -> bool:
        """Return True when the parsed document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Close the pypdfium2 document and drop the parse result.

        Calling this more than once is harmless.
        """
        if self._pdoc is not None:
            self._pdoc.close()
            self._pdoc = None
        self._parser_doc = None
@@ -0,0 +1,233 @@
1
+ import random
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Optional, Union
5
+
6
+ import pypdfium2 as pdfium
7
+ import pypdfium2.raw as pdfium_c
8
+ from PIL import Image, ImageDraw
9
+ from pypdfium2 import PdfPage
10
+
11
+ from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
12
+ from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
+
14
+
15
class PyPdfiumPageBackend(PdfPageBackend):
    """Page backend that relies on pypdfium2 for text, bitmaps and rendering."""

    def __init__(self, page_obj: PdfPage):
        """Keep the pypdfium2 page; its text page is created on first use."""
        super().__init__(page_obj)
        self._ppage = page_obj
        self.text_page = None

    def _get_text_page(self):
        """Return the text page of the wrapped page, creating it on first use."""
        if not self.text_page:
            self.text_page = self._ppage.get_textpage()
        return self.text_page

    def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
        """Yield top-left-origin boxes of the page's larger image objects.

        Only boxes whose area exceeds ``AREA_THRESHOLD`` are yielded, each
        scaled by ``scale``.
        """
        AREA_THRESHOLD = 32 * 32
        page_height = self.get_size().height  # invariant across the loop

        for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
            cropbox = BoundingBox.from_tuple(
                obj.get_pos(), origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(page_height=page_height)

            if cropbox.area() > AREA_THRESHOLD:
                yield cropbox.scaled(scale=scale)

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text pypdfium2 reports inside ``bbox``.

        ``bbox`` is converted to a bottom-left origin first when it uses a
        different one.
        """
        text_page = self._get_text_page()

        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
            bbox = bbox.to_bottom_left_origin(self.get_size().height)

        return text_page.get_text_bounded(*bbox.as_tuple())

    @staticmethod
    def _merge_horizontal_cells(
        cells: List[Cell],
        horizontal_threshold_factor: float = 1.0,
        vertical_threshold_factor: float = 0.5,
    ) -> List[Cell]:
        """Merge horizontally adjacent cells that sit on the same row.

        Cells are grouped into rows by comparing their top and bottom edges
        with the current row; within a row, neighbours whose gap is at most
        the average cell height times ``horizontal_threshold_factor`` are
        merged. The resulting cells are renumbered starting at 1.
        """
        if not cells:
            return []

        def group_rows(cells: List[Cell]) -> List[List[Cell]]:
            # Walk the cells in order and start a new row whenever the top or
            # bottom edge strays too far from the row collected so far.
            rows = []
            current_row = [cells[0]]
            row_top = cells[0].bbox.t
            row_bottom = cells[0].bbox.b
            row_height = cells[0].bbox.height

            for cell in cells[1:]:
                vertical_threshold = row_height * vertical_threshold_factor
                if (
                    abs(cell.bbox.t - row_top) <= vertical_threshold
                    and abs(cell.bbox.b - row_bottom) <= vertical_threshold
                ):
                    current_row.append(cell)
                    row_top = min(row_top, cell.bbox.t)
                    row_bottom = max(row_bottom, cell.bbox.b)
                    row_height = row_bottom - row_top
                else:
                    rows.append(current_row)
                    current_row = [cell]
                    row_top = cell.bbox.t
                    row_bottom = cell.bbox.b
                    row_height = cell.bbox.height

            if current_row:
                rows.append(current_row)

            return rows

        def merge_group(group: List[Cell]) -> Cell:
            # A single cell is returned untouched; otherwise the texts are
            # concatenated and the boxes replaced by their common envelope.
            if len(group) == 1:
                return group[0]

            merged_text = "".join(cell.text for cell in group)
            merged_bbox = BoundingBox(
                l=min(cell.bbox.l for cell in group),
                t=min(cell.bbox.t for cell in group),
                r=max(cell.bbox.r for cell in group),
                b=max(cell.bbox.b for cell in group),
            )
            return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)

        def merge_row(row: List[Cell]) -> List[Cell]:
            merged = []
            current_group = [row[0]]

            for cell in row[1:]:
                prev_cell = current_group[-1]
                avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
                if (
                    cell.bbox.l - prev_cell.bbox.r
                    <= avg_height * horizontal_threshold_factor
                ):
                    current_group.append(cell)
                else:
                    merged.append(merge_group(current_group))
                    current_group = [cell]

            if current_group:
                merged.append(merge_group(current_group))

            return merged

        merged_cells = [
            cell for row in group_rows(cells) for cell in merge_row(row)
        ]

        # Ids are reassigned after merging and start at 1 here.
        for i, cell in enumerate(merged_cells, 1):
            cell.id = i

        return merged_cells

    def get_text_cells(self) -> Iterable[Cell]:
        """Return the page's text cells after merging fragments per row.

        One cell is built per pypdfium2 text rectangle, with a top-left
        origin box, and the raw cells are then merged horizontally.
        """
        text_page = self._get_text_page()
        page_size = self.get_size()

        cells = []
        for cell_id in range(text_page.count_rects()):
            rect = text_page.get_rect(cell_id)
            x0, y0, x1, y1 = rect
            cells.append(
                Cell(
                    id=cell_id,
                    text=text_page.get_text_bounded(*rect),
                    bbox=BoundingBox(
                        l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
                    ).to_top_left_origin(page_size.height),
                )
            )

        # PyPdfium2 produces very fragmented cells, with sub-word level
        # boundaries, in many PDFs. The cell merging is to clean this up.
        return self._merge_horizontal_cells(cells)

    def get_page_image(
        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or the region ``cropbox``, as a PIL image.

        The page is rendered at 1.5x the requested scale and then resized
        down to ``cropbox`` size times ``scale``.
        """
        page_size = self.get_size()

        full_page = cropbox is None
        if full_page:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )

        # Computed before padbox is modified below.
        # NOTE(review): to_bottom_left_origin may hand back the same object for
        # a box that already has a bottom-left origin - confirm; in that case
        # the edits to padbox would otherwise distort this size.
        target_size = (round(cropbox.width * scale), round(cropbox.height * scale))

        if full_page:
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Turn the crop box into per-edge margins measured from the page
            # borders; the result is passed to render() as crop.
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=target_size)
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> PageSize:
        """Return the page size as reported by the pypdfium2 page."""
        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the page and its text page."""
        self._ppage = None
        self.text_page = None
215
+
216
+
217
class PyPdfiumDocumentBackend(PdfDocumentBackend):
    """Document backend that opens a PDF with pypdfium2 only."""

    def __init__(self, path_or_stream: Union[BytesIO, Path]):
        """Open the document from a filesystem path or an in-memory stream."""
        super().__init__(path_or_stream)
        self._pdoc = pdfium.PdfDocument(path_or_stream)

    def page_count(self) -> int:
        """Return the number of pages in the pypdfium2 document."""
        return len(self._pdoc)

    def load_page(self, page_no: int) -> PyPdfiumPageBackend:
        """Return a page backend wrapping page ``page_no`` of the document."""
        return PyPdfiumPageBackend(self._pdoc[page_no])

    def is_valid(self) -> bool:
        """Return True when the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Close the pypdfium2 document and drop the reference to it.

        Calling this more than once is harmless.
        """
        if self._pdoc is not None:
            self._pdoc.close()
            self._pdoc = None
File without changes