docling-core 2.21.2__py3-none-any.whl → 2.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/__init__.py +6 -0
- docling_core/experimental/serializer/__init__.py +6 -0
- docling_core/experimental/serializer/base.py +227 -0
- docling_core/experimental/serializer/common.py +353 -0
- docling_core/experimental/serializer/markdown.py +461 -0
- docling_core/types/doc/document.py +779 -330
- docling_core/types/doc/page.py +1238 -0
- docling_core/types/doc/tokens.py +1 -0
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/METADATA +1 -1
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/RECORD +13 -7
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/LICENSE +0 -0
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/WHEEL +0 -0
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,1238 @@
|
|
|
1
|
+
"""Datastructures for PaginatedDocument."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import math
|
|
6
|
+
import re
|
|
7
|
+
import typing
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import (
|
|
11
|
+
Annotated,
|
|
12
|
+
Dict,
|
|
13
|
+
Iterator,
|
|
14
|
+
List,
|
|
15
|
+
Literal,
|
|
16
|
+
NamedTuple,
|
|
17
|
+
Optional,
|
|
18
|
+
Tuple,
|
|
19
|
+
Union,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
from PIL import Image as PILImage
|
|
24
|
+
from PIL import ImageColor, ImageDraw, ImageFont
|
|
25
|
+
from PIL.ImageFont import FreeTypeFont
|
|
26
|
+
from pydantic import AnyUrl, BaseModel, Field, model_validator
|
|
27
|
+
|
|
28
|
+
from docling_core.types.doc.base import BoundingBox, CoordOrigin
|
|
29
|
+
from docling_core.types.doc.document import ImageRef
|
|
30
|
+
|
|
31
|
+
# Module-level logger named after this module.
_logger = logging.getLogger(__name__)

# Integer alias validated by pydantic to be >= 1.
PageNumber = Annotated[int, Field(ge=1)]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class TextCellUnit(str, Enum):
    """Granularity at which text cells of a segmented page are addressed.

    The members are plain strings, so they compare equal to (and serialise
    as) ``"char"``, ``"word"`` and ``"line"``.
    """

    CHAR = "char"
    WORD = "word"
    LINE = "line"

    def __str__(self) -> str:
        """Return the raw member value instead of ``ClassName.MEMBER``."""
        raw = self.value
        return str(raw)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class PdfPageBoundaryType(str, Enum):
    """Names of the page boundary boxes a PDF page can be measured against.

    The members are plain strings and serialise as their snake_case value.
    """

    ART_BOX = "art_box"
    BLEED_BOX = "bleed_box"
    CROP_BOX = "crop_box"
    MEDIA_BOX = "media_box"
    TRIM_BOX = "trim_box"

    def __str__(self) -> str:
        """Return the raw member value instead of ``ClassName.MEMBER``."""
        raw = self.value
        return str(raw)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Integer alias validated by pydantic to lie in the inclusive range 0..255.
ColorChannelValue = Annotated[int, Field(ge=0, le=255)]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class ColorRGBA(BaseModel):
    """Colour with red, green, blue and alpha channels, each in 0..255.

    The alpha channel defaults to 255.
    """

    r: ColorChannelValue
    g: ColorChannelValue
    b: ColorChannelValue
    a: ColorChannelValue = 255

    def as_tuple(self) -> tuple[int, int, int, int]:
        """Return the channels as an ``(r, g, b, a)`` tuple."""
        red, green, blue, alpha = self.r, self.g, self.b, self.a
        return (red, green, blue, alpha)

    def __iter__(self):
        """Yield the four channel values in ``r, g, b, a`` order.

        This makes ``tuple(color)`` and unpacking work on the model directly.
        """
        yield from self.as_tuple()
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class Coord2D(NamedTuple):
    """Immutable point in the plane, addressable as ``(x, y)`` or by name."""

    # Horizontal component.
    x: float
    # Vertical component.
    y: float
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class BoundingRectangle(BaseModel):
    """Quadrilateral described by four corner points and a coordinate origin.

    The corners are stored as ``(r_x0, r_y0)`` .. ``(r_x3, r_y3)``. ``width``
    is measured along the edge from corner 0 to corner 1 and ``height`` along
    the edge from corner 0 to corner 3, so the shape does not have to be
    aligned with the page axes.
    """

    r_x0: float
    r_y0: float

    r_x1: float
    r_y1: float

    r_x2: float
    r_y2: float

    r_x3: float
    r_y3: float

    coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT

    def _axis_delta(self) -> Tuple[float, float]:
        """Return the vector from the midpoint of edge 0-3 to that of edge 1-2."""
        start_x = (self.r_x0 + self.r_x3) / 2.0
        start_y = (self.r_y0 + self.r_y3) / 2.0
        end_x = (self.r_x1 + self.r_x2) / 2.0
        end_y = (self.r_y1 + self.r_y2) / 2.0
        return end_x - start_x, end_y - start_y

    def _flipped(self, page_height: float, origin: CoordOrigin) -> "BoundingRectangle":
        """Return a copy with every y mirrored at ``page_height`` and a new origin."""
        return BoundingRectangle(
            r_x0=self.r_x0,
            r_x1=self.r_x1,
            r_x2=self.r_x2,
            r_x3=self.r_x3,
            r_y0=page_height - self.r_y0,
            r_y1=page_height - self.r_y1,
            r_y2=page_height - self.r_y2,
            r_y3=page_height - self.r_y3,
            coord_origin=origin,
        )

    @property
    def width(self) -> float:
        """Return the length of the edge between corner 0 and corner 1."""
        return math.hypot(self.r_x1 - self.r_x0, self.r_y1 - self.r_y0)

    @property
    def height(self) -> float:
        """Return the length of the edge between corner 0 and corner 3."""
        return math.hypot(self.r_x3 - self.r_x0, self.r_y3 - self.r_y0)

    @property
    def angle(self) -> float:
        """Return the inclination of the rectangle in radians.

        The angle is that of the line joining the midpoints of edge 0-3 and
        edge 1-2. Because it is derived with ``atan`` the result lies in
        ``[-pi/2, pi/2]``; a (near) vertical axis yields ``+pi/2`` when it
        points towards larger y and ``-pi/2`` otherwise.
        """
        delta_x, delta_y = self._axis_delta()

        if abs(delta_x) > 1.0e-3:
            return math.atan(delta_y / delta_x)
        # Vertical axis: avoid the division by (almost) zero.
        if delta_y > 0:
            return math.pi / 2.0
        return -math.pi / 2.0

    @property
    def angle_360(self) -> int:
        """Return the inclination of the rectangle in whole degrees.

        Returns 0 when the axis is (near) horizontal, 90 when it is (near)
        vertical, and otherwise the negated, rounded ``atan`` angle, which
        lies strictly between -90 and 90.

        NOTE(review): despite the name the value is not normalised to the
        0-360 range and the direction of the axis is not taken into account;
        kept as is because callers may rely on the current sign convention.
        """
        delta_x, delta_y = self._axis_delta()

        if abs(delta_y) < 1.0e-2:
            return 0
        if abs(delta_x) < 1.0e-2:
            return 90
        return round(-math.atan(delta_y / delta_x) / math.pi * 180)

    @property
    def centre(self) -> Tuple[float, float]:
        """Return the arithmetic mean of the four corners as ``(x, y)``."""
        return (self.r_x0 + self.r_x1 + self.r_x2 + self.r_x3) / 4.0, (
            self.r_y0 + self.r_y1 + self.r_y2 + self.r_y3
        ) / 4.0

    def to_bounding_box(self) -> BoundingBox:
        """Return the axis-aligned box enclosing all four corners.

        ``top``/``bottom`` follow the rectangle's coordinate origin: with a
        bottom-left origin the top is the largest y, otherwise the smallest.
        """
        xs = [self.r_x0, self.r_x1, self.r_x2, self.r_x3]
        ys = [self.r_y0, self.r_y1, self.r_y2, self.r_y3]

        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            top, bottom = max(ys), min(ys)
        else:
            top, bottom = min(ys), max(ys)

        return BoundingBox(
            l=min(xs),
            b=bottom,
            r=max(xs),
            t=top,
            coord_origin=self.coord_origin,
        )

    @classmethod
    def from_bounding_box(cls, bbox: BoundingBox) -> "BoundingRectangle":
        """Build a rectangle from an axis-aligned box.

        Corner 0 is ``(l, b)``, corner 1 ``(r, b)``, corner 2 ``(r, t)`` and
        corner 3 ``(l, t)``; the coordinate origin is taken from ``bbox``.
        """
        return cls(
            r_x0=bbox.l,
            r_y0=bbox.b,
            r_x2=bbox.r,
            r_y2=bbox.t,
            r_x1=bbox.r,
            r_y1=bbox.b,
            r_x3=bbox.l,
            r_y3=bbox.t,
            coord_origin=bbox.coord_origin,
        )

    def to_polygon(self) -> List[Coord2D]:
        """Return the corners 0..3, in that order, as a list of points."""
        return [
            Coord2D(self.r_x0, self.r_y0),
            Coord2D(self.r_x1, self.r_y1),
            Coord2D(self.r_x2, self.r_y2),
            Coord2D(self.r_x3, self.r_y3),
        ]

    def to_bottom_left_origin(self, page_height: float) -> "BoundingRectangle":
        """Return this rectangle expressed with a bottom-left origin.

        Args:
            page_height: The height of the page

        Returns:
            ``self`` when the origin is already bottom-left, otherwise a new
            BoundingRectangle whose y values are ``page_height - y``.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        return self._flipped(page_height, CoordOrigin.BOTTOMLEFT)

    def to_top_left_origin(self, page_height: float) -> "BoundingRectangle":
        """Return this rectangle expressed with a top-left origin.

        Args:
            page_height: The height of the page

        Returns:
            ``self`` when the origin is already top-left, otherwise a new
            BoundingRectangle whose y values are ``page_height - y``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        return self._flipped(page_height, CoordOrigin.TOPLEFT)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class OrderedElement(BaseModel):
    """Base model giving an element a position in an ordering."""

    # Ordering index; defaults to -1 when none was provided.
    index: int = -1
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
class ColorMixin(BaseModel):
    """Mixin model contributing an ``rgba`` colour field."""

    # Defaults to opaque black.
    rgba: ColorRGBA = ColorRGBA(r=0, g=0, b=0, a=255)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
class TextDirection(str, Enum):
    """Reading direction of a piece of text; members are plain strings."""

    LEFT_TO_RIGHT = "left_to_right"
    RIGHT_TO_LEFT = "right_to_left"
    UNSPECIFIED = "unspecified"
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
class TextCell(ColorMixin, OrderedElement):
    """Piece of text together with its location on the page.

    Inherits the ordering ``index`` and the ``rgba`` colour from its bases.
    """

    # Location of the cell on the page.
    rect: BoundingRectangle

    # NOTE(review): presumably `text` is the processed string and `orig` the
    # raw one -- confirm against the producer of these cells.
    text: str
    orig: str

    text_direction: TextDirection = TextDirection.LEFT_TO_RIGHT

    confidence: float = 1.0
    from_ocr: bool

    def to_bounding_box(self) -> BoundingBox:
        """Return the axis-aligned box enclosing the cell rectangle."""
        rectangle = self.rect
        return rectangle.to_bounding_box()

    def to_bottom_left_origin(self, page_height: float):
        """Replace ``rect`` by its bottom-left-origin equivalent, in place.

        Args:
            page_height: The height of the page
        """
        converted = self.rect.to_bottom_left_origin(page_height=page_height)
        self.rect = converted

    def to_top_left_origin(self, page_height: float):
        """Replace ``rect`` by its top-left-origin equivalent, in place.

        Args:
            page_height: The height of the page
        """
        converted = self.rect.to_top_left_origin(page_height=page_height)
        self.rect = converted
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class PdfCellRenderingMode(int, Enum):
    """Text rendering mode of a PDF text cell, according to PDF32000.

    Members are integers; ``UNKNOWN`` (-1) is an additional sentinel value.
    """

    FILL_TEXT = 0
    STROKE_TEXT = 1
    FILL_THEN_STROKE = 2
    INVISIBLE = 3
    FILL_AND_CLIPPING = 4
    STROKE_AND_CLIPPING = 5
    FILL_THEN_STROKE_AND_CLIPPING = 6
    ONLY_CLIPPING = 7
    UNKNOWN = -1
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
class PdfTextCell(TextCell):
    """Text cell extracted from a PDF, carrying font and rendering details."""

    # Text rendering mode as defined by PDF32000.
    rendering_mode: PdfCellRenderingMode
    # Determines if this belongs to a fillable PDF field.
    widget: bool

    font_key: str
    font_name: str

    # Narrowed from `bool`: the only accepted value for this class is False.
    from_ocr: Literal[False] = False

    @model_validator(mode="before")
    @classmethod
    def update_ltr_property(cls, data: dict) -> dict:
        """Translate a legacy ``left_to_right`` flag into ``text_direction``.

        Runs before field validation. When the raw input is a dict containing
        ``left_to_right``, ``text_direction`` is set to ``"left_to_right"`` for
        a truthy flag and ``"right_to_left"`` otherwise (overriding any
        ``text_direction`` already present). Any other input is passed
        through untouched so that pydantic reports the validation error.

        Args:
            data: Raw input handed to the model

        Returns:
            The input, or a shallow copy of it with ``text_direction`` set
        """
        if isinstance(data, dict) and "left_to_right" in data:
            # Work on a copy so the caller's dictionary is not modified.
            data = dict(data)
            data["text_direction"] = (
                "left_to_right" if data["left_to_right"] else "right_to_left"
            )
        return data
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
class BitmapResource(OrderedElement):
    """Bitmap placed on a page: its location plus an optional URI."""

    # Location of the bitmap on the page.
    rect: BoundingRectangle
    # Optional URI of the bitmap; None when not available.
    uri: Optional[AnyUrl] = None

    def to_bottom_left_origin(self, page_height: float):
        """Replace ``rect`` by its bottom-left-origin equivalent, in place.

        Args:
            page_height: The height of the page
        """
        converted = self.rect.to_bottom_left_origin(page_height=page_height)
        self.rect = converted

    def to_top_left_origin(self, page_height: float):
        """Replace ``rect`` by its top-left-origin equivalent, in place.

        Args:
            page_height: The height of the page
        """
        converted = self.rect.to_top_left_origin(page_height=page_height)
        self.rect = converted
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
class PdfLine(ColorMixin, OrderedElement):
    """Polyline of a PDF page, given as an ordered list of points."""

    parent_id: int
    points: List[Coord2D]
    # Stroke width of the line.
    width: float = 1.0

    coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT

    def __len__(self) -> int:
        """Return the number of points in the line."""
        return len(self.points)

    def iterate_segments(
        self,
    ) -> Iterator[Tuple[Coord2D, Coord2D]]:
        """Yield ``(start, end)`` for every pair of consecutive points.

        A line with fewer than two points yields nothing.
        """
        yield from zip(self.points, self.points[1:])

    def _flip_points(self, page_height: float, origin: CoordOrigin) -> None:
        """Mirror every point at ``page_height`` in place and set the origin."""
        # Slice assignment keeps the identity of the existing list.
        self.points[:] = [
            Coord2D(point[0], page_height - point[1]) for point in self.points
        ]
        self.coord_origin = origin

    def to_bottom_left_origin(self, page_height: float) -> "PdfLine":
        """Convert the line's coordinates to use bottom-left origin, in place.

        Args:
            page_height: The height of the page

        Returns:
            This line (the same object), whether or not a conversion was
            necessary.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            self._flip_points(page_height, CoordOrigin.BOTTOMLEFT)
        return self

    def to_top_left_origin(self, page_height: float) -> "PdfLine":
        """Convert the line's coordinates to use top-left origin, in place.

        Args:
            page_height: The height of the page

        Returns:
            This line (the same object), whether or not a conversion was
            necessary.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            self._flip_points(page_height, CoordOrigin.TOPLEFT)
        return self
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
class PageGeometry(BaseModel):
    """Geometry of a page: an angle and the rectangle describing the page."""

    angle: float
    rect: BoundingRectangle

    @property
    def width(self):
        """Return the width of the page rectangle."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        page_rect = self.rect
        return page_rect.width

    @property
    def height(self):
        """Return the height of the page rectangle."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        page_rect = self.rect
        return page_rect.height

    @property
    def origin(self):
        """Return ``(l, b)`` of the box enclosing the page rectangle."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        enclosing = self.rect.to_bounding_box()
        return (enclosing.l, enclosing.b)
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
class PdfPageGeometry(PageGeometry):
    """Page geometry of a PDF page, including its five boundary boxes.

    ``width``, ``height`` and ``origin`` are overridden to be derived from
    ``crop_bbox`` rather than from ``rect``.
    """

    boundary_type: PdfPageBoundaryType

    art_bbox: BoundingBox
    bleed_bbox: BoundingBox
    crop_bbox: BoundingBox
    media_bbox: BoundingBox
    trim_bbox: BoundingBox

    @property
    def width(self):
        """Return the width of the crop box."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        crop = self.crop_bbox
        return crop.width

    @property
    def height(self):
        """Return the height of the crop box."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        crop = self.crop_bbox
        return crop.height

    @property
    def origin(self):
        """Return ``(l, b)`` of the crop box."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        crop = self.crop_bbox
        return (crop.l, crop.b)
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
class SegmentedPage(BaseModel):
    """Page broken down into text cells (per unit), bitmaps and an image."""

    dimension: PageGeometry

    bitmap_resources: List[BitmapResource] = []

    # The same page text at three granularities.
    char_cells: List[TextCell] = []
    word_cells: List[TextCell] = []
    textline_cells: List[TextCell] = []

    image: Optional[ImageRef] = None

    def iterate_cells(self, unit_type: TextCellUnit) -> Iterator[TextCell]:
        """Yield the text cells stored for the requested unit.

        Args:
            unit_type: Type of text unit to iterate through

        Returns:
            Iterator over ``char_cells``, ``word_cells`` or
            ``textline_cells``, depending on ``unit_type``

        Raises:
            ValueError: If ``unit_type`` is none of the known units. As this
                is a generator, the error surfaces on first iteration.
        """
        if unit_type == TextCellUnit.CHAR:
            selected = self.char_cells
        elif unit_type == TextCellUnit.WORD:
            selected = self.word_cells
        elif unit_type == TextCellUnit.LINE:
            selected = self.textline_cells
        else:
            raise ValueError(f"incompatible {unit_type}")

        yield from selected
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
class SegmentedPdfPage(SegmentedPage):
|
|
506
|
+
"""Extended segmented page model specific to PDF documents."""
|
|
507
|
+
|
|
508
|
+
# Redefine typing to use PdfPageGeometry
|
|
509
|
+
dimension: PdfPageGeometry
|
|
510
|
+
|
|
511
|
+
lines: List[PdfLine] = []
|
|
512
|
+
|
|
513
|
+
# Redefine typing of elements to include PdfTextCell
|
|
514
|
+
char_cells: List[Union[PdfTextCell, TextCell]]
|
|
515
|
+
word_cells: List[Union[PdfTextCell, TextCell]]
|
|
516
|
+
textline_cells: List[Union[PdfTextCell, TextCell]]
|
|
517
|
+
|
|
518
|
+
def get_cells_in_bbox(
    self, cell_unit: TextCellUnit, bbox: BoundingBox, ios: float = 0.8
) -> List[Union[PdfTextCell, TextCell]]:
    """Collect the text cells that lie (mostly) inside a bounding box.

    A cell is kept when the ``intersection_over_self`` of its own bounding
    box with ``bbox`` is strictly greater than ``ios``. No coordinate-origin
    conversion is performed here.

    Args:
        cell_unit: Type of text unit to check
        bbox: Bounding box to check against
        ios: Minimum intersection over self ratio (exclusive)

    Returns:
        Matching cells, in iteration order
    """
    return [
        candidate
        for candidate in self.iterate_cells(cell_unit)
        if candidate.to_bounding_box().intersection_over_self(bbox) > ios
    ]
|
|
538
|
+
|
|
539
|
+
def export_to_dict(self) -> Dict:
    """Serialise the page into a JSON-compatible dictionary.

    Aliases are used for field names and fields whose value is ``None`` are
    left out.

    Returns:
        Dictionary representation of the page
    """
    dump_options = {"mode": "json", "by_alias": True, "exclude_none": True}
    return self.model_dump(**dump_options)
|
|
546
|
+
|
|
547
|
+
def save_as_json(
    self,
    filename: Path,
    indent: int = 2,
):
    """Write the exported page dictionary to a UTF-8 encoded JSON file.

    Args:
        filename: Path to save the JSON file
        indent: Indentation level for JSON formatting
    """
    payload = self.export_to_dict()
    with open(filename, mode="w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=indent)
|
|
561
|
+
|
|
562
|
+
@classmethod
def load_from_json(cls, filename: Path) -> "SegmentedPdfPage":
    """Read a UTF-8 encoded JSON file and validate it into a page object.

    Args:
        filename: Path to the JSON file

    Returns:
        Instance of the class the method was called on
    """
    with open(filename, mode="r", encoding="utf-8") as handle:
        raw_json = handle.read()
    return cls.model_validate_json(raw_json)
|
|
574
|
+
|
|
575
|
+
def crop_text(
    self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0
) -> str:
    """Extract the text of all cells lying entirely within a bounding box.

    Each cell rectangle is converted to bottom-left origin before its
    enclosing box is compared with ``bbox``; ``bbox`` itself is used as
    given (assumed to be in bottom-left origin as well -- TODO confirm with
    callers). Selected cells are ordered by their ``index``. Two consecutive
    cells are joined without a separator when the first corner of the
    current cell is within ``eps`` of the second corner of the previous one
    in both x and y; otherwise a single space is inserted.

    Args:
        cell_unit: Type of text unit to extract
        bbox: Bounding box to extract from
        eps: Epsilon value for position comparison

    Returns:
        The concatenated text, or an empty string when no cell matches
    """
    page_height = self.dimension.height

    selection = []
    for page_cell in self.iterate_cells(cell_unit):
        cell_bbox = page_cell.rect.to_bottom_left_origin(
            page_height=page_height
        ).to_bounding_box()

        if (
            bbox.l <= cell_bbox.l
            and cell_bbox.r <= bbox.r
            and bbox.b <= cell_bbox.b
            and cell_bbox.t <= bbox.t
        ):
            # Cells are only read below, so no copy is needed.
            selection.append(page_cell)

    selection.sort(key=lambda cell: cell.index)

    parts: List[str] = []
    prev = None
    for cell in selection:
        if prev is not None:
            adjacent = (
                abs(cell.rect.r_x0 - prev.rect.r_x1) < eps
                and abs(cell.rect.r_y0 - prev.rect.r_y1) < eps
            )
            if not adjacent:
                parts.append(" ")
        parts.append(cell.text)
        prev = cell

    return "".join(parts)
|
|
615
|
+
|
|
616
|
+
def export_to_textlines(
    self,
    cell_unit: TextCellUnit,
    add_location: bool = True,
    add_fontkey: bool = False,
    add_fontname: bool = True,
) -> List[str]:
    """Render every text cell of a unit as one formatted line of text.

    A line consists of, in this order and each only if requested: the four
    corner coordinates of the cell rectangle, the font key and the font
    name (both right-aligned to 10 characters and only available for
    ``PdfTextCell`` instances), followed by the cell text.

    Args:
        cell_unit: Type of text unit to export
        add_location: Whether to include position information
        add_fontkey: Whether to include font key information
        add_fontname: Whether to include font name information

    Returns:
        List of formatted text lines, one per cell
    """
    rendered: List[str] = []
    for cell in self.iterate_cells(cell_unit):
        pieces = []

        if add_location:
            pieces.extend(
                (
                    f"({cell.rect.r_x0:06.02f}, {cell.rect.r_y0:06.02f}) ",
                    f"({cell.rect.r_x1:06.02f}, {cell.rect.r_y1:06.02f}) ",
                    f"({cell.rect.r_x2:06.02f}, {cell.rect.r_y2:06.02f}) ",
                    f"({cell.rect.r_x3:06.02f}, {cell.rect.r_y3:06.02f}) ",
                )
            )

        if add_fontkey and isinstance(cell, PdfTextCell):
            pieces.append(f"{cell.font_key:>10} ")

        if add_fontname and isinstance(cell, PdfTextCell):
            pieces.append(f"{cell.font_name:>10} ")

        pieces.append(f"{cell.text}")
        rendered.append("".join(pieces))

    return rendered
|
|
654
|
+
|
|
655
|
+
def render_as_image(
    self,
    cell_unit: TextCellUnit,
    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,  # media_box
    draw_cells_bbox: bool = False,
    draw_cells_text: bool = True,
    draw_cells_bl: bool = False,
    draw_cells_tr: bool = False,
    cell_outline: str = "black",
    cell_color: str = "cyan",
    cell_alpha: float = 1.0,
    cell_bl_color: str = "red",
    cell_bl_outline: str = "red",
    cell_bl_alpha: float = 1.0,
    cell_bl_radius: float = 3.0,
    cell_tr_color: str = "green",
    cell_tr_outline: str = "green",
    cell_tr_alpha: float = 1.0,
    cell_tr_radius: float = 3.0,
    draw_bitmap_resources: bool = True,
    bitmap_resources_outline: str = "black",
    bitmap_resources_fill: str = "yellow",
    bitmap_resources_alpha: float = 1.0,
    draw_lines: bool = True,
    line_color: str = "black",
    line_width: int = 1,
    line_alpha: float = 1.0,
    draw_annotations: bool = True,
    annotations_outline: str = "white",
    annotations_color: str = "green",
    annotations_alpha: float = 0.5,
    draw_crop_box: bool = True,
    cropbox_outline: str = "red",
    cropbox_width: int = 3,
    cropbox_alpha: float = 1.0,
) -> PILImage.Image:
    """Render the page as an image with various visualization options.

    The canvas is a white RGBA image sized after the crop box of the page.
    Layers are drawn in this order: bitmap resources, cell text (or, only
    when text drawing is disabled, cell bounding boxes), bottom-left cell
    markers, top-right cell markers, lines. Alpha values outside [0, 1] are
    logged as errors and clamped into that range.

    Args:
        cell_unit: Type of text unit to render
        boundary_type: Type of page boundary to use (currently unused; the
            crop box is always taken)
        draw_cells_bbox: Whether to draw bounding boxes for cells; only
            honoured when ``draw_cells_text`` is False
        draw_cells_text: Whether to draw text content of cells
        draw_cells_bl: Whether to draw bottom left points of cells
        draw_cells_tr: Whether to draw top right points of cells
        cell_outline: Color for cell outlines
        cell_color: Fill color for cells
        cell_alpha: Alpha value for cell visualization
        cell_bl_color: Color for bottom left points
        cell_bl_outline: Outline color for bottom left points
        cell_bl_alpha: Alpha value for bottom left points
        cell_bl_radius: Radius for bottom left points
        cell_tr_color: Color for top right points
        cell_tr_outline: Outline color for top right points
        cell_tr_alpha: Alpha value for top right points
        cell_tr_radius: Radius for top right points
        draw_bitmap_resources: Whether to draw bitmap resources
        bitmap_resources_outline: Outline color for bitmap resources
        bitmap_resources_fill: Fill color for bitmap resources
        bitmap_resources_alpha: Alpha value for bitmap resources
        draw_lines: Whether to draw lines
        line_color: Color for lines
        line_width: Width for lines
        line_alpha: Alpha value for lines
        draw_annotations: Whether to draw annotations (currently unused)
        annotations_outline: Outline color for annotations (currently unused)
        annotations_color: Fill color for annotations (currently unused)
        annotations_alpha: Alpha value for annotations (only validated)
        draw_crop_box: Whether to draw crop box (currently unused)
        cropbox_outline: Color for crop box outline (currently unused)
        cropbox_width: Width for crop box outline (currently unused)
        cropbox_alpha: Alpha value for crop box (only validated)

    Returns:
        PIL Image of the rendered page
    """

    def _clamp_alpha(value: float) -> float:
        """Log and clamp an alpha value lying outside [0, 1]."""
        if value < 0 or 1.0 < value:
            _logger.error("alpha value %s needs to be in [0, 1]", value)
            return max(0.0, min(1.0, value))
        return value

    # Rebind every alpha so that the clamped value is what gets drawn with.
    cell_alpha = _clamp_alpha(cell_alpha)
    cell_bl_alpha = _clamp_alpha(cell_bl_alpha)
    cell_tr_alpha = _clamp_alpha(cell_tr_alpha)
    bitmap_resources_alpha = _clamp_alpha(bitmap_resources_alpha)
    line_alpha = _clamp_alpha(line_alpha)
    # Not used for drawing in this method; checked for the log message only.
    _clamp_alpha(annotations_alpha)
    _clamp_alpha(cropbox_alpha)

    page_bbox = self.dimension.crop_bbox

    page_width = page_bbox.width
    page_height = page_bbox.height

    # Create a blank white image with RGBA mode
    result = PILImage.new(
        "RGBA", (round(page_width), round(page_height)), (255, 255, 255, 255)
    )
    draw = ImageDraw.Draw(result)

    # Draw each rectangle by connecting its four points
    if draw_bitmap_resources:
        draw = self._render_bitmap_resources(
            draw=draw,
            page_height=page_height,
            bitmap_resources_fill=bitmap_resources_fill,
            bitmap_resources_outline=bitmap_resources_outline,
            bitmap_resources_alpha=bitmap_resources_alpha,
        )

    # NOTE(review): text and bounding boxes are mutually exclusive here, so
    # with the default draw_cells_text=True the boxes are never drawn.
    if draw_cells_text:
        result = self._render_cells_text(
            cell_unit=cell_unit, img=result, page_height=page_height
        )

    elif draw_cells_bbox:
        self._render_cells_bbox(
            cell_unit=cell_unit,
            draw=draw,
            page_height=page_height,
            cell_fill=cell_color,
            cell_outline=cell_outline,
            cell_alpha=cell_alpha,
        )

    if draw_cells_bl:
        self._draw_cells_bl(
            cell_unit=cell_unit,
            draw=draw,
            page_height=page_height,
            cell_bl_color=cell_bl_color,
            cell_bl_outline=cell_bl_outline,
            cell_bl_alpha=cell_bl_alpha,
            cell_bl_radius=cell_bl_radius,
        )

    if draw_cells_tr:
        self._draw_cells_tr(
            cell_unit=cell_unit,
            draw=draw,
            page_height=page_height,
            cell_tr_color=cell_tr_color,
            cell_tr_outline=cell_tr_outline,
            cell_tr_alpha=cell_tr_alpha,
            cell_tr_radius=cell_tr_radius,
        )

    if draw_lines:
        draw = self._render_lines(
            draw=draw,
            page_height=page_height,
            line_color=line_color,
            line_alpha=line_alpha,
            line_width=line_width,
        )

    return result
|
|
812
|
+
|
|
813
|
+
def _get_rgba(self, name: str, alpha: float) -> Tuple[int, int, int, int]:
    """Get RGBA tuple from color name and alpha value.

    The colour is resolved with ``ImageColor.getrgb``. Only its red, green
    and blue components are used, so a colour specification that carries
    its own alpha still yields a 4-tuple whose alpha comes from ``alpha``.

    Args:
        name: Color name
        alpha: Alpha value between 0 and 1

    Returns:
        RGBA tuple with the alpha scaled to ``int(alpha * 255)``

    Raises:
        AssertionError: If alpha is out of range
    """
    # Raised explicitly (instead of `assert`) so the check survives `-O`.
    if not 0.0 <= alpha <= 1.0:
        raise AssertionError("0.0 <= alpha and alpha <= 1.0")

    red, green, blue = ImageColor.getrgb(name)[:3]
    return (red, green, blue, int(alpha * 255))
|
|
829
|
+
|
|
830
|
+
def _render_bitmap_resources(
    self,
    draw: ImageDraw.ImageDraw,
    page_height: float,
    bitmap_resources_fill: str,
    bitmap_resources_outline: str,
    bitmap_resources_alpha: float,
) -> ImageDraw.ImageDraw:
    """Render bitmap resources on the page.

    Each entry of ``self.bitmap_resources`` is drawn as a filled, outlined
    polygon taken from its rectangle converted to top-left origin.

    Args:
        draw: PIL ImageDraw object
        page_height: Height of the page
        bitmap_resources_fill: Fill color for bitmap resources
        bitmap_resources_outline: Outline color for bitmap resources
        bitmap_resources_alpha: Alpha value for bitmap resources

    Returns:
        Updated ImageDraw object

    Raises:
        AssertionError: If bitmap_resources_alpha is out of range
            (raised by ``_get_rgba``)
    """
    # The colors do not depend on the individual resource, so resolve them
    # once up front rather than once per loop iteration.
    fill = self._get_rgba(name=bitmap_resources_fill, alpha=bitmap_resources_alpha)
    outline = self._get_rgba(
        name=bitmap_resources_outline, alpha=bitmap_resources_alpha
    )

    for bitmap_resource in self.bitmap_resources:
        poly = bitmap_resource.rect.to_top_left_origin(
            page_height=page_height
        ).to_polygon()
        draw.polygon(poly, outline=outline, fill=fill)

    return draw
|
|
865
|
+
|
|
866
|
+
def _render_cells_bbox(
    self,
    cell_unit: TextCellUnit,
    draw: ImageDraw.ImageDraw,
    page_height: float,
    cell_fill: str,
    cell_outline: str,
    cell_alpha: float,
) -> ImageDraw.ImageDraw:
    """Draw the rectangle of every text cell as a filled, outlined polygon.

    Args:
        cell_unit: Type of text unit whose cells are drawn
        draw: PIL ImageDraw object to draw with
        page_height: Page height used for the top-left origin conversion
        cell_fill: Fill color for the cell polygons
        cell_outline: Outline color for the cell polygons
        cell_alpha: Alpha value applied to both colors

    Returns:
        The ImageDraw object that was passed in
    """
    # Same alpha for both colors; resolved once for all cells.
    colors = {
        "fill": self._get_rgba(name=cell_fill, alpha=cell_alpha),
        "outline": self._get_rgba(name=cell_outline, alpha=cell_alpha),
    }

    for cell in self.iterate_cells(unit_type=cell_unit):
        top_left_rect = cell.rect.to_top_left_origin(page_height=page_height)
        draw.polygon(top_left_rect.to_polygon(), **colors)

    return draw
|
|
899
|
+
|
|
900
|
+
def _draw_text_in_rectangle(
    self,
    img: PILImage.Image,
    rect: BoundingRectangle,
    text: str,
    font: Optional[Union[FreeTypeFont, ImageFont.ImageFont]] = None,
    fill: str = "black",
) -> PILImage.Image:
    """Draw text within a rectangular boundary with rotation.

    The text is rendered on an opaque white tile, stretched to the rounded
    width and height of ``rect``, rotated by ``rect.angle_360`` and pasted so
    that the tile is centred on ``rect.centre``.

    Args:
        img: PIL Image to draw on
        rect: Rectangle defining the text boundary
        text: Text content to draw
        font: Font to use for drawing text; PIL's default font when None
        fill: Text color

    Returns:
        The image that was passed in, updated in place
    """
    width = round(rect.width)
    height = round(rect.height)
    rot_angle = rect.angle_360

    centre = rect.centre
    centre_x, centre_y = round(centre[0]), round(centre[1])

    # Rectangles of at most 2 units in either direction are skipped; the
    # image is returned untouched.
    if width <= 2 or height <= 2:
        return img

    # Use the default font if no font is provided
    if font is None:
        font = ImageFont.load_default()

    # Measure the text on a throw-away 1x1 canvas.
    tmp_img = PILImage.new("RGBA", (1, 1), (255, 255, 255, 0))
    tmp_draw = ImageDraw.Draw(tmp_img)
    _, _, text_width, text_height = tmp_draw.textbbox((0, 0), text=text, font=font)

    # Render the text at its natural size on an opaque white tile.
    text_img = PILImage.new(
        "RGBA", (round(text_width), round(text_height)), (255, 255, 255, 255)
    )
    text_draw = ImageDraw.Draw(text_img)
    # Honour the caller-supplied color; previously the `fill` argument was
    # ignored and the text was always black (still the default).
    text_draw.text((0, 0), text, font=font, fill=fill)

    # Stretch the tile to the rectangle's size, then rotate it.
    text_img = text_img.resize((width, height), PILImage.Resampling.LANCZOS)
    rotated_img = text_img.rotate(rot_angle, expand=True)

    # expand=True may enlarge the tile, so centre the rotated size on the
    # rectangle's centre.
    rotated_w, rotated_h = rotated_img.size
    paste_x = centre_x - rotated_w // 2
    paste_y = centre_y - rotated_h // 2

    # The rotated tile is also passed as the paste mask.
    img.paste(rotated_img, (paste_x, paste_y), rotated_img)

    return img
|
|
965
|
+
|
|
966
|
+
def _render_cells_text(
    self, cell_unit: TextCellUnit, img: PILImage.Image, page_height: float
) -> PILImage.Image:
    """Draw the text of every cell of the given unit into its rectangle.

    Args:
        cell_unit: Type of text unit whose cells are rendered
        img: PIL Image to draw on
        page_height: Page height used for the top-left origin conversion

    Returns:
        The image returned by the last ``_draw_text_in_rectangle`` call, or
        ``img`` itself when there are no cells
    """
    canvas = img

    # One text tile per cell, placed via the cell's top-left-origin rectangle.
    for cell in self.iterate_cells(unit_type=cell_unit):
        canvas = self._draw_text_in_rectangle(
            img=canvas,
            rect=cell.rect.to_top_left_origin(page_height=page_height),
            text=cell.text,
        )

    return canvas
|
|
989
|
+
|
|
990
|
+
def _draw_cells_bl(
    self,
    cell_unit: TextCellUnit,
    draw: ImageDraw.ImageDraw,
    page_height: float,
    cell_bl_color: str,
    cell_bl_outline: str,
    cell_bl_alpha: float,
    cell_bl_radius: float,
) -> ImageDraw.ImageDraw:
    """Mark the bottom-left point of every text cell with a dot.

    The dot is centred on the first vertex of the cell's polygon.
    NOTE(review): presumably that vertex is the bottom-left corner; confirm
    against ``to_polygon``.

    Args:
        cell_unit: Type of text unit whose cells are marked
        draw: PIL ImageDraw object to draw with
        page_height: Page height used for the top-left origin conversion
        cell_bl_color: Fill color of the dots
        cell_bl_outline: Outline color of the dots
        cell_bl_alpha: Alpha value applied to both colors
        cell_bl_radius: Radius of the dots

    Returns:
        The ImageDraw object that was passed in
    """
    fill = self._get_rgba(name=cell_bl_color, alpha=cell_bl_alpha)
    outline = self._get_rgba(name=cell_bl_outline, alpha=cell_bl_alpha)
    radius = cell_bl_radius

    for cell in self.iterate_cells(unit_type=cell_unit):
        corners = cell.rect.to_top_left_origin(page_height=page_height).to_polygon()
        anchor_x, anchor_y = corners[0][0], corners[0][1]

        # Square of side 2 * radius centred on the anchor vertex.
        dot_bbox = [
            (anchor_x - radius, anchor_y - radius),
            (anchor_x + radius, anchor_y + radius),
        ]
        draw.ellipse(dot_bbox, fill=fill, outline=outline)

    return draw
|
|
1032
|
+
|
|
1033
|
+
def _draw_cells_tr(
    self,
    cell_unit: TextCellUnit,
    draw: ImageDraw.ImageDraw,
    page_height: float,
    cell_tr_color: str,
    cell_tr_outline: str,
    cell_tr_alpha: float,
    cell_tr_radius: float,
) -> ImageDraw.ImageDraw:
    """Draw top-right points of text cells.

    Args:
        cell_unit: Type of text unit to render
        draw: PIL ImageDraw object
        page_height: Height of the page
        cell_tr_color: Fill color for top-right points
        cell_tr_outline: Outline color for top-right points
        cell_tr_alpha: Alpha value for top-right points
        cell_tr_radius: Radius for top-right points

    Returns:
        Updated ImageDraw object
    """
    fill = self._get_rgba(name=cell_tr_color, alpha=cell_tr_alpha)
    outline = self._get_rgba(name=cell_tr_outline, alpha=cell_tr_alpha)

    for page_cell in self.iterate_cells(unit_type=cell_unit):
        poly = page_cell.rect.to_top_left_origin(
            page_height=page_height
        ).to_polygon()

        # _draw_cells_bl marks poly[0]; this method previously marked the very
        # same vertex. The vertex diagonally opposite index 0 is index 2.
        # NOTE(review): assumes to_polygon() returns the four corners in
        # order around the rectangle - confirm.
        corner_x, corner_y = poly[2][0], poly[2][1]

        # Square of side 2 * radius centred on the corner.
        dot_bbox = [
            (corner_x - cell_tr_radius, corner_y - cell_tr_radius),
            (corner_x + cell_tr_radius, corner_y + cell_tr_radius),
        ]

        draw.ellipse(dot_bbox, fill=fill, outline=outline)

    return draw
|
|
1075
|
+
|
|
1076
|
+
def _render_lines(
    self,
    draw: ImageDraw.ImageDraw,
    page_height: float,
    line_color: str,
    line_alpha: float,
    line_width: float,
) -> ImageDraw.ImageDraw:
    """Render lines on the page.

    Args:
        draw: PIL ImageDraw object
        page_height: Height of the page
        line_color: Color for lines
        line_alpha: Alpha value for lines
        line_width: Width for lines.
            NOTE(review): currently not used - every line is stroked with
            its own ``width`` attribute; confirm which one is intended.

    Returns:
        Updated ImageDraw object
    """
    fill = self._get_rgba(name=line_color, alpha=line_alpha)

    for line in self.lines:
        # The conversion result used to be discarded. Use the returned object
        # when there is one, and fall back to the original line otherwise.
        # NOTE(review): whether to_top_left_origin() converts in place or
        # returns a converted line is not visible here - confirm.
        converted = line.to_top_left_origin(page_height=page_height)
        if converted is not None:
            line = converted

        # Stroke at least one pixel wide, whatever the line's own width.
        stroke_width = max(1, round(line.width))

        for segment in line.iterate_segments():
            draw.line(
                (segment[0][0], segment[0][1], segment[1][0], segment[1][1]),
                fill=fill,
                width=stroke_width,
            )

    return draw
|
|
1110
|
+
|
|
1111
|
+
|
|
1112
|
+
class PdfMetaData(BaseModel):
    """PDF metadata: the raw XML text plus the tag/value pairs found in it."""

    # Raw XML text scanned by initialise().
    xml: str = ""

    # Tag name (without namespace) -> element text, filled by initialise().
    data: Dict[str, str] = {}

    def initialise(self):
        """Fill ``data`` from the ``<ns:tag>text</ns:tag>`` elements in ``xml``.

        Only elements whose closing namespace and tag equal the opening ones
        are stored; a later element with the same tag overwrites an earlier
        one.
        """
        pattern = r"\<([a-zA-Z]+)\:([a-zA-Z]+)\>(.+?)\<\/([a-zA-Z]+)\:([a-zA-Z]+)\>"

        for found in re.finditer(pattern, self.xml):
            namespace_open, tag_open, content, namespace_close, tag_close = (
                found.groups()
            )

            # Skip matches where the closing element differs from the opening one.
            if (namespace_open, tag_open) != (namespace_close, tag_close):
                continue

            _logger.debug(
                f"Namespace: {namespace_open}, Tag: {tag_open}, Content: {content}"
            )
            self.data[tag_open] = content
|
|
1135
|
+
|
|
1136
|
+
|
|
1137
|
+
class PdfTableOfContents(BaseModel):
    """A table-of-contents entry of a PDF, with its nested child entries."""

    text: str
    orig: str = ""

    marker: str = ""

    # Nested entries below this one.
    children: List["PdfTableOfContents"] = []

    def export_to_dict(self, mode: str = "json") -> Dict:
        """Dump this entry as a dictionary.

        Fields are dumped by alias and ``None`` values are left out.

        Args:
            mode: Serialization mode handed to ``model_dump``

        Returns:
            Dictionary form of the entry
        """
        dump_options = {"mode": mode, "by_alias": True, "exclude_none": True}
        return self.model_dump(**dump_options)

    def save_as_json(self, filename: Path, indent: int = 2):
        """Write this entry to a UTF-8 encoded JSON file.

        Args:
            filename: Destination path of the JSON file
            indent: Indentation level used for the JSON text
        """
        serialized = json.dumps(self.export_to_dict(), indent=indent)
        with open(filename, "w", encoding="utf-8") as fw:
            fw.write(serialized)

    @classmethod
    def load_from_json(cls, filename: Path) -> "PdfTableOfContents":
        """Read a UTF-8 encoded JSON file and validate it into an entry.

        Args:
            filename: Path of the JSON file to read

        Returns:
            The validated PdfTableOfContents instance
        """
        with open(filename, "r", encoding="utf-8") as fr:
            payload = fr.read()
        return cls.model_validate_json(payload)
|
|
1181
|
+
|
|
1182
|
+
|
|
1183
|
+
class ParsedPdfDocument(BaseModel):
    """A parsed PDF: its pages plus optional metadata and table of contents."""

    # Page number -> parsed page.
    pages: Dict[PageNumber, SegmentedPdfPage] = {}

    meta_data: Optional[PdfMetaData] = None
    table_of_contents: Optional[PdfTableOfContents] = None

    def iterate_pages(
        self,
    ) -> Iterator[Tuple[int, SegmentedPdfPage]]:
        """Yield every page together with its page number.

        Returns:
            Iterator of (page number, page) tuples, in the order of ``pages``
        """
        yield from self.pages.items()

    def export_to_dict(
        self,
        mode: str = "json",
    ) -> Dict:
        """Dump the document as a dictionary.

        Fields are dumped by alias and ``None`` values are left out.

        Args:
            mode: Serialization mode handed to ``model_dump``

        Returns:
            Dictionary form of the document
        """
        dump_options = {"mode": mode, "by_alias": True, "exclude_none": True}
        return self.model_dump(**dump_options)

    def save_as_json(self, filename: Path, indent: int = 2):
        """Write the document to a UTF-8 encoded JSON file.

        Args:
            filename: Destination path of the JSON file
            indent: Indentation level used for the JSON text
        """
        serialized = json.dumps(self.export_to_dict(), indent=indent)
        with open(filename, "w", encoding="utf-8") as fw:
            fw.write(serialized)

    @classmethod
    def load_from_json(cls, filename: Path) -> "ParsedPdfDocument":
        """Read a UTF-8 encoded JSON file and validate it into a document.

        Args:
            filename: Path of the JSON file to read

        Returns:
            The validated ParsedPdfDocument instance
        """
        with open(filename, "r", encoding="utf-8") as fr:
            payload = fr.read()
        return cls.model_validate_json(payload)
|