docling-core 2.22.0__py3-none-any.whl → 2.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -0,0 +1,1238 @@
1
+ """Datastructures for PaginatedDocument."""
2
+
3
+ import json
4
+ import logging
5
+ import math
6
+ import re
7
+ import typing
8
+ from enum import Enum
9
+ from pathlib import Path
10
+ from typing import (
11
+ Annotated,
12
+ Dict,
13
+ Iterator,
14
+ List,
15
+ Literal,
16
+ NamedTuple,
17
+ Optional,
18
+ Tuple,
19
+ Union,
20
+ )
21
+
22
+ import numpy as np
23
+ from PIL import Image as PILImage
24
+ from PIL import ImageColor, ImageDraw, ImageFont
25
+ from PIL.ImageFont import FreeTypeFont
26
+ from pydantic import AnyUrl, BaseModel, Field, model_validator
27
+
28
+ from docling_core.types.doc.base import BoundingBox, CoordOrigin
29
+ from docling_core.types.doc.document import ImageRef
30
+
31
+ _logger = logging.getLogger(__name__)
32
+
33
+ PageNumber = typing.Annotated[int, Field(ge=1)]
34
+
35
+
36
class TextCellUnit(str, Enum):
    """Granularity at which the text cells of a segmented page are addressed."""

    CHAR = "char"
    WORD = "word"
    LINE = "line"

    def __str__(self) -> str:
        """Render the member as its plain string value (e.g. ``"char"``)."""
        return f"{self.value}"
46
+
47
+
48
class PdfPageBoundaryType(str, Enum):
    """Names of the page boundary boxes a PDF page can be measured against."""

    ART_BOX = "art_box"
    BLEED_BOX = "bleed_box"
    CROP_BOX = "crop_box"
    MEDIA_BOX = "media_box"
    TRIM_BOX = "trim_box"

    def __str__(self) -> str:
        """Render the member as its plain string value (e.g. ``"crop_box"``)."""
        return f"{self.value}"
60
+
61
+
62
+ ColorChannelValue = Annotated[int, Field(ge=0, le=255)]
63
+
64
+
65
class ColorRGBA(BaseModel):
    """RGBA color whose channels are integers constrained to 0-255."""

    r: ColorChannelValue
    g: ColorChannelValue
    b: ColorChannelValue
    a: ColorChannelValue = 255  # alpha channel; 255 unless given explicitly

    def as_tuple(self) -> tuple[int, int, int, int]:
        """Return the channels as an ``(r, g, b, a)`` tuple."""
        channels = (self.r, self.g, self.b, self.a)
        return channels

    def __iter__(self):
        """Yield the channels in ``r, g, b, a`` order."""
        for channel in (self.r, self.g, self.b, self.a):
            yield channel
80
+
81
+
82
class Coord2D(NamedTuple):
    """Point in the plane, stored as an ``(x, y)`` named tuple."""

    x: float  # horizontal component
    y: float  # vertical component
87
+
88
+
89
class BoundingRectangle(BaseModel):
    """Model representing a rectangular boundary with four corner points.

    The corners are stored as ``(r_x0, r_y0)`` through ``(r_x3, r_y3)``.
    Width is measured along the edge from corner 0 to corner 1 and height
    along the edge from corner 0 to corner 3, so the rectangle does not have
    to be axis-aligned. ``from_bounding_box`` lays the corners out as
    ``(l, b)``, ``(r, b)``, ``(r, t)``, ``(l, t)``.
    """

    r_x0: float
    r_y0: float

    r_x1: float
    r_y1: float

    r_x2: float
    r_y2: float

    r_x3: float
    r_y3: float

    coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT

    @property
    def width(self) -> float:
        """Calculate the width: the distance from corner 0 to corner 1."""
        return np.sqrt((self.r_x1 - self.r_x0) ** 2 + (self.r_y1 - self.r_y0) ** 2)

    @property
    def height(self) -> float:
        """Calculate the height: the distance from corner 0 to corner 3."""
        return np.sqrt((self.r_x3 - self.r_x0) ** 2 + (self.r_y3 - self.r_y0) ** 2)

    @property
    def angle(self) -> float:
        """Calculate the angle of the rectangle in radians.

        The angle is that of the line joining the midpoint of edge 0-3 to the
        midpoint of edge 1-2. Because ``math.atan`` is used, the result lies
        in ``[-pi/2, pi/2]``; a (near-)vertical line yields exactly
        ``+pi/2`` or ``-pi/2`` depending on the sign of the y difference.
        """
        p_0 = ((self.r_x0 + self.r_x3) / 2.0, (self.r_y0 + self.r_y3) / 2.0)
        p_1 = ((self.r_x1 + self.r_x2) / 2.0, (self.r_y1 + self.r_y2) / 2.0)

        delta_x, delta_y = p_1[0] - p_0[0], p_1[1] - p_0[1]

        if abs(delta_x) > 1.0e-3:
            return math.atan(delta_y / delta_x)
        elif delta_y > 0:
            # Use the exact constant; the previous literal 3.142592 was a
            # mistyped approximation of pi.
            return math.pi / 2.0
        else:
            return -math.pi / 2.0

    @property
    def angle_360(self) -> int:
        """Calculate the rotation of the rectangle in whole degrees.

        Returns 0 when the midline (see ``angle``) is nearly horizontal, 90
        when it is nearly vertical, and otherwise the negated arctangent of
        its slope rounded to the nearest degree.

        NOTE(review): despite the name, this computation can only produce
        values in ``[-90, 90]``, not a full 0-360 range.
        """
        p_0 = ((self.r_x0 + self.r_x3) / 2.0, (self.r_y0 + self.r_y3) / 2.0)
        p_1 = ((self.r_x1 + self.r_x2) / 2.0, (self.r_y1 + self.r_y2) / 2.0)

        delta_x, delta_y = p_1[0] - p_0[0], p_1[1] - p_0[1]

        if abs(delta_y) < 1.0e-2:
            return 0
        elif abs(delta_x) < 1.0e-2:
            return 90
        else:
            return round(-math.atan(delta_y / delta_x) / np.pi * 180)

    @property
    def centre(self):
        """Calculate the center point as the mean of the four corners."""
        return (self.r_x0 + self.r_x1 + self.r_x2 + self.r_x3) / 4.0, (
            self.r_y0 + self.r_y1 + self.r_y2 + self.r_y3
        ) / 4.0

    def to_bounding_box(self) -> BoundingBox:
        """Convert to the axis-aligned BoundingBox enclosing all four corners.

        Which y extreme counts as "top" depends on ``coord_origin``: the
        largest y for a bottom-left origin, the smallest y otherwise.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            top = max([self.r_y0, self.r_y1, self.r_y2, self.r_y3])
            bottom = min([self.r_y0, self.r_y1, self.r_y2, self.r_y3])
        else:
            top = min([self.r_y0, self.r_y1, self.r_y2, self.r_y3])
            bottom = max([self.r_y0, self.r_y1, self.r_y2, self.r_y3])

        left = min([self.r_x0, self.r_x1, self.r_x2, self.r_x3])
        right = max([self.r_x0, self.r_x1, self.r_x2, self.r_x3])

        return BoundingBox(
            l=left,
            b=bottom,
            r=right,
            t=top,
            coord_origin=self.coord_origin,
        )

    @classmethod
    def from_bounding_box(cls, bbox: BoundingBox) -> "BoundingRectangle":
        """Convert a BoundingBox into a BoundingRectangle.

        Corners are assigned as 0=(l, b), 1=(r, b), 2=(r, t), 3=(l, t) and
        the box's coordinate origin is carried over.
        """
        return cls(
            r_x0=bbox.l,
            r_y0=bbox.b,
            r_x2=bbox.r,
            r_y2=bbox.t,
            r_x1=bbox.r,
            r_y1=bbox.b,
            r_x3=bbox.l,
            r_y3=bbox.t,
            coord_origin=bbox.coord_origin,
        )

    def to_polygon(self) -> List[Coord2D]:
        """Convert to a list of the four corners, in order 0, 1, 2, 3."""
        return [
            Coord2D(self.r_x0, self.r_y0),
            Coord2D(self.r_x1, self.r_y1),
            Coord2D(self.r_x2, self.r_y2),
            Coord2D(self.r_x3, self.r_y3),
        ]

    def to_bottom_left_origin(self, page_height: float) -> "BoundingRectangle":
        """Convert coordinates to use bottom-left origin.

        Args:
            page_height: The height of the page

        Returns:
            ``self`` if the origin is already bottom-left; otherwise a new
            BoundingRectangle with every y replaced by ``page_height - y``.
            Any other origin value falls through and yields ``None``.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            return BoundingRectangle(
                r_x0=self.r_x0,
                r_x1=self.r_x1,
                r_x2=self.r_x2,
                r_x3=self.r_x3,
                r_y0=page_height - self.r_y0,
                r_y1=page_height - self.r_y1,
                r_y2=page_height - self.r_y2,
                r_y3=page_height - self.r_y3,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )

    def to_top_left_origin(self, page_height: float) -> "BoundingRectangle":
        """Convert coordinates to use top-left origin.

        Args:
            page_height: The height of the page

        Returns:
            ``self`` if the origin is already top-left; otherwise a new
            BoundingRectangle with every y replaced by ``page_height - y``.
            Any other origin value falls through and yields ``None``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return BoundingRectangle(
                r_x0=self.r_x0,
                r_x1=self.r_x1,
                r_x2=self.r_x2,
                r_x3=self.r_x3,
                r_y0=page_height - self.r_y0,
                r_y1=page_height - self.r_y1,
                r_y2=page_height - self.r_y2,
                r_y3=page_height - self.r_y3,
                coord_origin=CoordOrigin.TOPLEFT,
            )
244
+
245
+
246
class OrderedElement(BaseModel):
    """Base model for page elements that carry an ordering index."""

    # Ordering position of the element; -1 is the default when none is given.
    index: int = -1
250
+
251
+
252
class ColorMixin(BaseModel):
    """Mixin that gives a model an ``rgba`` color attribute."""

    # Defaults to opaque black.
    rgba: ColorRGBA = ColorRGBA(r=0, g=0, b=0, a=255)
256
+
257
+
258
class TextDirection(str, Enum):
    """Reading direction recorded for a text cell."""

    LEFT_TO_RIGHT = "left_to_right"
    RIGHT_TO_LEFT = "right_to_left"
    UNSPECIFIED = "unspecified"
264
+
265
+
266
class TextCell(ColorMixin, OrderedElement):
    """Text cell: a piece of text together with the rectangle it occupies."""

    rect: BoundingRectangle

    text: str
    orig: str

    text_direction: TextDirection = TextDirection.LEFT_TO_RIGHT

    confidence: float = 1.0
    from_ocr: bool

    def to_bounding_box(self) -> BoundingBox:
        """Return the axis-aligned bounding box of the cell's rectangle."""
        enclosing_box = self.rect.to_bounding_box()
        return enclosing_box

    def to_bottom_left_origin(self, page_height: float):
        """Rebind ``rect`` to its bottom-left-origin equivalent.

        Args:
            page_height: The height of the page
        """
        converted = self.rect.to_bottom_left_origin(page_height=page_height)
        self.rect = converted

    def to_top_left_origin(self, page_height: float):
        """Rebind ``rect`` to its top-left-origin equivalent.

        Args:
            page_height: The height of the page
        """
        converted = self.rect.to_top_left_origin(page_height=page_height)
        self.rect = converted
298
+
299
+
300
class PdfCellRenderingMode(int, Enum):
    """Text rendering mode of a PDF text cell, as numbered in PDF32000."""

    FILL_TEXT = 0
    STROKE_TEXT = 1
    FILL_THEN_STROKE = 2
    INVISIBLE = 3
    FILL_AND_CLIPPING = 4
    STROKE_AND_CLIPPING = 5
    FILL_THEN_STROKE_AND_CLIPPING = 6
    ONLY_CLIPPING = 7
    # Sentinel outside the 0-7 range used by the named modes above.
    UNKNOWN = -1
312
+
313
+
314
class PdfTextCell(TextCell):
    """Specialized text cell for PDF documents with font information."""

    rendering_mode: PdfCellRenderingMode  # PDF32000 text rendering mode
    widget: bool  # Determines if this belongs to fillable PDF field.

    font_key: str
    font_name: str

    # Narrowed from ``bool`` in TextCell: a PdfTextCell only accepts False.
    from_ocr: Literal[False] = False

    @model_validator(mode="before")
    @classmethod
    def update_ltr_property(cls, data: dict) -> dict:
        """Update text direction property from left_to_right flag.

        When the raw input is a dict containing a ``left_to_right`` key, the
        ``text_direction`` entry is set from it (a truthy flag maps to
        ``"left_to_right"``, a falsy one to ``"right_to_left"``), overriding
        any ``text_direction`` already present.

        Args:
            data: Raw input handed to the validator before field validation.

        Returns:
            The input unchanged, or a shallow copy with ``text_direction``
            set when the ``left_to_right`` flag is present.
        """
        # A "before" validator receives the raw input, which need not be a
        # dict; leave anything else untouched for regular validation.
        if isinstance(data, dict) and "left_to_right" in data:
            direction = "left_to_right" if data["left_to_right"] else "right_to_left"
            # Build a new dict so the caller's input is not mutated.
            data = {**data, "text_direction": direction}
        return data
338
+
339
+
340
class BitmapResource(OrderedElement):
    """Bitmap resource: the rectangle it occupies plus an optional URI."""

    rect: BoundingRectangle
    uri: Optional[AnyUrl] = None

    def to_bottom_left_origin(self, page_height: float):
        """Rebind ``rect`` to its bottom-left-origin equivalent.

        Args:
            page_height: The height of the page
        """
        converted = self.rect.to_bottom_left_origin(page_height=page_height)
        self.rect = converted

    def to_top_left_origin(self, page_height: float):
        """Rebind ``rect`` to its top-left-origin equivalent.

        Args:
            page_height: The height of the page
        """
        converted = self.rect.to_top_left_origin(page_height=page_height)
        self.rect = converted
361
+
362
+
363
class PdfLine(ColorMixin, OrderedElement):
    """Polyline in a PDF document, given as an ordered list of points."""

    parent_id: int
    points: List[Coord2D]
    width: float = 1.0

    coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT

    def __len__(self) -> int:
        """Return how many points the line has."""
        point_count = len(self.points)
        return point_count

    def iterate_segments(
        self,
    ) -> Iterator[Tuple[Coord2D, Coord2D]]:
        """Yield each pair of consecutive points as a ``(start, end)`` segment."""
        for start, end in zip(self.points, self.points[1:]):
            yield (start, end)

    def to_bottom_left_origin(self, page_height: float):
        """Flip the points in place so the line uses a bottom-left origin.

        Returns ``self`` when the line already uses that origin; after an
        actual conversion nothing is returned.

        Args:
            page_height: The height of the page
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        if self.coord_origin == CoordOrigin.TOPLEFT:
            # Slice assignment keeps the same list object, as the original
            # per-index update did.
            self.points[:] = [
                Coord2D(pt[0], page_height - pt[1]) for pt in self.points
            ]
            self.coord_origin = CoordOrigin.BOTTOMLEFT

    def to_top_left_origin(self, page_height: float):
        """Flip the points in place so the line uses a top-left origin.

        Returns ``self`` when the line already uses that origin; after an
        actual conversion nothing is returned.

        Args:
            page_height: The height of the page
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            # Slice assignment keeps the same list object, as the original
            # per-index update did.
            self.points[:] = [
                Coord2D(pt[0], page_height - pt[1]) for pt in self.points
            ]
            self.coord_origin = CoordOrigin.TOPLEFT
410
+
411
+
412
class PageGeometry(BaseModel):
    """Page dimensions, described by a rotation angle and a rectangle."""

    angle: float
    rect: BoundingRectangle

    @property
    def width(self):
        """Width of the page, taken directly from ``rect``."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        page_rect = self.rect
        return page_rect.width

    @property
    def height(self):
        """Height of the page, taken directly from ``rect``."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        page_rect = self.rect
        return page_rect.height

    @property
    def origin(self):
        """Origin of the page: ``(l, b)`` of the bounding box of ``rect``."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        left = self.rect.to_bounding_box().l
        bottom = self.rect.to_bounding_box().b
        return (left, bottom)
435
+
436
+
437
class PdfPageGeometry(PageGeometry):
    """Page geometry for PDF pages, carrying all five PDF boundary boxes.

    The ``width``, ``height`` and ``origin`` properties below always read
    ``crop_bbox``; they do not consult ``boundary_type``.
    """

    boundary_type: PdfPageBoundaryType

    art_bbox: BoundingBox
    bleed_bbox: BoundingBox
    crop_bbox: BoundingBox
    media_bbox: BoundingBox
    trim_bbox: BoundingBox

    @property
    def width(self):
        """Width of the PDF page, read from the crop box."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        crop = self.crop_bbox
        return crop.width

    @property
    def height(self):
        """Height of the PDF page, read from the crop box."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        crop = self.crop_bbox
        return crop.height

    @property
    def origin(self):
        """Origin of the PDF page: ``(l, b)`` of the crop box."""
        # FIXME: think about angle, boundary_type and coord_origin ...
        left, bottom = self.crop_bbox.l, self.crop_bbox.b
        return (left, bottom)
465
+
466
+
467
class SegmentedPage(BaseModel):
    """Segmented page: geometry, bitmap resources and text cells per unit."""

    dimension: PageGeometry

    bitmap_resources: List[BitmapResource] = []

    char_cells: List[TextCell] = []
    word_cells: List[TextCell] = []
    textline_cells: List[TextCell] = []

    image: Optional[ImageRef] = None

    def iterate_cells(self, unit_type: TextCellUnit) -> Iterator[TextCell]:
        """Yield the text cells stored for the requested unit type.

        This is a generator, so an unsupported unit type is only reported
        once iteration starts.

        Args:
            unit_type: Type of text unit to iterate through

        Returns:
            Iterator of text cells

        Raises:
            ValueError: If an incompatible unit type is provided
        """
        if unit_type == TextCellUnit.CHAR:
            selected = self.char_cells
        elif unit_type == TextCellUnit.WORD:
            selected = self.word_cells
        elif unit_type == TextCellUnit.LINE:
            selected = self.textline_cells
        else:
            raise ValueError(f"incompatible {unit_type}")

        yield from selected
503
+
504
+
505
+ class SegmentedPdfPage(SegmentedPage):
506
+ """Extended segmented page model specific to PDF documents."""
507
+
508
+ # Redefine typing to use PdfPageGeometry
509
+ dimension: PdfPageGeometry
510
+
511
+ lines: List[PdfLine] = []
512
+
513
+ # Redefine typing of elements to include PdfTextCell
514
+ char_cells: List[Union[PdfTextCell, TextCell]]
515
+ word_cells: List[Union[PdfTextCell, TextCell]]
516
+ textline_cells: List[Union[PdfTextCell, TextCell]]
517
+
518
def get_cells_in_bbox(
    self, cell_unit: TextCellUnit, bbox: BoundingBox, ios: float = 0.8
) -> List[Union[PdfTextCell, TextCell]]:
    """Collect the text cells that lie sufficiently inside a bounding box.

    A cell is kept when the ``intersection_over_self`` of its bounding box
    with ``bbox`` is strictly greater than ``ios``.

    Args:
        cell_unit: Type of text unit to check
        bbox: Bounding box to check against
        ios: Minimum intersection over self ratio

    Returns:
        List of text cells within the bounding box
    """
    return [
        candidate
        for candidate in self.iterate_cells(cell_unit)
        if candidate.to_bounding_box().intersection_over_self(bbox) > ios
    ]
538
+
539
def export_to_dict(self) -> Dict:
    """Dump the page into a JSON-compatible dictionary.

    The dump uses JSON mode, field aliases, and omits ``None`` values.

    Returns:
        Dictionary representation of the page
    """
    dump_options = {"mode": "json", "by_alias": True, "exclude_none": True}
    return self.model_dump(**dump_options)
546
+
547
def save_as_json(
    self,
    filename: Path,
    indent: int = 2,
):
    """Write the exported page dictionary to a UTF-8 JSON file.

    Args:
        filename: Path to save the JSON file
        indent: Indentation level for JSON formatting
    """
    # Export first so a failing export never leaves a truncated file behind.
    payload = self.export_to_dict()
    with open(filename, mode="w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=indent)
561
+
562
@classmethod
def load_from_json(cls, filename: Path) -> "SegmentedPdfPage":
    """Build a page by validating the JSON text stored in a file.

    Args:
        filename: Path to the JSON file

    Returns:
        Instantiated SegmentedPdfPage object
    """
    with open(filename, mode="r", encoding="utf-8") as handle:
        raw_json = handle.read()
        return cls.model_validate_json(raw_json)
574
+
575
def crop_text(
    self, cell_unit: TextCellUnit, bbox: BoundingBox, eps: float = 1.0
) -> str:
    """Extract text from cells within the specified bounding box.

    Each cell's rectangle is converted to bottom-left origin and kept only
    if its bounding box lies entirely inside ``bbox``. The kept cells are
    ordered by ``index`` and their texts concatenated. A single space is
    inserted between two consecutive cells unless the start corner of the
    later cell is within ``eps`` (in both x and y) of corner 1 of the
    earlier cell.

    NOTE(review): ``bbox`` is compared against bottom-left-origin
    coordinates, so it is presumably expected in that origin as well;
    confirm against callers.

    Args:
        cell_unit: Type of text unit to extract
        bbox: Bounding box to extract from
        eps: Epsilon value for position comparison

    Returns:
        The concatenated text, or an empty string when no cell matches.
    """
    page_height = self.dimension.height

    selection = []
    for page_cell in self.iterate_cells(cell_unit):
        cell_bbox = page_cell.rect.to_bottom_left_origin(
            page_height=page_height
        ).to_bounding_box()

        if (
            bbox.l <= cell_bbox.l
            and cell_bbox.r <= bbox.r
            and bbox.b <= cell_bbox.b
            and cell_bbox.t <= bbox.t
        ):
            # Cells are only read below, so no copy is needed.
            selection.append(page_cell)

    selection.sort(key=lambda cell: cell.index)

    parts = []
    prev = None
    for cell in selection:
        if prev is not None:
            touches_prev = (
                abs(cell.rect.r_x0 - prev.rect.r_x1) < eps
                and abs(cell.rect.r_y0 - prev.rect.r_y1) < eps
            )
            if not touches_prev:
                parts.append(" ")
        parts.append(cell.text)
        prev = cell

    # The original built the text but never returned it.
    return "".join(parts)
615
+
616
def export_to_textlines(
    self,
    cell_unit: TextCellUnit,
    add_location: bool = True,
    add_fontkey: bool = False,
    add_fontname: bool = True,
) -> List[str]:
    """Format every cell of the given unit as one text line.

    Each line consists of, in order: the four corner coordinates (when
    ``add_location``), the font key and font name (when requested and the
    cell is a ``PdfTextCell``), and finally the cell text.

    Args:
        cell_unit: Type of text unit to export
        add_location: Whether to include position information
        add_fontkey: Whether to include font key information
        add_fontname: Whether to include font name information

    Returns:
        List of formatted text lines
    """
    rendered: List[str] = []
    for cell in self.iterate_cells(cell_unit):
        pieces: List[str] = []

        if add_location:
            rect = cell.rect
            corners = (
                (rect.r_x0, rect.r_y0),
                (rect.r_x1, rect.r_y1),
                (rect.r_x2, rect.r_y2),
                (rect.r_x3, rect.r_y3),
            )
            for corner_x, corner_y in corners:
                pieces.append(f"({corner_x:06.02f}, {corner_y:06.02f}) ")

        if add_fontkey and isinstance(cell, PdfTextCell):
            pieces.append(f"{cell.font_key:>10} ")

        if add_fontname and isinstance(cell, PdfTextCell):
            pieces.append(f"{cell.font_name:>10} ")

        pieces.append(f"{cell.text}")
        rendered.append("".join(pieces))

    return rendered
654
+
655
def render_as_image(
    self,
    cell_unit: TextCellUnit,
    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,  # media_box
    draw_cells_bbox: bool = False,
    draw_cells_text: bool = True,
    draw_cells_bl: bool = False,
    draw_cells_tr: bool = False,
    cell_outline: str = "black",
    cell_color: str = "cyan",
    cell_alpha: float = 1.0,
    cell_bl_color: str = "red",
    cell_bl_outline: str = "red",
    cell_bl_alpha: float = 1.0,
    cell_bl_radius: float = 3.0,
    cell_tr_color: str = "green",
    cell_tr_outline: str = "green",
    cell_tr_alpha: float = 1.0,
    cell_tr_radius: float = 3.0,
    draw_bitmap_resources: bool = True,
    bitmap_resources_outline: str = "black",
    bitmap_resources_fill: str = "yellow",
    bitmap_resources_alpha: float = 1.0,
    draw_lines: bool = True,
    line_color: str = "black",
    line_width: int = 1,
    line_alpha: float = 1.0,
    draw_annotations: bool = True,
    annotations_outline: str = "white",
    annotations_color: str = "green",
    annotations_alpha: float = 0.5,
    draw_crop_box: bool = True,
    cropbox_outline: str = "red",
    cropbox_width: int = 3,
    cropbox_alpha: float = 1.0,
) -> PILImage.Image:
    """Render the page as an image with various visualization options.

    The canvas is a white RGBA image sized from the page's crop box.
    Layers are drawn in this order: bitmap resources, cell text (or, only
    when ``draw_cells_text`` is False, cell bounding boxes), bottom-left
    markers, top-right markers, lines.

    Alpha values outside ``[0, 1]`` are logged as errors and clamped into
    that range before use.

    NOTE(review): ``boundary_type`` and the ``*annotations*`` and
    ``*crop_box*`` / ``cropbox_*`` options are accepted but not used by
    this method.

    Args:
        cell_unit: Type of text unit to render
        boundary_type: Type of page boundary to use
        draw_cells_bbox: Whether to draw bounding boxes for cells
        draw_cells_text: Whether to draw text content of cells
        draw_cells_bl: Whether to draw bottom left points of cells
        draw_cells_tr: Whether to draw top right points of cells
        cell_outline: Color for cell outlines
        cell_color: Fill color for cells
        cell_alpha: Alpha value for cell visualization
        cell_bl_color: Color for bottom left points
        cell_bl_outline: Outline color for bottom left points
        cell_bl_alpha: Alpha value for bottom left points
        cell_bl_radius: Radius for bottom left points
        cell_tr_color: Color for top right points
        cell_tr_outline: Outline color for top right points
        cell_tr_alpha: Alpha value for top right points
        cell_tr_radius: Radius for top right points
        draw_bitmap_resources: Whether to draw bitmap resources
        bitmap_resources_outline: Outline color for bitmap resources
        bitmap_resources_fill: Fill color for bitmap resources
        bitmap_resources_alpha: Alpha value for bitmap resources
        draw_lines: Whether to draw lines
        line_color: Color for lines
        line_width: Width for lines
        line_alpha: Alpha value for lines
        draw_annotations: Whether to draw annotations
        annotations_outline: Outline color for annotations
        annotations_color: Fill color for annotations
        annotations_alpha: Alpha value for annotations
        draw_crop_box: Whether to draw crop box
        cropbox_outline: Color for crop box outline
        cropbox_width: Width for crop box outline
        cropbox_alpha: Alpha value for crop box

    Returns:
        PIL Image of the rendered page
    """

    def _clamp_alpha(value: float) -> float:
        """Log out-of-range alpha values and clamp them into [0, 1]."""
        if value < 0 or 1.0 < value:
            _logger.error(f"alpha value {value} needs to be in [0, 1]")
        return max(0.0, min(1.0, value))

    # The original loop only rebound its loop variable, so nothing was
    # clamped and _get_rgba later failed on out-of-range values.
    cell_alpha = _clamp_alpha(cell_alpha)
    cell_bl_alpha = _clamp_alpha(cell_bl_alpha)
    cell_tr_alpha = _clamp_alpha(cell_tr_alpha)
    bitmap_resources_alpha = _clamp_alpha(bitmap_resources_alpha)
    line_alpha = _clamp_alpha(line_alpha)
    annotations_alpha = _clamp_alpha(annotations_alpha)
    cropbox_alpha = _clamp_alpha(cropbox_alpha)

    page_bbox = self.dimension.crop_bbox

    page_width = page_bbox.width
    page_height = page_bbox.height

    # Create a blank white image with RGBA mode
    result = PILImage.new(
        "RGBA", (round(page_width), round(page_height)), (255, 255, 255, 255)
    )
    draw = ImageDraw.Draw(result)

    if draw_bitmap_resources:
        draw = self._render_bitmap_resources(
            draw=draw,
            page_height=page_height,
            bitmap_resources_fill=bitmap_resources_fill,
            bitmap_resources_outline=bitmap_resources_outline,
            bitmap_resources_alpha=bitmap_resources_alpha,
        )

    # Text takes precedence: bounding boxes are drawn only when the text
    # layer is disabled.
    if draw_cells_text:
        result = self._render_cells_text(
            cell_unit=cell_unit, img=result, page_height=page_height
        )

    elif draw_cells_bbox:
        self._render_cells_bbox(
            cell_unit=cell_unit,
            draw=draw,
            page_height=page_height,
            cell_fill=cell_color,
            cell_outline=cell_outline,
            cell_alpha=cell_alpha,
        )

    if draw_cells_bl:
        self._draw_cells_bl(
            cell_unit=cell_unit,
            draw=draw,
            page_height=page_height,
            cell_bl_color=cell_bl_color,
            cell_bl_outline=cell_bl_outline,
            cell_bl_alpha=cell_bl_alpha,
            cell_bl_radius=cell_bl_radius,
        )

    if draw_cells_tr:
        self._draw_cells_tr(
            cell_unit=cell_unit,
            draw=draw,
            page_height=page_height,
            cell_tr_color=cell_tr_color,
            cell_tr_outline=cell_tr_outline,
            cell_tr_alpha=cell_tr_alpha,
            cell_tr_radius=cell_tr_radius,
        )

    if draw_lines:
        draw = self._render_lines(
            draw=draw,
            page_height=page_height,
            line_color=line_color,
            line_alpha=line_alpha,
            line_width=line_width,
        )

    return result
812
+
813
def _get_rgba(self, name: str, alpha: float):
    """Get RGBA tuple from color name and alpha value.

    Args:
        name: Color name (any string accepted by ``ImageColor.getrgb``)
        alpha: Alpha value between 0 and 1

    Returns:
        RGBA tuple ``(r, g, b, a)`` where ``a`` is ``int(alpha * 255)``

    Raises:
        AssertionError: If alpha is out of range
    """
    # Raised explicitly so the check also runs under ``python -O``, which
    # strips ``assert`` statements.
    if not (0.0 <= alpha and alpha <= 1.0):
        raise AssertionError("0.0 <= alpha and alpha <= 1.0")

    # getrgb may return (r, g, b, a) for colors that carry their own alpha;
    # keep only the RGB part so the result is always a 4-tuple.
    rgb = ImageColor.getrgb(name)[:3]
    return rgb + (int(alpha * 255),)
829
+
830
+ def _render_bitmap_resources(
831
+ self,
832
+ draw: ImageDraw.ImageDraw,
833
+ page_height: float,
834
+ bitmap_resources_fill: str,
835
+ bitmap_resources_outline: str,
836
+ bitmap_resources_alpha: float,
837
+ ) -> ImageDraw.ImageDraw:
838
+ """Render bitmap resources on the page.
839
+
840
+ Args:
841
+ draw: PIL ImageDraw object
842
+ page_height: Height of the page
843
+ bitmap_resources_fill: Fill color for bitmap resources
844
+ bitmap_resources_outline: Outline color for bitmap resources
845
+ bitmap_resources_alpha: Alpha value for bitmap resources
846
+
847
+ Returns:
848
+ Updated ImageDraw object
849
+ """
850
+ for bitmap_resource in self.bitmap_resources:
851
+ poly = bitmap_resource.rect.to_top_left_origin(
852
+ page_height=page_height
853
+ ).to_polygon()
854
+
855
+ fill = self._get_rgba(
856
+ name=bitmap_resources_fill, alpha=bitmap_resources_alpha
857
+ )
858
+ outline = self._get_rgba(
859
+ name=bitmap_resources_outline, alpha=bitmap_resources_alpha
860
+ )
861
+
862
+ draw.polygon(poly, outline=outline, fill=fill)
863
+
864
+ return draw
865
+
866
+ def _render_cells_bbox(
867
+ self,
868
+ cell_unit: TextCellUnit,
869
+ draw: ImageDraw.ImageDraw,
870
+ page_height: float,
871
+ cell_fill: str,
872
+ cell_outline: str,
873
+ cell_alpha: float,
874
+ ) -> ImageDraw.ImageDraw:
875
+ """Render bounding boxes for text cells.
876
+
877
+ Args:
878
+ cell_unit: Type of text unit to render
879
+ draw: PIL ImageDraw object
880
+ page_height: Height of the page
881
+ cell_fill: Fill color for cells
882
+ cell_outline: Outline color for cells
883
+ cell_alpha: Alpha value for cells
884
+
885
+ Returns:
886
+ Updated ImageDraw object
887
+ """
888
+ fill = self._get_rgba(name=cell_fill, alpha=cell_alpha)
889
+ outline = self._get_rgba(name=cell_outline, alpha=cell_alpha)
890
+
891
+ # Draw each rectangle by connecting its four points
892
+ for page_cell in self.iterate_cells(unit_type=cell_unit):
893
+ poly = page_cell.rect.to_top_left_origin(
894
+ page_height=page_height
895
+ ).to_polygon()
896
+ draw.polygon(poly, outline=outline, fill=fill)
897
+
898
+ return draw
899
+
900
def _draw_text_in_rectangle(
    self,
    img: PILImage.Image,
    rect: BoundingRectangle,
    text: str,
    font: Optional[Union[FreeTypeFont, ImageFont.ImageFont]] = None,
    fill: str = "black",
) -> PILImage.Image:
    """Draw text within a rectangular boundary with rotation.

    The text is rendered onto an opaque white tile, stretched to the
    rectangle's width and height, rotated by ``rect.angle_360`` and pasted
    centred on ``rect.centre``. Rectangles whose rounded width or height is
    2 or less are skipped and the image is returned untouched.

    Args:
        img: PIL Image to draw on
        rect: Rectangle defining the text boundary
        text: Text content to draw
        font: Font to use for drawing text; the PIL default font if None
        fill: Text color

    Returns:
        Updated PIL Image
    """
    width = round(rect.width)
    height = round(rect.height)
    rot_angle = rect.angle_360

    centre = rect.centre
    centre_x, centre_y = round(centre[0]), round(centre[1])

    # Too small to hold legible text.
    if width <= 2 or height <= 2:
        return img

    # Use the default font if no font is provided
    if font is None:
        font = ImageFont.load_default()

    # Measure the text on a dummy 1x1 canvas.
    tmp_img = PILImage.new("RGBA", (1, 1), (255, 255, 255, 0))
    tmp_draw = ImageDraw.Draw(tmp_img)
    _, _, text_width, text_height = tmp_draw.textbbox((0, 0), text=text, font=font)

    # Render the text onto a tile of exactly the measured size.
    text_img = PILImage.new(
        "RGBA", (round(text_width), round(text_height)), (255, 255, 255, 255)
    )
    text_draw = ImageDraw.Draw(text_img)
    # Honour the requested color; the original always drew in black and
    # silently ignored ``fill``.
    text_draw.text((0, 0), text, font=font, fill=fill)

    # Stretch the tile to the target rectangle size.
    text_img = text_img.resize((width, height), PILImage.Resampling.LANCZOS)

    # Rotate the tile; expand=True grows the canvas to fit the rotation.
    rotated_img = text_img.rotate(rot_angle, expand=True)

    # Position the rotated tile so that it is centred on the rectangle.
    rotated_w, rotated_h = rotated_img.size
    paste_x = centre_x - rotated_w // 2
    paste_y = centre_y - rotated_h // 2

    # Paste using the tile's own alpha channel as the mask.
    img.paste(rotated_img, (paste_x, paste_y), rotated_img)

    return img
965
+
966
+ def _render_cells_text(
967
+ self, cell_unit: TextCellUnit, img: PILImage.Image, page_height: float
968
+ ) -> PILImage.Image:
969
+ """Render text content of cells on the image.
970
+
971
+ Args:
972
+ cell_unit: Type of text unit to render
973
+ img: PIL Image to draw on
974
+ page_height: Height of the page
975
+
976
+ Returns:
977
+ Updated PIL Image
978
+ """
979
+ # Draw each rectangle by connecting its four points
980
+ for page_cell in self.iterate_cells(unit_type=cell_unit):
981
+ rect = page_cell.rect.to_top_left_origin(page_height=page_height)
982
+ img = self._draw_text_in_rectangle(
983
+ img=img,
984
+ rect=rect,
985
+ text=page_cell.text,
986
+ )
987
+
988
+ return img
989
+
990
def _draw_cells_bl(
    self,
    cell_unit: TextCellUnit,
    draw: ImageDraw.ImageDraw,
    page_height: float,
    cell_bl_color: str,
    cell_bl_outline: str,
    cell_bl_alpha: float,
    cell_bl_radius: float,
) -> ImageDraw.ImageDraw:
    """Mark the bottom-left point of every text cell with a filled dot.

    Args:
        cell_unit: Type of text unit whose cells are marked
        draw: PIL ImageDraw object
        page_height: Height of the page
        cell_bl_color: Fill color for bottom-left points
        cell_bl_outline: Outline color for bottom-left points
        cell_bl_alpha: Alpha value applied to both fill and outline
        cell_bl_radius: Radius for bottom-left points

    Returns:
        Updated ImageDraw object
    """
    dot_fill = self._get_rgba(name=cell_bl_color, alpha=cell_bl_alpha)
    dot_outline = self._get_rgba(name=cell_bl_outline, alpha=cell_bl_alpha)

    for cell in self.iterate_cells(unit_type=cell_unit):
        top_left_rect = cell.rect.to_top_left_origin(page_height=page_height)

        # The dot is centred on the first polygon vertex, which this method
        # treats as the cell's bottom-left corner.
        corner = top_left_rect.to_polygon()[0]
        center_x, center_y = corner[0], corner[1]

        # Square box of side 2 * radius around the corner bounds the dot.
        draw.ellipse(
            [
                (center_x - cell_bl_radius, center_y - cell_bl_radius),
                (center_x + cell_bl_radius, center_y + cell_bl_radius),
            ],
            fill=dot_fill,
            outline=dot_outline,
        )

    return draw
1032
+
1033
def _draw_cells_tr(
    self,
    cell_unit: TextCellUnit,
    draw: ImageDraw.ImageDraw,
    page_height: float,
    cell_tr_color: str,
    cell_tr_outline: str,
    cell_tr_alpha: float,
    cell_tr_radius: float,
) -> ImageDraw.ImageDraw:
    """Draw top-right points of text cells.

    Args:
        cell_unit: Type of text unit to render
        draw: PIL ImageDraw object
        page_height: Height of the page
        cell_tr_color: Fill color for top-right points
        cell_tr_outline: Outline color for top-right points
        cell_tr_alpha: Alpha value applied to both fill and outline
        cell_tr_radius: Radius for top-right points

    Returns:
        Updated ImageDraw object
    """
    fill = self._get_rgba(name=cell_tr_color, alpha=cell_tr_alpha)
    outline = self._get_rgba(name=cell_tr_outline, alpha=cell_tr_alpha)

    # Index of the top-right vertex in the polygon returned by to_polygon().
    # NOTE(review): assumes the vertex order is bottom-left, bottom-right,
    # top-right, top-left -- confirm against the rectangle model. The
    # previous code used index 0, the same vertex as _draw_cells_bl, so both
    # markers were drawn on one corner.
    top_right_index = 2

    for page_cell in self.iterate_cells(unit_type=cell_unit):
        poly = page_cell.rect.to_top_left_origin(
            page_height=page_height
        ).to_polygon()
        corner = poly[top_right_index]

        # Square box of side 2 * radius around the corner bounds the dot.
        dot_bbox = [
            (corner[0] - cell_tr_radius, corner[1] - cell_tr_radius),
            (corner[0] + cell_tr_radius, corner[1] + cell_tr_radius),
        ]

        draw.ellipse(dot_bbox, fill=fill, outline=outline)

    return draw
1075
+
1076
def _render_lines(
    self,
    draw: ImageDraw.ImageDraw,
    page_height: float,
    line_color: str,
    line_alpha: float,
    line_width: float,
) -> ImageDraw.ImageDraw:
    """Stroke every segment of every line stored on the page.

    Args:
        draw: PIL ImageDraw object
        page_height: Height of the page
        line_color: Color for lines
        line_alpha: Alpha value for lines
        line_width: Not read by this method; each line is stroked with its
            own ``width`` attribute instead

    Returns:
        Updated ImageDraw object
    """
    stroke = self._get_rgba(name=line_color, alpha=line_alpha)

    for pdf_line in self.lines:
        # The return value is discarded, so the conversion only takes effect
        # if it happens in place. NOTE(review): confirm that
        # to_top_left_origin mutates the line.
        pdf_line.to_top_left_origin(page_height=page_height)

        for segment in pdf_line.iterate_segments():
            start, end = segment[0], segment[1]
            coords = (start[0], start[1], end[0], end[1])

            # Round the line's own width and clamp it to at least 1.
            draw.line(
                coords,
                fill=stroke,
                width=max(1, round(pdf_line.width)),
            )

    return draw
1110
+
1111
+
1112
class PdfMetaData(BaseModel):
    """Model holding raw PDF metadata XML and the tag/value pairs parsed from it."""

    # Raw metadata XML text.
    xml: str = ""

    # Tag name -> element text, filled in by ``initialise``.
    data: Dict[str, str] = {}

    def initialise(self):
        """Fill ``data`` from the ``<ns:tag>text</ns:tag>`` elements found in ``xml``.

        Only matches whose opening and closing namespace and tag agree are
        stored. Entries are keyed by the tag name alone, so a later element
        with the same tag overwrites an earlier one regardless of namespace.
        """
        # Groups: opening namespace, opening tag, content, closing namespace,
        # closing tag. Opening tags that carry attributes do not match.
        pattern = r"\<([a-zA-Z]+)\:([a-zA-Z]+)\>(.+?)\<\/([a-zA-Z]+)\:([a-zA-Z]+)\>"

        for found in re.finditer(pattern, self.xml):
            (
                namespace_open,
                tag_open,
                content,
                namespace_close,
                tag_close,
            ) = found.groups()

            # Skip matches that end on a different element's closing tag.
            if (namespace_open, tag_open) != (namespace_close, tag_close):
                continue

            _logger.debug(
                f"Namespace: {namespace_open}, Tag: {tag_open}, Content: {content}"
            )
            self.data[tag_open] = content
1135
+
1136
+
1137
class PdfTableOfContents(BaseModel):
    """Model for one PDF table-of-contents entry and its nested child entries."""

    # Required entry text.
    text: str
    orig: str = ""

    marker: str = ""

    # Nested entries of the same type, forming the hierarchy.
    children: List["PdfTableOfContents"] = []

    def export_to_dict(self, mode: str = "json") -> Dict:
        """Dump this entry (and its children) to a plain dictionary.

        Args:
            mode: Serialization mode passed to ``model_dump``

        Returns:
            Dictionary form of the entry, using field aliases and omitting
            fields whose value is ``None``
        """
        as_dict = self.model_dump(mode=mode, by_alias=True, exclude_none=True)
        return as_dict

    def save_as_json(self, filename: Path, indent: int = 2):
        """Write the table of contents to a UTF-8 encoded JSON file.

        Args:
            filename: Path of the JSON file to write
            indent: Indentation level for JSON formatting
        """
        serialised = self.export_to_dict()
        with open(filename, "w", encoding="utf-8") as handle:
            handle.write(json.dumps(serialised, indent=indent))

    @classmethod
    def load_from_json(cls, filename: Path) -> "PdfTableOfContents":
        """Build a table of contents from a UTF-8 encoded JSON file.

        Args:
            filename: Path of the JSON file to read

        Returns:
            Instantiated PdfTableOfContents object
        """
        with open(filename, "r", encoding="utf-8") as handle:
            raw_json = handle.read()
        return cls.model_validate_json(raw_json)
1181
+
1182
+
1183
class ParsedPdfDocument(BaseModel):
    """Model for a fully parsed PDF: its pages plus optional metadata and outline."""

    # Segmented pages keyed by page number (PageNumber is constrained to >= 1).
    pages: Dict[PageNumber, SegmentedPdfPage] = {}

    # Both are optional and default to None.
    meta_data: Optional[PdfMetaData] = None
    table_of_contents: Optional[PdfTableOfContents] = None

    def iterate_pages(
        self,
    ) -> Iterator[Tuple[int, SegmentedPdfPage]]:
        """Yield every page together with its page number.

        Returns:
            Iterator of (page number, page) tuples, in the order the pages
            are stored in ``pages``
        """
        yield from self.pages.items()

    def export_to_dict(
        self,
        mode: str = "json",
    ) -> Dict:
        """Dump the whole document to a plain dictionary.

        Args:
            mode: Serialization mode passed to ``model_dump``

        Returns:
            Dictionary form of the document, using field aliases and
            omitting fields whose value is ``None``
        """
        as_dict = self.model_dump(mode=mode, by_alias=True, exclude_none=True)
        return as_dict

    def save_as_json(self, filename: Path, indent: int = 2):
        """Write the document to a UTF-8 encoded JSON file.

        Args:
            filename: Path of the JSON file to write
            indent: Indentation level for JSON formatting
        """
        serialised = self.export_to_dict()
        with open(filename, "w", encoding="utf-8") as handle:
            handle.write(json.dumps(serialised, indent=indent))

    @classmethod
    def load_from_json(cls, filename: Path) -> "ParsedPdfDocument":
        """Build a document from a UTF-8 encoded JSON file.

        Args:
            filename: Path of the JSON file to read

        Returns:
            Instantiated ParsedPdfDocument object
        """
        with open(filename, "r", encoding="utf-8") as handle:
            raw_json = handle.read()
        return cls.model_validate_json(raw_json)