docling 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/__init__.py ADDED
File without changes
File without changes
@@ -0,0 +1,55 @@
1
+ from abc import ABC, abstractmethod
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Any, Iterable, Optional, Union
5
+
6
+ from PIL import Image
7
+
8
+
9
class PdfPageBackend(ABC):
    """Abstract interface for accessing a single PDF page.

    Concrete backends wrap a library-specific page object and must implement
    every abstract method below.
    """

    def __init__(self, page_obj: Any) -> None:
        """Accept the backend-specific page object; the base class stores nothing."""

    @abstractmethod
    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
        """Return the text located inside ``bbox``."""

    @abstractmethod
    def get_text_cells(self) -> Iterable["Cell"]:
        """Return the text cells of the page."""

    @abstractmethod
    def get_page_image(
        self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
    ) -> "Image.Image":
        """Return an image of the page, optionally restricted to ``cropbox``."""

    @abstractmethod
    def get_size(self) -> "PageSize":
        """Return the page size."""

    @abstractmethod
    def unload(self):
        """Release whatever the backend holds for this page."""
34
+
35
+
36
class PdfDocumentBackend(ABC):
    """Abstract interface for opening a PDF document and loading its pages."""

    @abstractmethod
    def __init__(self, path_or_stream: Union[BytesIO, Path]):
        """Open the document.

        Args:
            path_or_stream: A single filesystem path or in-memory stream (not
                an iterable of them).
        """

    @abstractmethod
    def load_page(self, page_no: int) -> "PdfPageBackend":
        """Return a page backend for page ``page_no``."""

    @abstractmethod
    def page_count(self) -> int:
        """Return the number of pages in the document."""

    @abstractmethod
    def is_valid(self) -> bool:
        """Return whether the document is usable."""

    @abstractmethod
    def unload(self):
        """Release whatever the backend holds for this document."""
@@ -0,0 +1,223 @@
1
+ import random
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Optional, Union
5
+
6
+ import pypdfium2 as pdfium
7
+ from PIL import Image, ImageDraw
8
+ from pypdfium2 import PdfPage
9
+
10
+ from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
11
+ from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
12
+
13
+
14
class PyPdfiumPageBackend(PdfPageBackend):
    """Page backend built on a ``pypdfium2`` ``PdfPage``.

    The pdfium text page is created lazily on the first text query and kept
    on ``self.text_page`` until :meth:`unload` drops it.
    """

    def __init__(self, page_obj: PdfPage):
        super().__init__(page_obj)
        self._ppage = page_obj
        self.text_page = None

    def _ensure_text_page(self):
        """Return the cached text page, creating it on first use."""
        if self.text_page is None:
            self.text_page = self._ppage.get_textpage()
        return self.text_page

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text pdfium reports inside ``bbox``.

        ``bbox`` is converted to bottom-left origin first when it is not
        already expressed that way.
        """
        text_page = self._ensure_text_page()

        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
            bbox = bbox.to_bottom_left_origin(self.get_size().height)

        return text_page.get_text_bounded(*bbox.as_tuple())

    def get_text_cells(self) -> Iterable[Cell]:
        """Return the page's text cells in top-left coordinates.

        One cell is built per pdfium text rectangle, then horizontally
        adjacent cells on the same row are merged and renumbered from 1.
        """
        text_page = self._ensure_text_page()
        page_height = self.get_size().height

        cells = []
        for index in range(text_page.count_rects()):
            rect = text_page.get_rect(index)
            x0, y0, x1, y1 = rect
            cells.append(
                Cell(
                    id=index,
                    text=text_page.get_text_bounded(*rect),
                    bbox=BoundingBox(
                        l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
                    ).to_top_left_origin(page_height),
                )
            )

        # PyPdfium2 produces very fragmented cells, with sub-word level
        # boundaries, in many PDFs. The merge step cleans this up.
        # For visual debugging call self._debug_draw_cells(cells) before
        # and/or after the merge.
        return self._merge_horizontal_cells(cells)

    @classmethod
    def _merge_horizontal_cells(
        cls,
        cells: List[Cell],
        horizontal_threshold_factor: float = 1.0,
        vertical_threshold_factor: float = 0.5,
    ) -> List[Cell]:
        """Group cells into rows, merge close neighbours, renumber from 1."""
        if not cells:
            return []

        merged_cells = [
            cell
            for row in cls._group_rows(cells, vertical_threshold_factor)
            for cell in cls._merge_row(row, horizontal_threshold_factor)
        ]

        for new_id, cell in enumerate(merged_cells, 1):
            cell.id = new_id

        return merged_cells

    @staticmethod
    def _group_rows(
        cells: List[Cell], vertical_threshold_factor: float
    ) -> List[List[Cell]]:
        """Split ``cells`` (non-empty, in input order) into consecutive rows.

        A cell joins the current row when both its top and its bottom lie
        within ``row_height * vertical_threshold_factor`` of the row's.
        """
        first = cells[0]
        rows: List[List[Cell]] = []
        current_row = [first]
        row_top, row_bottom = first.bbox.t, first.bbox.b
        row_height = first.bbox.height

        for cell in cells[1:]:
            tolerance = row_height * vertical_threshold_factor
            same_row = (
                abs(cell.bbox.t - row_top) <= tolerance
                and abs(cell.bbox.b - row_bottom) <= tolerance
            )
            if same_row:
                current_row.append(cell)
                row_top = min(row_top, cell.bbox.t)
                row_bottom = max(row_bottom, cell.bbox.b)
                row_height = row_bottom - row_top
            else:
                rows.append(current_row)
                current_row = [cell]
                row_top, row_bottom = cell.bbox.t, cell.bbox.b
                row_height = cell.bbox.height

        rows.append(current_row)
        return rows

    @classmethod
    def _merge_row(
        cls, row: List[Cell], horizontal_threshold_factor: float
    ) -> List[Cell]:
        """Merge runs of cells whose horizontal gap is small enough.

        The gap between a cell and its predecessor is compared against the
        average of their heights times ``horizontal_threshold_factor``.
        """
        merged = []
        current_group = [row[0]]

        for cell in row[1:]:
            prev_cell = current_group[-1]
            avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
            gap = cell.bbox.l - prev_cell.bbox.r
            if gap <= avg_height * horizontal_threshold_factor:
                current_group.append(cell)
            else:
                merged.append(cls._merge_group(current_group))
                current_group = [cell]

        merged.append(cls._merge_group(current_group))
        return merged

    @staticmethod
    def _merge_group(group: List[Cell]) -> Cell:
        """Collapse ``group`` into one cell: joined text, enclosing bbox."""
        if len(group) == 1:
            return group[0]

        return Cell(
            id=group[0].id,
            text="".join(cell.text for cell in group),
            bbox=BoundingBox(
                l=min(cell.bbox.l for cell in group),
                t=min(cell.bbox.t for cell in group),
                r=max(cell.bbox.r for cell in group),
                b=max(cell.bbox.b for cell in group),
            ),
        )

    def _debug_draw_cells(self, cells: Iterable[Cell]) -> None:
        """Debug aid: show the page image with a random-coloured box per cell."""
        image = self.get_page_image()
        draw = ImageDraw.Draw(image)
        for c in cells:
            x0, y0, x1, y1 = c.bbox.as_tuple()
            cell_color = (
                random.randint(30, 140),
                random.randint(30, 140),
                random.randint(30, 140),
            )
            draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
        image.show()

    def get_page_image(
        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or the ``cropbox`` region of it, as a PIL image.

        The page is rendered at 1.5x ``scale`` and resized down to
        ``cropbox`` size times ``scale`` to make the result sharper.
        The caller's ``cropbox`` is never modified.
        """
        page_size = self.get_size()

        if cropbox is None:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            if padbox is cropbox:
                # to_bottom_left_origin() returns the very same object when
                # the box is already bottom-left; copy it so the in-place
                # edits below neither corrupt the caller's box nor the
                # cropbox.width/height used for the resize.
                padbox = cropbox.model_copy()
            # Turn right/top coordinates into distances from the page edges.
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        target_size = (round(cropbox.width * scale), round(cropbox.height * scale))
        rendered = self._ppage.render(
            scale=scale * 1.5,
            rotation=0,  # no additional rotation
            crop=padbox.as_tuple(),
        )
        return rendered.to_pil().resize(size=target_size)

    def get_size(self) -> PageSize:
        """Return the page width and height reported by pdfium."""
        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to the page and its text page."""
        self._ppage = None
        self.text_page = None
199
+
200
+
201
class PyPdfiumDocumentBackend(PdfDocumentBackend):
    """Document backend that opens a PDF with ``pypdfium2``."""

    def __init__(self, path_or_stream: Union[BytesIO, Path, str]):
        """Open the document.

        Args:
            path_or_stream: A path (``Path`` or ``str``) or a ``BytesIO``
                stream holding the PDF.

        Raises:
            TypeError: If ``path_or_stream`` is of any other type. Previously
                this left the backend without a document and every later
                call failed with ``AttributeError``.
        """
        super().__init__(path_or_stream)

        if not isinstance(path_or_stream, (Path, str, BytesIO)):
            raise TypeError(
                "path_or_stream must be a Path, str or BytesIO, "
                f"got {type(path_or_stream).__name__}"
            )
        # NOTE(review): the original code carried a "TODO Fix me, won't accept
        # bytes" remark on the BytesIO branch; confirm stream input works with
        # the pinned pypdfium2 version.
        self._pdoc = pdfium.PdfDocument(path_or_stream)

    def page_count(self) -> int:
        """Return the number of pages in the open document."""
        return len(self._pdoc)

    def load_page(self, page_no: int) -> PyPdfiumPageBackend:
        """Return a page backend wrapping page ``page_no`` of the document."""
        return PyPdfiumPageBackend(self._pdoc[page_no])

    def is_valid(self) -> bool:
        """Return True when the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Close the document; calling it again is a no-op."""
        if self._pdoc is not None:
            self._pdoc.close()
            self._pdoc = None
File without changes
@@ -0,0 +1,247 @@
1
+ from enum import Enum, auto
2
+ from io import BytesIO
3
+ from typing import Any, Dict, List, Optional, Tuple, Union
4
+
5
+ from PIL.Image import Image
6
+ from pydantic import BaseModel, ConfigDict, model_validator
7
+
8
+ from docling.backend.abstract_backend import PdfPageBackend
9
+
10
+
11
class ConversionStatus(str, Enum):
    """String-mixin enumeration of conversion states.

    Member values are generated by ``auto()``.
    """

    PENDING = auto()
    STARTED = auto()
    FAILURE = auto()
    SUCCESS = auto()
    SUCCESS_WITH_ERRORS = auto()
17
+
18
+
19
class DocInputType(str, Enum):
    """String-mixin enumeration with the members ``PATH`` and ``STREAM``.

    Member values are generated by ``auto()``.
    """

    PATH = auto()
    STREAM = auto()
22
+
23
+
24
class CoordOrigin(str, Enum):
    """Origin of the coordinate system a bounding box is expressed in.

    Member values are generated by ``auto()``.
    """

    TOPLEFT = auto()
    BOTTOMLEFT = auto()
27
+
28
+
29
class PageSize(BaseModel):
    """Width and height of a page; both default to 0.0."""

    width: float = 0.0
    height: float = 0.0
32
+
33
+
34
class BoundingBox(BaseModel):
    """Axis-aligned rectangle tagged with the origin of its coordinates.

    With ``CoordOrigin.TOPLEFT`` the y axis grows downwards (``t <= b`` for a
    regular box); with ``CoordOrigin.BOTTOMLEFT`` it grows upwards
    (``b <= t``).
    """

    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self):
        """Horizontal extent, ``r - l``."""
        return self.r - self.l

    @property
    def height(self):
        """Vertical extent, independent of the coordinate origin."""
        return abs(self.t - self.b)

    def as_tuple(self):
        """Return the coordinates as a 4-tuple.

        ``(l, t, r, b)`` for top-left origin, ``(l, b, r, t)`` for
        bottom-left origin.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.l, self.b, self.r, self.t)

    @classmethod
    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
        """Build a box from a 4-tuple laid out as :meth:`as_tuple` returns it."""
        if origin == CoordOrigin.TOPLEFT:
            return cls(
                l=coord[0], t=coord[1], r=coord[2], b=coord[3], coord_origin=origin
            )
        elif origin == CoordOrigin.BOTTOMLEFT:
            return cls(
                l=coord[0], b=coord[1], r=coord[2], t=coord[3], coord_origin=origin
            )

    def area(self) -> float:
        """Return the area of the box.

        Uses :attr:`height` so that bottom-left boxes (where ``b < t``) no
        longer yield a negative area.
        """
        return self.width * self.height

    def intersection_area_with(self, other: "BoundingBox") -> float:
        """Return the overlap area with ``other``, or 0.0 if they are disjoint.

        Both boxes are assumed to share the same ``coord_origin``; boxes in
        different coordinate systems cannot be compared without the page
        height. The vertical overlap is computed from each box's lower and
        upper y values, so it works for top-left and bottom-left boxes alike.
        """
        overlap_width = min(self.r, other.r) - max(self.l, other.l)

        y_low = max(min(self.t, self.b), min(other.t, other.b))
        y_high = min(max(self.t, self.b), max(other.t, other.b))
        overlap_height = y_high - y_low

        # Non-overlapping boxes produce a non-positive width or height.
        if overlap_width <= 0 or overlap_height <= 0:
            return 0.0

        return overlap_width * overlap_height

    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
        """Return this box in bottom-left coordinates.

        Returns ``self`` (not a copy) when the box is already bottom-left.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )

    def to_top_left_origin(self, page_height) -> "BoundingBox":
        """Return this box in top-left coordinates.

        Returns ``self`` (not a copy) when the box is already top-left.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.TOPLEFT,
            )
110
+
111
+
112
class Cell(BaseModel):
    """A piece of text together with its id and bounding box."""

    id: int
    text: str
    bbox: BoundingBox
116
+
117
+
118
class OcrCell(Cell):
    """A ``Cell`` that additionally carries a confidence value."""

    confidence: float
120
+
121
+
122
class Cluster(BaseModel):
    """A labelled bounding box with a confidence and the cells assigned to it."""

    id: int
    label: str
    bbox: BoundingBox
    confidence: float = 1.0
    cells: List[Cell] = []
128
+
129
+
130
class BasePageElement(BaseModel):
    """Fields shared by all page elements: label, id, page number, cluster, text."""

    label: str
    id: int
    page_no: int
    cluster: Cluster
    text: Optional[str] = None
136
+
137
+
138
class LayoutPrediction(BaseModel):
    """Container for a list of clusters; empty by default."""

    clusters: List[Cluster] = []
140
+
141
+
142
class TableCell(BaseModel):
    """One cell of a table: bounding box, spans, row/column offsets and text."""

    bbox: BoundingBox
    row_span: int
    col_span: int
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """Derive ``text`` from token fields of a dict-shaped input.

        The text is taken from ``data["bbox"]["token"]`` when ``bbox`` is a
        dict with a non-empty token; otherwise from the ``"token"`` entries of
        ``data["text_cell_bboxes"]`` joined by single spaces. When neither
        source yields text, an explicitly supplied ``"text"`` is kept, so
        ``model_dump()`` output and keyword construction with a
        ``BoundingBox`` instance validate without losing text.

        Non-dict input is returned untouched. The caller's dict is never
        modified; a shallow copy is edited instead.
        """
        if not isinstance(data, dict):
            return data

        data = dict(data)
        text_cells = data.pop("text_cell_bboxes", None)

        bbox = data.get("bbox")
        # Only dict-shaped boxes can carry a token; a BoundingBox instance or
        # a missing bbox simply contributes no text.
        text = bbox.get("token", "") if isinstance(bbox, dict) else ""
        if not text and text_cells:
            text = " ".join(el["token"] for el in text_cells)
        text = text.strip()

        if text or "text" not in data:
            data["text"] = text

        return data
170
+
171
+
172
class TableElement(BasePageElement):
    """Page element for a table: OTSL sequence, row/column counts and cells."""

    otsl_seq: List[str]
    num_rows: int = 0
    num_cols: int = 0
    table_cells: List[TableCell]
177
+
178
+
179
class TableStructurePrediction(BaseModel):
    """Mapping from an integer key to a ``TableElement``; empty by default."""

    table_map: Dict[int, TableElement] = {}
181
+
182
+
183
class TextElement(BasePageElement):
    """Page element for text; adds no fields to ``BasePageElement``."""
185
+
186
+
187
class FigureData(BaseModel):
    """Placeholder model for figure data; declares no fields."""
189
+
190
+
191
class FigureElement(BasePageElement):
    """Page element for a figure; every additional field is optional."""

    data: Optional[FigureData] = None
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None
196
+
197
+
198
class FigureClassificationPrediction(BaseModel):
    """A figure count plus a mapping from an integer key to a ``FigureElement``."""

    figure_count: int = 0
    figure_map: Dict[int, FigureElement] = {}
201
+
202
+
203
class EquationPrediction(BaseModel):
    """An equation count plus a mapping from an integer key to a ``TextElement``."""

    equation_count: int = 0
    equation_map: Dict[int, TextElement] = {}
206
+
207
+
208
class PagePredictions(BaseModel):
    """Predictions attached to a page; each one is ``None`` until set.

    The fields are annotated ``Optional`` to match their ``None`` defaults,
    so passing ``None`` explicitly also validates.
    """

    layout: Optional[LayoutPrediction] = None
    tablestructure: Optional[TableStructurePrediction] = None
    figures_classification: Optional[FigureClassificationPrediction] = None
    equations_prediction: Optional[EquationPrediction] = None
213
+
214
+
215
# Any of the concrete page element models.
PageElement = Union[TextElement, TableElement, FigureElement]
216
+
217
+
218
class AssembledUnit(BaseModel):
    """Three lists of page elements: ``elements``, ``body`` and ``headers``."""

    elements: List[PageElement]
    body: List[PageElement]
    headers: List[PageElement]
222
+
223
+
224
class Page(BaseModel):
    """Per-page data: number, hash, size, image, cells, predictions, assembly.

    Only ``page_no`` is required. Fields that default to ``None`` are
    annotated ``Optional`` so that an explicit ``None`` validates as well.
    """

    # Needed because ``image`` (a PIL image) is not a pydantic-native type.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
    page_hash: Optional[str] = None
    size: Optional[PageSize] = None
    image: Optional[Image] = None
    cells: Optional[List[Cell]] = None
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

    _backend: Optional[PdfPageBackend] = None  # Internal PDF backend
236
+
237
+
238
class DocumentStream(BaseModel):
    """A filename paired with an in-memory ``BytesIO`` stream."""

    # Needed because ``BytesIO`` is not a pydantic-native type.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    filename: str
    stream: BytesIO
243
+
244
+
245
class PipelineOptions(BaseModel):
    """Boolean pipeline switches.

    ``do_table_structure`` defaults to True and ``do_ocr`` to False.
    """

    do_table_structure: bool = True
    do_ocr: bool = False