docling-1.19.1-py3-none-any.whl → docling-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38):
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +240 -0
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +364 -318
  12. docling/datamodel/pipeline_options.py +13 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +10 -5
  17. docling/models/ds_glm_model.py +209 -20
  18. docling/models/easyocr_model.py +4 -1
  19. docling/models/layout_model.py +73 -61
  20. docling/models/page_assemble_model.py +21 -5
  21. docling/models/page_preprocessing_model.py +57 -0
  22. docling/models/table_structure_model.py +34 -32
  23. docling/models/tesseract_ocr_cli_model.py +8 -5
  24. docling/models/tesseract_ocr_model.py +8 -5
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.0.0.dist-info/METADATA +149 -0
  31. docling-2.0.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.19.1.dist-info/METADATA +0 -380
  35. docling-1.19.1.dist-info/RECORD +0 -34
  36. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
  37. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
  38. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
@@ -1,68 +1,63 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
4
+ from typing import TYPE_CHECKING, Set, Union
5
5
 
6
- from PIL import Image
6
+ from docling_core.types.doc import DoclingDocument
7
7
 
8
8
  if TYPE_CHECKING:
9
- from docling.datamodel.base_models import BoundingBox, Cell, PageSize
9
+ from docling.datamodel.base_models import InputFormat
10
+ from docling.datamodel.document import InputDocument
10
11
 
11
12
 
12
- class PdfPageBackend(ABC):
13
-
13
+ class AbstractDocumentBackend(ABC):
14
14
  @abstractmethod
15
- def get_text_in_rect(self, bbox: "BoundingBox") -> str:
16
- pass
15
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
16
+ self.path_or_stream = path_or_stream
17
+ self.document_hash = in_doc.document_hash
18
+ self.input_format = in_doc.format
17
19
 
18
20
  @abstractmethod
19
- def get_text_cells(self) -> Iterable["Cell"]:
21
+ def is_valid(self) -> bool:
20
22
  pass
21
23
 
24
+ @classmethod
22
25
  @abstractmethod
23
- def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
26
+ def supports_pagination(cls) -> bool:
24
27
  pass
25
28
 
26
29
  @abstractmethod
27
- def get_page_image(
28
- self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
29
- ) -> Image.Image:
30
- pass
30
+ def unload(self):
31
+ if isinstance(self.path_or_stream, BytesIO):
32
+ self.path_or_stream.close()
31
33
 
32
- @abstractmethod
33
- def get_size(self) -> "PageSize":
34
- pass
34
+ self.path_or_stream = None
35
35
 
36
+ @classmethod
36
37
  @abstractmethod
37
- def is_valid(self) -> bool:
38
+ def supported_formats(cls) -> Set["InputFormat"]:
38
39
  pass
39
40
 
40
- @abstractmethod
41
- def unload(self):
42
- pass
43
41
 
42
+ class PaginatedDocumentBackend(AbstractDocumentBackend):
43
+ """DeclarativeDocumentBackend.
44
44
 
45
- class PdfDocumentBackend(ABC):
46
- @abstractmethod
47
- def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
48
- self.path_or_stream = path_or_stream
49
- self.document_hash = document_hash
50
-
51
- @abstractmethod
52
- def load_page(self, page_no: int) -> PdfPageBackend:
53
- pass
45
+ A declarative document backend is a backend that can transform to DoclingDocument
46
+ straight without a recognition pipeline.
47
+ """
54
48
 
55
49
  @abstractmethod
56
50
  def page_count(self) -> int:
57
51
  pass
58
52
 
59
- @abstractmethod
60
- def is_valid(self) -> bool:
61
- pass
62
53
 
63
- @abstractmethod
64
- def unload(self):
65
- if isinstance(self.path_or_stream, BytesIO):
66
- self.path_or_stream.close()
54
+ class DeclarativeDocumentBackend(AbstractDocumentBackend):
55
+ """DeclarativeDocumentBackend.
67
56
 
68
- self.path_or_stream = None
57
+ A declarative document backend is a backend that can transform to DoclingDocument
58
+ straight without a recognition pipeline.
59
+ """
60
+
61
+ @abstractmethod
62
+ def convert(self) -> DoclingDocument:
63
+ pass
@@ -5,12 +5,14 @@ from pathlib import Path
5
5
  from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin, Size
8
9
  from docling_parse.docling_parse import pdf_parser
9
10
  from PIL import Image, ImageDraw
10
11
  from pypdfium2 import PdfPage
11
12
 
12
- from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
13
- from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
+ from docling.datamodel.base_models import Cell
15
+ from docling.datamodel.document import InputDocument
14
16
 
15
17
  _log = logging.getLogger(__name__)
16
18
 
@@ -177,8 +179,8 @@ class DoclingParsePageBackend(PdfPageBackend):
177
179
 
178
180
  return image
179
181
 
180
- def get_size(self) -> PageSize:
181
- return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
182
+ def get_size(self) -> Size:
183
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
182
184
 
183
185
  def unload(self):
184
186
  self._ppage = None
@@ -186,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
186
188
 
187
189
 
188
190
  class DoclingParseDocumentBackend(PdfDocumentBackend):
189
- def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
190
- super().__init__(path_or_stream, document_hash)
191
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
192
+ super().__init__(in_doc, path_or_stream)
191
193
 
192
- self._pdoc = pdfium.PdfDocument(path_or_stream)
194
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
193
195
  self.parser = pdf_parser()
194
196
 
195
197
  success = False
196
- if isinstance(path_or_stream, BytesIO):
198
+ if isinstance(self.path_or_stream, BytesIO):
197
199
  success = self.parser.load_document_from_bytesio(
198
- document_hash, path_or_stream
200
+ self.document_hash, self.path_or_stream
201
+ )
202
+ elif isinstance(self.path_or_stream, Path):
203
+ success = self.parser.load_document(
204
+ self.document_hash, str(self.path_or_stream)
199
205
  )
200
- elif isinstance(path_or_stream, Path):
201
- success = self.parser.load_document(document_hash, str(path_or_stream))
202
206
 
203
207
  if not success:
204
208
  raise RuntimeError(
205
- f"docling-parse could not load document {document_hash}."
209
+ f"docling-parse could not load document with hash {self.document_hash}."
206
210
  )
207
211
 
208
212
  def page_count(self) -> int:
@@ -0,0 +1,240 @@
1
+ import logging
2
+ import random
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
+
7
+ import pypdfium2 as pdfium
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin
9
+ from docling_parse.docling_parse import pdf_parser_v2
10
+ from PIL import Image, ImageDraw
11
+ from pypdfium2 import PdfPage
12
+
13
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
+ from docling.datamodel.base_models import Cell, Size
15
+
16
+ if TYPE_CHECKING:
17
+ from docling.datamodel.document import InputDocument
18
+
19
+ _log = logging.getLogger(__name__)
20
+
21
+
22
+ class DoclingParseV2PageBackend(PdfPageBackend):
23
+ def __init__(
24
+ self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
25
+ ):
26
+ self._ppage = page_obj
27
+ parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
28
+
29
+ self.valid = "pages" in parsed_page
30
+ if self.valid:
31
+ self._dpage = parsed_page["pages"][page_no]
32
+ else:
33
+ _log.info(
34
+ f"An error occured when loading page {page_no} of document {document_hash}."
35
+ )
36
+
37
+ def is_valid(self) -> bool:
38
+ return self.valid
39
+
40
+ def get_text_in_rect(self, bbox: BoundingBox) -> str:
41
+ if not self.valid:
42
+ return ""
43
+ # Find intersecting cells on the page
44
+ text_piece = ""
45
+ page_size = self.get_size()
46
+
47
+ parser_width = self._dpage["sanitized"]["dimension"]["width"]
48
+ parser_height = self._dpage["sanitized"]["dimension"]["height"]
49
+
50
+ scale = (
51
+ 1 # FIX - Replace with param in get_text_in_rect across backends (optional)
52
+ )
53
+
54
+ cells_data = self._dpage["sanitized"]["cells"]["data"]
55
+ cells_header = self._dpage["sanitized"]["cells"]["header"]
56
+
57
+ for i, cell_data in enumerate(cells_data):
58
+ x0 = cell_data[cells_header.index("x0")]
59
+ y0 = cell_data[cells_header.index("y0")]
60
+ x1 = cell_data[cells_header.index("x1")]
61
+ y1 = cell_data[cells_header.index("y1")]
62
+
63
+ cell_bbox = BoundingBox(
64
+ l=x0 * scale * page_size.width / parser_width,
65
+ b=y0 * scale * page_size.height / parser_height,
66
+ r=x1 * scale * page_size.width / parser_width,
67
+ t=y1 * scale * page_size.height / parser_height,
68
+ coord_origin=CoordOrigin.BOTTOMLEFT,
69
+ ).to_top_left_origin(page_height=page_size.height * scale)
70
+
71
+ overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
72
+
73
+ if overlap_frac > 0.5:
74
+ if len(text_piece) > 0:
75
+ text_piece += " "
76
+ text_piece += cell_data[cells_header.index("text")]
77
+
78
+ return text_piece
79
+
80
+ def get_text_cells(self) -> Iterable[Cell]:
81
+ cells: List[Cell] = []
82
+ cell_counter = 0
83
+
84
+ if not self.valid:
85
+ return cells
86
+
87
+ page_size = self.get_size()
88
+
89
+ parser_width = self._dpage["sanitized"]["dimension"]["width"]
90
+ parser_height = self._dpage["sanitized"]["dimension"]["height"]
91
+
92
+ cells_data = self._dpage["sanitized"]["cells"]["data"]
93
+ cells_header = self._dpage["sanitized"]["cells"]["header"]
94
+
95
+ for i, cell_data in enumerate(cells_data):
96
+ x0 = cell_data[cells_header.index("x0")]
97
+ y0 = cell_data[cells_header.index("y0")]
98
+ x1 = cell_data[cells_header.index("x1")]
99
+ y1 = cell_data[cells_header.index("y1")]
100
+
101
+ if x1 < x0:
102
+ x0, x1 = x1, x0
103
+ if y1 < y0:
104
+ y0, y1 = y1, y0
105
+
106
+ text_piece = cell_data[cells_header.index("text")]
107
+ cells.append(
108
+ Cell(
109
+ id=cell_counter,
110
+ text=text_piece,
111
+ bbox=BoundingBox(
112
+ # l=x0, b=y0, r=x1, t=y1,
113
+ l=x0 * page_size.width / parser_width,
114
+ b=y0 * page_size.height / parser_height,
115
+ r=x1 * page_size.width / parser_width,
116
+ t=y1 * page_size.height / parser_height,
117
+ coord_origin=CoordOrigin.BOTTOMLEFT,
118
+ ).to_top_left_origin(page_size.height),
119
+ )
120
+ )
121
+ cell_counter += 1
122
+
123
+ def draw_clusters_and_cells():
124
+ image = (
125
+ self.get_page_image()
126
+ ) # make new image to avoid drawing on the saved ones
127
+ draw = ImageDraw.Draw(image)
128
+ for c in cells:
129
+ x0, y0, x1, y1 = c.bbox.as_tuple()
130
+ cell_color = (
131
+ random.randint(30, 140),
132
+ random.randint(30, 140),
133
+ random.randint(30, 140),
134
+ )
135
+ draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
136
+ image.show()
137
+
138
+ # draw_clusters_and_cells()
139
+
140
+ return cells
141
+
142
+ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
143
+ AREA_THRESHOLD = 32 * 32
144
+
145
+ images = self._dpage["sanitized"]["images"]["data"]
146
+ images_header = self._dpage["sanitized"]["images"]["header"]
147
+
148
+ for row in images:
149
+ x0 = row[images_header.index("x0")]
150
+ y0 = row[images_header.index("y0")]
151
+ x1 = row[images_header.index("x1")]
152
+ y1 = row[images_header.index("y1")]
153
+
154
+ cropbox = BoundingBox.from_tuple(
155
+ (x0, y0, x1, y1), origin=CoordOrigin.BOTTOMLEFT
156
+ ).to_top_left_origin(self.get_size().height)
157
+
158
+ if cropbox.area() > AREA_THRESHOLD:
159
+ cropbox = cropbox.scaled(scale=scale)
160
+
161
+ yield cropbox
162
+
163
+ def get_page_image(
164
+ self, scale: float = 1, cropbox: Optional[BoundingBox] = None
165
+ ) -> Image.Image:
166
+
167
+ page_size = self.get_size()
168
+
169
+ if not cropbox:
170
+ cropbox = BoundingBox(
171
+ l=0,
172
+ r=page_size.width,
173
+ t=0,
174
+ b=page_size.height,
175
+ coord_origin=CoordOrigin.TOPLEFT,
176
+ )
177
+ padbox = BoundingBox(
178
+ l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
179
+ )
180
+ else:
181
+ padbox = cropbox.to_bottom_left_origin(page_size.height)
182
+ padbox.r = page_size.width - padbox.r
183
+ padbox.t = page_size.height - padbox.t
184
+
185
+ image = (
186
+ self._ppage.render(
187
+ scale=scale * 1.5,
188
+ rotation=0, # no additional rotation
189
+ crop=padbox.as_tuple(),
190
+ )
191
+ .to_pil()
192
+ .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
193
+ ) # We resize the image from 1.5x the given scale to make it sharper.
194
+
195
+ return image
196
+
197
+ def get_size(self) -> Size:
198
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
199
+
200
+ def unload(self):
201
+ self._ppage = None
202
+ self._dpage = None
203
+
204
+
205
+ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
206
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
207
+ super().__init__(in_doc, path_or_stream)
208
+
209
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
210
+ self.parser = pdf_parser_v2("fatal")
211
+
212
+ success = False
213
+ if isinstance(path_or_stream, BytesIO):
214
+ success = self.parser.load_document_from_bytesio(
215
+ self.document_hash, path_or_stream
216
+ )
217
+ elif isinstance(path_or_stream, Path):
218
+ success = self.parser.load_document(self.document_hash, str(path_or_stream))
219
+
220
+ if not success:
221
+ raise RuntimeError(
222
+ f"docling-parse v2 could not load document {self.document_hash}."
223
+ )
224
+
225
+ def page_count(self) -> int:
226
+ return len(self._pdoc) # To be replaced with docling-parse API
227
+
228
+ def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
229
+ return DoclingParseV2PageBackend(
230
+ self.parser, self.document_hash, page_no, self._pdoc[page_no]
231
+ )
232
+
233
+ def is_valid(self) -> bool:
234
+ return self.page_count() > 0
235
+
236
+ def unload(self):
237
+ super().unload()
238
+ self.parser.unload_document(self.document_hash)
239
+ self._pdoc.close()
240
+ self._pdoc = None