docling 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,4 +1,6 @@
1
+ import logging
1
2
  import random
3
+ import time
2
4
  from io import BytesIO
3
5
  from pathlib import Path
4
6
  from typing import Iterable, List, Optional, Union
@@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
11
13
  from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
12
14
  from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
15
 
16
+ _log = logging.getLogger(__name__)
17
+
14
18
 
15
19
  class DoclingParsePageBackend(PdfPageBackend):
16
20
  def __init__(self, page_obj: PdfPage, docling_page_obj):
@@ -80,7 +84,9 @@ class DoclingParsePageBackend(PdfPageBackend):
80
84
  cell_counter += 1
81
85
 
82
86
  def draw_clusters_and_cells():
83
- image = self.get_page_image()
87
+ image = (
88
+ self.get_page_image()
89
+ ) # make new image to avoid drawing on the saved ones
84
90
  draw = ImageDraw.Draw(image)
85
91
  for c in cells:
86
92
  x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -151,11 +157,19 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
151
157
  self._pdoc = pdfium.PdfDocument(path_or_stream)
152
158
  # Parsing cells with docling_parser call
153
159
  parser = pdf_parser()
160
+
161
+ start_pb_time = time.time()
162
+
154
163
  if isinstance(path_or_stream, BytesIO):
155
164
  self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
156
165
  else:
157
166
  self._parser_doc = parser.find_cells(str(path_or_stream))
158
167
 
168
+ end_pb_time = time.time() - start_pb_time
169
+ _log.info(
170
+ f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
171
+ )
172
+
159
173
  def page_count(self) -> int:
160
174
  return len(self._parser_doc["pages"])
161
175
 
@@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
134
134
  return merged_cells
135
135
 
136
136
  def draw_clusters_and_cells():
137
- image = self.get_page_image()
137
+ image = (
138
+ self.get_page_image()
139
+ ) # make new image to avoid drawing on the saved ones
138
140
  draw = ImageDraw.Draw(image)
139
141
  for c in cells:
140
142
  x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -1,10 +1,12 @@
1
1
  import copy
2
+ import warnings
2
3
  from enum import Enum, auto
3
4
  from io import BytesIO
4
- from typing import Any, Dict, List, Optional, Tuple, Union
5
+ from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
5
6
 
6
7
  from PIL.Image import Image
7
- from pydantic import BaseModel, ConfigDict, model_validator
8
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
9
+ from typing_extensions import Self
8
10
 
9
11
  from docling.backend.abstract_backend import PdfPageBackend
10
12
 
@@ -234,14 +236,30 @@ class Page(BaseModel):
234
236
  model_config = ConfigDict(arbitrary_types_allowed=True)
235
237
 
236
238
  page_no: int
237
- page_hash: str = None
238
- size: PageSize = None
239
- image: Image = None
239
+ page_hash: Optional[str] = None
240
+ size: Optional[PageSize] = None
240
241
  cells: List[Cell] = None
241
242
  predictions: PagePredictions = PagePredictions()
242
- assembled: AssembledUnit = None
243
+ assembled: Optional[AssembledUnit] = None
243
244
 
244
- _backend: PdfPageBackend = None # Internal PDF backend
245
+ _backend: Optional[PdfPageBackend] = (
246
+ None # Internal PDF backend. By default it is cleared during assembling.
247
+ )
248
+ _default_image_scale: float = 1.0 # Default image scale for external usage.
249
+ _image_cache: Dict[float, Image] = (
250
+ {}
251
+ ) # Cache of images in different scales. By default it is cleared during assembling.
252
+
253
+ def get_image(self, scale: float = 1.0) -> Optional[Image]:
254
+ if self._backend is None:
255
+ return self._image_cache.get(scale, None)
256
+ if not scale in self._image_cache:
257
+ self._image_cache[scale] = self._backend.get_page_image(scale=scale)
258
+ return self._image_cache[scale]
259
+
260
+ @property
261
+ def image(self) -> Optional[Image]:
262
+ return self.get_image(scale=self._default_image_scale)
245
263
 
246
264
 
247
265
  class DocumentStream(BaseModel):
@@ -268,6 +286,19 @@ class PipelineOptions(BaseModel):
268
286
 
269
287
 
270
288
  class AssembleOptions(BaseModel):
271
- keep_page_images: bool = (
272
- False # False: page images are removed in the assemble step
273
- )
289
+ keep_page_images: Annotated[
290
+ bool,
291
+ Field(
292
+ deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
293
+ ),
294
+ ] = False # False: page images are removed in the assemble step
295
+ images_scale: Optional[float] = None # if set, the scale for generated images
296
+
297
+ @model_validator(mode="after")
298
+ def set_page_images_from_deprecated(self) -> Self:
299
+ with warnings.catch_warnings():
300
+ warnings.simplefilter("ignore", DeprecationWarning)
301
+ default_scale = 1.0
302
+ if self.keep_page_images and self.images_scale is None:
303
+ self.images_scale = default_scale
304
+ return self
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path, PurePath
4
- from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
4
+ from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
5
5
 
6
6
  from docling_core.types import BaseCell, BaseText
7
7
  from docling_core.types import BoundingBox as DsBoundingBox
@@ -21,6 +21,7 @@ from docling.datamodel.base_models import (
21
21
  DocumentStream,
22
22
  FigureElement,
23
23
  Page,
24
+ PageElement,
24
25
  TableElement,
25
26
  TextElement,
26
27
  )
@@ -302,6 +303,20 @@ class ConvertedDocument(BaseModel):
302
303
  else:
303
304
  return ""
304
305
 
306
+ def render_element_images(
307
+ self, element_types: Tuple[PageElement] = (FigureElement,)
308
+ ):
309
+ for element in self.assembled.elements:
310
+ if isinstance(element, element_types):
311
+ page_ix = element.page_no
312
+ scale = self.pages[page_ix]._default_image_scale
313
+ crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
314
+ page_height=self.pages[page_ix].size.height * scale
315
+ )
316
+
317
+ cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
318
+ yield element, cropped_im
319
+
305
320
 
306
321
  class DocumentConversionInput(BaseModel):
307
322
 
@@ -188,10 +188,8 @@ class DocumentConverter:
188
188
  # Free up mem resources before moving on with next batch
189
189
 
190
190
  # Remove page images (can be disabled)
191
- if not self.assemble_options.keep_page_images:
192
- assembled_page.image = (
193
- None # Comment this if you want to visualize page images
194
- )
191
+ if self.assemble_options.images_scale is None:
192
+ assembled_page._image_cache = {}
195
193
 
196
194
  # Unload backend
197
195
  assembled_page._backend.unload()
@@ -231,7 +229,15 @@ class DocumentConverter:
231
229
 
232
230
  # Generate the page image and store it in the page object
233
231
  def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
234
- page.image = page._backend.get_page_image()
232
+ # default scale
233
+ page.get_image(scale=1.0)
234
+
235
+ # user requested scales
236
+ if self.assemble_options.images_scale is not None:
237
+ page._default_image_scale = self.assemble_options.images_scale
238
+ page.get_image(
239
+ scale=self.assemble_options.images_scale
240
+ ) # this will trigger storing the image in the internal cache
235
241
 
236
242
  return page
237
243
 
@@ -247,7 +253,7 @@ class DocumentConverter:
247
253
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
248
254
  image.show()
249
255
 
250
- # draw_text_boxes(page.image, cells)
256
+ # draw_text_boxes(page.get_image(scale=1.0), cells)
251
257
 
252
258
  return page
253
259
 
@@ -30,7 +30,7 @@ class EasyOcrModel:
30
30
 
31
31
  for page in page_batch:
32
32
  # rects = page._fpage.
33
- high_res_image = page._backend.get_page_image(scale=self.scale)
33
+ high_res_image = page.get_image(scale=self.scale)
34
34
  im = numpy.array(high_res_image)
35
35
  result = self.reader.readtext(im)
36
36
 
@@ -267,7 +267,9 @@ class LayoutModel:
267
267
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
268
268
  for page in page_batch:
269
269
  clusters = []
270
- for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
270
+ for ix, pred_item in enumerate(
271
+ self.layout_predictor.predict(page.get_image(scale=1.0))
272
+ ):
271
273
  cluster = Cluster(
272
274
  id=ix,
273
275
  label=pred_item["label"],
@@ -34,7 +34,9 @@ class TableStructureModel:
34
34
  self.scale = 2.0 # Scale up table input images to 144 dpi
35
35
 
36
36
  def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
37
- image = page._backend.get_page_image()
37
+ image = (
38
+ page._backend.get_page_image()
39
+ ) # make new image to avoid drawing on the saved ones
38
40
  draw = ImageDraw.Draw(image)
39
41
 
40
42
  for table_element in tbl_list:
@@ -94,13 +96,7 @@ class TableStructureModel:
94
96
  "width": page.size.width * self.scale,
95
97
  "height": page.size.height * self.scale,
96
98
  }
97
- # add image to page input.
98
- if self.scale == 1.0:
99
- page_input["image"] = numpy.asarray(page.image)
100
- else: # render new page image on the fly at desired scale
101
- page_input["image"] = numpy.asarray(
102
- page._backend.get_page_image(scale=self.scale)
103
- )
99
+ page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
104
100
 
105
101
  table_clusters, table_bboxes = zip(*in_tables)
106
102
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.4.0
3
+ Version: 1.5.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -44,6 +44,7 @@ Description-Content-Type: text/markdown
44
44
 
45
45
  # Docling
46
46
 
47
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
47
48
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
48
49
  ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
49
50
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -172,13 +173,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
172
173
  If you use Docling in your projects, please consider citing the following:
173
174
 
174
175
  ```bib
175
- @software{Docling,
176
- author = {Deep Search Team},
177
- month = {7},
178
- title = {{Docling}},
179
- url = {https://github.com/DS4SD/docling},
180
- version = {main},
181
- year = {2024}
176
+ @techreport{Docling,
177
+ author = {Deep Search Team},
178
+ month = {8},
179
+ title = {{Docling Technical Report}},
180
+ url={https://arxiv.org/abs/2408.09869},
181
+ eprint={2408.09869},
182
+ doi = "10.48550/arXiv.2408.09869",
183
+ version = {1.0.0},
184
+ year = {2024}
182
185
  }
183
186
  ```
184
187
 
@@ -1,26 +1,26 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
4
- docling/backend/docling_parse_backend.py,sha256=bgsmnwDmroBwuOwkEUzlN9KMEIFJ1xUaCZW6rsr5G-c,5924
5
- docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
4
+ docling/backend/docling_parse_backend.py,sha256=-bIjYJ-80R2SArAEw_lAyzgW5_BFEoX83n1oBMmUGF4,6284
5
+ docling/backend/pypdfium2_backend.py,sha256=3Qeeal8z6DunUe4S10Z2TXrdeucanCpa8evt6SQtpKQ,7496
6
6
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
8
- docling/datamodel/document.py,sha256=lZHXINmPWvpzrV3PTilgJs1blqTMCnJdLEww_qfcqdE,12533
7
+ docling/datamodel/base_models.py,sha256=uOq0zjUS60aIkROREiypp3Jn1yqQTlWEf34jXTT43ls,8391
8
+ docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
9
9
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
- docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
10
+ docling/document_converter.py,sha256=r9z48VjL_hkq-rbAgyZ135njzUGBJ5AnhEH6-1zfyCA,10490
11
11
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
13
- docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
14
- docling/models/layout_model.py,sha256=3mOgNvCYPh99_oLxJy-ZaIqGOFgG5bcIQ0tTubW656Q,11204
13
+ docling/models/easyocr_model.py,sha256=Y-RWolIFE3By6gk8dnb2qFy7Cr9qcHs6eo65fWPT0Nc,2276
14
+ docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
15
15
  docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
16
- docling/models/table_structure_model.py,sha256=xUmfunZNYC30P0fRdESdztqy1FVlMzlhJjLBp-xcn4A,5638
16
+ docling/models/table_structure_model.py,sha256=lKsodvfZaGwxOHp-CbRW5nzCKZYMwf770h0Ka6Bdbgw,5451
17
17
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  docling/pipeline/base_model_pipeline.py,sha256=ozHdJak0yQAxQf7pQN_C480vI35A2e5KL5Qq1xSkq5c,560
19
19
  docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFvpdUvyVw-gNmBlM,1541
20
20
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
22
22
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
23
- docling-1.4.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
- docling-1.4.0.dist-info/METADATA,sha256=Hu8pvrxpc0b1qzQvvzI_ijRAQWjOfcfNl4_1Zb7oyoc,7042
25
- docling-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
- docling-1.4.0.dist-info/RECORD,,
23
+ docling-1.5.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
24
+ docling-1.5.0.dist-info/METADATA,sha256=jWcjsrdfYcpeYFCRQ1h5C1b8MyaKsJWyUhGheXQEGvY,7235
25
+ docling-1.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
26
+ docling-1.5.0.dist-info/RECORD,,