PyPI - docling - Versions diffs - 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

docling 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

docling/backend/docling_parse_backend.py +15 -1
docling/backend/pypdfium2_backend.py +3 -1
docling/datamodel/base_models.py +41 -10
docling/datamodel/document.py +16 -1
docling/document_converter.py +12 -6
docling/models/easyocr_model.py +1 -1
docling/models/layout_model.py +3 -1
docling/models/table_structure_model.py +4 -8
{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/METADATA +11 -8
{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/RECORD +12 -12
{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/LICENSE +0 -0
{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/WHEEL +0 -0

docling/backend/docling_parse_backend.py CHANGED Viewed

@@ -1,4 +1,6 @@
+import logging
 import random
+import time
 from io import BytesIO
 from pathlib import Path
 from typing import Iterable, List, Optional, Union
@@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
 from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+_log = logging.getLogger(__name__)
 class DoclingParsePageBackend(PdfPageBackend):
     def __init__(self, page_obj: PdfPage, docling_page_obj):
@@ -80,7 +84,9 @@ class DoclingParsePageBackend(PdfPageBackend):
             cell_counter += 1
         def draw_clusters_and_cells():
-            image = self.get_page_image()
+            image = (
+                self.get_page_image()
+            )  # make new image to avoid drawing on the saved ones
             draw = ImageDraw.Draw(image)
             for c in cells:
                 x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -151,11 +157,19 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
         self._pdoc = pdfium.PdfDocument(path_or_stream)
         # Parsing cells with docling_parser call
         parser = pdf_parser()
+        start_pb_time = time.time()
         if isinstance(path_or_stream, BytesIO):
             self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
         else:
             self._parser_doc = parser.find_cells(str(path_or_stream))
+        end_pb_time = time.time() - start_pb_time
+        _log.info(
+            f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
+        )
     def page_count(self) -> int:
         return len(self._parser_doc["pages"])

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
             return merged_cells
         def draw_clusters_and_cells():
-            image = self.get_page_image()
+            image = (
+                self.get_page_image()
+            )  # make new image to avoid drawing on the saved ones
             draw = ImageDraw.Draw(image)
             for c in cells:
                 x0, y0, x1, y1 = c.bbox.as_tuple()

docling/datamodel/base_models.py CHANGED Viewed

@@ -1,10 +1,12 @@
 import copy
+import warnings
 from enum import Enum, auto
 from io import BytesIO
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict, model_validator
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from typing_extensions import Self
 from docling.backend.abstract_backend import PdfPageBackend
@@ -234,14 +236,30 @@ class Page(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
     page_no: int
-    page_hash: str = None
-    size: PageSize = None
-    image: Image = None
+    page_hash: Optional[str] = None
+    size: Optional[PageSize] = None
     cells: List[Cell] = None
     predictions: PagePredictions = PagePredictions()
-    assembled: AssembledUnit = None
+    assembled: Optional[AssembledUnit] = None
-    _backend: PdfPageBackend = None  # Internal PDF backend
+    _backend: Optional[PdfPageBackend] = (
+        None  # Internal PDF backend. By default it is cleared during assembling.
+    )
+    _default_image_scale: float = 1.0  # Default image scale for external usage.
+    _image_cache: Dict[float, Image] = (
+        {}
+    )  # Cache of images in different scales. By default it is cleared during assembling.
+    def get_image(self, scale: float = 1.0) -> Optional[Image]:
+        if self._backend is None:
+            return self._image_cache.get(scale, None)
+        if not scale in self._image_cache:
+            self._image_cache[scale] = self._backend.get_page_image(scale=scale)
+        return self._image_cache[scale]
+    @property
+    def image(self) -> Optional[Image]:
+        return self.get_image(scale=self._default_image_scale)
 class DocumentStream(BaseModel):
@@ -268,6 +286,19 @@ class PipelineOptions(BaseModel):
 class AssembleOptions(BaseModel):
-    keep_page_images: bool = (
-        False  # False: page images are removed in the assemble step
-    )
+    keep_page_images: Annotated[
+        bool,
+        Field(
+            deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
+        ),
+    ] = False  # False: page images are removed in the assemble step
+    images_scale: Optional[float] = None  # if set, the scale for generated images
+    @model_validator(mode="after")
+    def set_page_images_from_deprecated(self) -> Self:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", DeprecationWarning)
+            default_scale = 1.0
+            if self.keep_page_images and self.images_scale is None:
+                self.images_scale = default_scale
+        return self

docling/datamodel/document.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
 from docling_core.types import BaseCell, BaseText
 from docling_core.types import BoundingBox as DsBoundingBox
@@ -21,6 +21,7 @@ from docling.datamodel.base_models import (
     DocumentStream,
     FigureElement,
     Page,
+    PageElement,
     TableElement,
     TextElement,
 )
@@ -302,6 +303,20 @@ class ConvertedDocument(BaseModel):
         else:
             return ""
+    def render_element_images(
+        self, element_types: Tuple[PageElement] = (FigureElement,)
+    ):
+        for element in self.assembled.elements:
+            if isinstance(element, element_types):
+                page_ix = element.page_no
+                scale = self.pages[page_ix]._default_image_scale
+                crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
+                    page_height=self.pages[page_ix].size.height * scale
+                )
+                cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
+                yield element, cropped_im
 class DocumentConversionInput(BaseModel):

docling/document_converter.py CHANGED Viewed

@@ -188,10 +188,8 @@ class DocumentConverter:
                     # Free up mem resources before moving on with next batch
                     # Remove page images (can be disabled)
-                    if not self.assemble_options.keep_page_images:
-                        assembled_page.image = (
-                            None  # Comment this if you want to visualize page images
-                        )
+                    if self.assemble_options.images_scale is None:
+                        assembled_page._image_cache = {}
                     # Unload backend
                     assembled_page._backend.unload()
@@ -231,7 +229,15 @@ class DocumentConverter:
     # Generate the page image and store it in the page object
     def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        page.image = page._backend.get_page_image()
+        # default scale
+        page.get_image(scale=1.0)
+        # user requested scales
+        if self.assemble_options.images_scale is not None:
+            page._default_image_scale = self.assemble_options.images_scale
+            page.get_image(
+                scale=self.assemble_options.images_scale
+            )  # this will trigger storing the image in the internal cache
         return page
@@ -247,7 +253,7 @@ class DocumentConverter:
                 draw.rectangle([(x0, y0), (x1, y1)], outline="red")
             image.show()
-        # draw_text_boxes(page.image, cells)
+        # draw_text_boxes(page.get_image(scale=1.0), cells)
         return page

docling/models/easyocr_model.py CHANGED Viewed

@@ -30,7 +30,7 @@ class EasyOcrModel:
         for page in page_batch:
             # rects = page._fpage.
-            high_res_image = page._backend.get_page_image(scale=self.scale)
+            high_res_image = page.get_image(scale=self.scale)
             im = numpy.array(high_res_image)
             result = self.reader.readtext(im)

docling/models/layout_model.py CHANGED Viewed

@@ -267,7 +267,9 @@ class LayoutModel:
     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
         for page in page_batch:
             clusters = []
-            for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
+            for ix, pred_item in enumerate(
+                self.layout_predictor.predict(page.get_image(scale=1.0))
+            ):
                 cluster = Cluster(
                     id=ix,
                     label=pred_item["label"],

docling/models/table_structure_model.py CHANGED Viewed

@@ -34,7 +34,9 @@ class TableStructureModel:
             self.scale = 2.0  # Scale up table input images to 144 dpi
     def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
-        image = page._backend.get_page_image()
+        image = (
+            page._backend.get_page_image()
+        )  # make new image to avoid drawing on the saved ones
         draw = ImageDraw.Draw(image)
         for table_element in tbl_list:
@@ -94,13 +96,7 @@ class TableStructureModel:
                 "width": page.size.width * self.scale,
                 "height": page.size.height * self.scale,
             }
-            # add image to page input.
-            if self.scale == 1.0:
-                page_input["image"] = numpy.asarray(page.image)
-            else:  # render new page image on the fly at desired scale
-                page_input["image"] = numpy.asarray(
-                    page._backend.get_page_image(scale=self.scale)
-                )
+            page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
             table_clusters, table_bboxes = zip(*in_tables)

{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.4.0
+Version: 1.5.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -44,6 +44,7 @@ Description-Content-Type: text/markdown
 # Docling
+[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
 [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
 ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
 [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -172,13 +173,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
 If you use Docling in your projects, please consider citing the following:
 ```bib
-@software{Docling,
-author = {Deep Search Team},
-month = {7},
-title = {{Docling}},
-url = {https://github.com/DS4SD/docling},
-version = {main},
-year = {2024}
+@techreport{Docling,
+  author = {Deep Search Team},
+  month = {8},
+  title = {{Docling Technical Report}},
+  url={https://arxiv.org/abs/2408.09869},
+  eprint={2408.09869},
+  doi = "10.48550/arXiv.2408.09869",
+  version = {1.0.0},
+  year = {2024}
 }
 ```

{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/RECORD RENAMED Viewed

@@ -1,26 +1,26 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
-docling/backend/docling_parse_backend.py,sha256=bgsmnwDmroBwuOwkEUzlN9KMEIFJ1xUaCZW6rsr5G-c,5924
-docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
+docling/backend/docling_parse_backend.py,sha256=-bIjYJ-80R2SArAEw_lAyzgW5_BFEoX83n1oBMmUGF4,6284
+docling/backend/pypdfium2_backend.py,sha256=3Qeeal8z6DunUe4S10Z2TXrdeucanCpa8evt6SQtpKQ,7496
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
-docling/datamodel/document.py,sha256=lZHXINmPWvpzrV3PTilgJs1blqTMCnJdLEww_qfcqdE,12533
+docling/datamodel/base_models.py,sha256=uOq0zjUS60aIkROREiypp3Jn1yqQTlWEf34jXTT43ls,8391
+docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
 docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
-docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
+docling/document_converter.py,sha256=r9z48VjL_hkq-rbAgyZ135njzUGBJ5AnhEH6-1zfyCA,10490
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
-docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
-docling/models/layout_model.py,sha256=3mOgNvCYPh99_oLxJy-ZaIqGOFgG5bcIQ0tTubW656Q,11204
+docling/models/easyocr_model.py,sha256=Y-RWolIFE3By6gk8dnb2qFy7Cr9qcHs6eo65fWPT0Nc,2276
+docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
 docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
-docling/models/table_structure_model.py,sha256=xUmfunZNYC30P0fRdESdztqy1FVlMzlhJjLBp-xcn4A,5638
+docling/models/table_structure_model.py,sha256=lKsodvfZaGwxOHp-CbRW5nzCKZYMwf770h0Ka6Bdbgw,5451
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/base_model_pipeline.py,sha256=ozHdJak0yQAxQf7pQN_C480vI35A2e5KL5Qq1xSkq5c,560
 docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFvpdUvyVw-gNmBlM,1541
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-1.4.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
-docling-1.4.0.dist-info/METADATA,sha256=Hu8pvrxpc0b1qzQvvzI_ijRAQWjOfcfNl4_1Zb7oyoc,7042
-docling-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-1.4.0.dist-info/RECORD,,
+docling-1.5.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
+docling-1.5.0.dist-info/METADATA,sha256=jWcjsrdfYcpeYFCRQ1h5C1b8MyaKsJWyUhGheXQEGvY,7235
+docling-1.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-1.5.0.dist-info/RECORD,,

{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/WHEEL RENAMED Viewed

File without changes

docling 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

docling 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl