PyPI - docling - Versions diffs - 1.1.2__py3-none-any.whl → 1.2.1__py3-none-any.whl - Mend

docling 1.1.2__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

docling/backend/abstract_backend.py CHANGED Viewed

@@ -35,7 +35,7 @@ class PdfPageBackend(ABC):
 class PdfDocumentBackend(ABC):
     @abstractmethod
-    def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
+    def __init__(self, path_or_stream: Union[BytesIO, Path]):
         pass
     @abstractmethod

docling/backend/docling_parse_backend.py ADDED Viewed

@@ -0,0 +1,172 @@
+import random
+from io import BytesIO
+from pathlib import Path
+from typing import Iterable, List, Optional, Union
+import pypdfium2 as pdfium
+from docling_parse.docling_parse import pdf_parser
+from PIL import Image, ImageDraw
+from pypdfium2 import PdfPage
+from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+class DoclingParsePageBackend(PdfPageBackend):
+    """Page backend that pairs a pypdfium2 page with docling-parse cell data.
+    The pypdfium2 page supplies the page size and the rendered image; the
+    docling-parse page mapping supplies the text cells and the page
+    dimensions the parser used for its coordinates.
+    """
+    def __init__(self, page_obj: PdfPage, docling_page_obj):
+        """Keep references to the pypdfium2 page and the parsed page mapping.
+        docling_page_obj is read as a mapping with "width", "height" and
+        "cells" keys; each cell is read through ["box"]["device"] (four
+        numbers) and ["content"]["rnormalized"] (the cell text).
+        """
+        super().__init__(page_obj)
+        self._ppage = page_obj
+        self._dpage = docling_page_obj
+        self.text_page = None
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        """Return the text of all cells that lie mostly inside bbox.
+        A cell contributes when more than half of its own area intersects
+        bbox; contributing texts are separated by single spaces. Cells
+        with zero area are skipped because their overlap fraction is
+        undefined (the original code raised ZeroDivisionError for them).
+        """
+        text_piece = ""
+        page_size = self.get_size()
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+        for parsed_cell in self._dpage["cells"]:
+            x0, y0, x1, y1 = parsed_cell["box"]["device"]
+            # Rescale from the parser's page dimensions to the pypdfium2
+            # page size, then convert to a top-left origin.
+            cell_bbox = BoundingBox(
+                l=x0 * scale * page_size.width / parser_width,
+                b=y0 * scale * page_size.height / parser_height,
+                r=x1 * scale * page_size.width / parser_width,
+                t=y1 * scale * page_size.height / parser_height,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            ).to_top_left_origin(page_size.height * scale)
+            cell_area = cell_bbox.area()
+            if cell_area == 0:
+                # Degenerate cell: nothing to divide by, so it cannot be "mostly inside".
+                continue
+            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += parsed_cell["content"]["rnormalized"]
+        return text_piece
+    def get_text_cells(self) -> Iterable[Cell]:
+        """Return every parsed cell as a Cell with a top-left-origin bbox.
+        Cell ids are the positions of the cells in the parser output,
+        starting at 0. Coordinates are rescaled from the parser's page
+        dimensions to the pypdfium2 page size.
+        """
+        page_size = self.get_size()
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
+        cells = []
+        for cell_id, parsed_cell in enumerate(self._dpage["cells"]):
+            x0, y0, x1, y1 = parsed_cell["box"]["device"]
+            cells.append(
+                Cell(
+                    id=cell_id,
+                    text=parsed_cell["content"]["rnormalized"],
+                    bbox=BoundingBox(
+                        l=x0 * page_size.width / parser_width,
+                        b=y0 * page_size.height / parser_height,
+                        r=x1 * page_size.width / parser_width,
+                        t=y1 * page_size.height / parser_height,
+                        coord_origin=CoordOrigin.BOTTOMLEFT,
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+        def draw_clusters_and_cells():
+            """Debug helper: show the page image with every cell outlined in a random colour."""
+            image = self.get_page_image()
+            draw = ImageDraw.Draw(image)
+            for c in cells:
+                x0, y0, x1, y1 = c.bbox.as_tuple()
+                cell_color = (
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                )
+                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+            image.show()
+        # The helper above is intentionally not called in normal operation.
+        # before merge:
+        # draw_clusters_and_cells()
+        # cells = merge_horizontal_cells(cells)
+        # after merge:
+        # draw_clusters_and_cells()
+        return cells
+    def get_page_image(
+        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        """Render the page (or a cropped region of it) as a PIL image.
+        The page is rendered at 1.5x the requested scale and then resized
+        down to cropbox size times scale. When cropbox is not given, the
+        whole page is used and nothing is cropped.
+        """
+        page_size = self.get_size()
+        if not cropbox:
+            cropbox = BoundingBox(
+                l=0,
+                r=page_size.width,
+                t=0,
+                b=page_size.height,
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
+            padbox = BoundingBox(
+                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
+            )
+        else:
+            # Build a fresh padbox instead of mutating the converted box in
+            # place: if the conversion hands back the same object, in-place
+            # edits would corrupt the caller's cropbox and the resize below.
+            bottom_left = cropbox.to_bottom_left_origin(page_size.height)
+            # r and t become distances from the right and top page edges;
+            # presumably this matches the per-border crop amounts expected
+            # by pypdfium2's render() - confirm against its documentation.
+            padbox = BoundingBox(
+                l=bottom_left.l,
+                b=bottom_left.b,
+                r=page_size.width - bottom_left.r,
+                t=page_size.height - bottom_left.t,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            )
+        image = (
+            self._ppage.render(
+                scale=scale * 1.5,
+                rotation=0,  # no additional rotation
+                crop=padbox.as_tuple(),
+            )
+            .to_pil()
+            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
+        )  # We resize the image from 1.5x the given scale to make it sharper.
+        return image
+    def get_size(self) -> PageSize:
+        """Return the page width and height as reported by the pypdfium2 page."""
+        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
+    def unload(self):
+        """Drop the references to the page objects held by this backend."""
+        self._ppage = None
+        self._dpage = None
+        self.text_page = None
+class DoclingParseDocumentBackend(PdfDocumentBackend):
+    """Document backend that reads text cells with docling-parse and pages with pypdfium2.
+    Only filesystem paths are supported; BytesIO input is rejected.
+    """
+    def __init__(self, path_or_stream: Union[BytesIO, Path]):
+        """Open the PDF with pypdfium2 and parse its cells with docling-parse.
+        Raises:
+            NotImplementedError: if path_or_stream is a BytesIO, because the
+                parser is invoked with a file path string.
+        """
+        super().__init__(path_or_stream)
+        # Reject streams before opening anything, so no pdfium document is
+        # left open on the failure path. NotImplementedError is the exception
+        # class; the NotImplemented constant is not callable.
+        if isinstance(path_or_stream, BytesIO):
+            raise NotImplementedError("This backend does not support byte streams yet.")
+        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        # Parsing cells with docling_parser call
+        parser = pdf_parser()
+        self._parser_doc = parser.find_cells(str(path_or_stream))
+    def page_count(self) -> int:
+        """Return the number of pages found in the docling-parse output."""
+        return len(self._parser_doc["pages"])
+    def load_page(self, page_no: int) -> DoclingParsePageBackend:
+        """Return a page backend for index page_no, taken from both the pdfium document and the parser output."""
+        return DoclingParsePageBackend(
+            self._pdoc[page_no], self._parser_doc["pages"][page_no]
+        )
+    def is_valid(self) -> bool:
+        """Return True when the parsed document has at least one page."""
+        return self.page_count() > 0
+    def unload(self):
+        """Close the pdfium document and drop the parsed data; safe to call more than once."""
+        if self._pdoc is not None:
+            self._pdoc.close()
+        self._pdoc = None
+        self._parser_doc = None

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -199,7 +199,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
+    def __init__(self, path_or_stream: Union[BytesIO, Path]):
         super().__init__(path_or_stream)
         self._pdoc = pdfium.PdfDocument(path_or_stream)

{docling-1.1.2.dist-info → docling-1.2.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.1.2
+Version: 1.2.1
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -21,9 +21,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: easyocr
 Provides-Extra: ocr
+Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.19.0,<1)
 Requires-Dist: docling-core (>=1.1.2,<2.0.0)
 Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
+Requires-Dist: docling-parse (>=0.0.1,<0.0.2)
 Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -92,17 +94,21 @@ print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotate
 ### Convert a batch of documents
-For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
 From a local repo clone, you can run it with:
 ```
-python examples/convert.py
+python examples/batch_convert.py
 ```
 The output of the above command will be written to `./scratch`.
 ### Adjust pipeline features
+The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
+one can adjust the conversion pipeline and features.
 #### Control pipeline options
 You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:

{docling-1.1.2.dist-info → docling-1.2.1.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,8 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/backend/abstract_backend.py,sha256=dINr8oTax9Fq31Y1AR0CGWNZtAHN5aqB_M7TAPkJNVQ,1122
-docling/backend/pypdfium2_backend.py,sha256=cIQGFkwzceN57PzmACt06CytRo0A_t-im6rW804RC3M,7421
+docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
+docling/backend/docling_parse_backend.py,sha256=mGuJCpMVqyrZK-cXKRWrELPz0Wt1h6uydx4QwWI1rew,5912
+docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/base_models.py,sha256=k7gLFPnq3ArEMAFz6qUcp5qemlYzVhOmR9qtBTkAiX4,6862
 docling/datamodel/document.py,sha256=FG_ntDFRBWj-MhV52D0sC8XaZOwN3yryyXahsVHGnyI,12517
@@ -19,7 +20,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-1.1.2.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
-docling-1.1.2.dist-info/METADATA,sha256=3uSlwJCchlMWLELW4Sr3L6apbAPt4sOZem3T7NlglU8,6756
-docling-1.1.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-1.1.2.dist-info/RECORD,,
+docling-1.2.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
+docling-1.2.1.dist-info/METADATA,sha256=EwclgZsLCgm5qOT5na5QRxSwSKYZCIfSeotTlwMRyYk,7042
+docling-1.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-1.2.1.dist-info/RECORD,,

{docling-1.1.2.dist-info → docling-1.2.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-1.1.2.dist-info → docling-1.2.1.dist-info}/WHEEL RENAMED Viewed

File without changes

docling 1.1.2__py3-none-any.whl → 1.2.1__py3-none-any.whl

docling 1.1.2__py3-none-any.whl → 1.2.1__py3-none-any.whl