PyPI - docling - Versions diffs - 1.19.1__py3-none-any.whl → 2.1.0__py3-none-any.whl - Mend

docling 1.19.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

docling/backend/abstract_backend.py +32 -37
docling/backend/docling_parse_backend.py +16 -12
docling/backend/docling_parse_v2_backend.py +240 -0
docling/backend/html_backend.py +425 -0
docling/backend/mspowerpoint_backend.py +375 -0
docling/backend/msword_backend.py +509 -0
docling/backend/pdf_backend.py +78 -0
docling/backend/pypdfium2_backend.py +15 -10
docling/cli/main.py +61 -60
docling/datamodel/base_models.py +73 -193
docling/datamodel/document.py +379 -324
docling/datamodel/pipeline_options.py +16 -0
docling/datamodel/settings.py +1 -0
docling/document_converter.py +215 -252
docling/models/base_model.py +25 -0
docling/models/base_ocr_model.py +19 -6
docling/models/ds_glm_model.py +220 -22
docling/models/easyocr_model.py +45 -40
docling/models/layout_model.py +130 -114
docling/models/page_assemble_model.py +119 -95
docling/models/page_preprocessing_model.py +61 -0
docling/models/table_structure_model.py +122 -111
docling/models/tesseract_ocr_cli_model.py +63 -56
docling/models/tesseract_ocr_model.py +58 -50
docling/pipeline/base_pipeline.py +190 -0
docling/pipeline/simple_pipeline.py +59 -0
docling/pipeline/standard_pdf_pipeline.py +198 -0
docling/utils/export.py +4 -3
docling/utils/layout_utils.py +17 -11
docling-2.1.0.dist-info/METADATA +149 -0
docling-2.1.0.dist-info/RECORD +42 -0
docling/pipeline/base_model_pipeline.py +0 -18
docling/pipeline/standard_model_pipeline.py +0 -66
docling-1.19.1.dist-info/METADATA +0 -380
docling-1.19.1.dist-info/RECORD +0 -34
{docling-1.19.1.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
{docling-1.19.1.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
{docling-1.19.1.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0

docling/backend/abstract_backend.py CHANGED Viewed

@@ -1,68 +1,63 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Set, Union
-from PIL import Image
+from docling_core.types.doc import DoclingDocument
 if TYPE_CHECKING:
-    from docling.datamodel.base_models import BoundingBox, Cell, PageSize
+    from docling.datamodel.base_models import InputFormat
+    from docling.datamodel.document import InputDocument
-class PdfPageBackend(ABC):
+class AbstractDocumentBackend(ABC):
     @abstractmethod
-    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
-        pass
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        self.path_or_stream = path_or_stream
+        self.document_hash = in_doc.document_hash
+        self.input_format = in_doc.format
     @abstractmethod
-    def get_text_cells(self) -> Iterable["Cell"]:
+    def is_valid(self) -> bool:
         pass
+    @classmethod
     @abstractmethod
-    def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
+    def supports_pagination(cls) -> bool:
         pass
     @abstractmethod
-    def get_page_image(
-        self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
-    ) -> Image.Image:
-        pass
+    def unload(self):
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
-    @abstractmethod
-    def get_size(self) -> "PageSize":
-        pass
+        self.path_or_stream = None
+    @classmethod
     @abstractmethod
-    def is_valid(self) -> bool:
+    def supported_formats(cls) -> Set["InputFormat"]:
         pass
-    @abstractmethod
-    def unload(self):
-        pass
+class PaginatedDocumentBackend(AbstractDocumentBackend):
+    """DeclarativeDocumentBackend.
-class PdfDocumentBackend(ABC):
-    @abstractmethod
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        self.path_or_stream = path_or_stream
-        self.document_hash = document_hash
-    @abstractmethod
-    def load_page(self, page_no: int) -> PdfPageBackend:
-        pass
+    A declarative document backend is a backend that can transform to DoclingDocument
+    straight without a recognition pipeline.
+    """
     @abstractmethod
     def page_count(self) -> int:
         pass
-    @abstractmethod
-    def is_valid(self) -> bool:
-        pass
-    @abstractmethod
-    def unload(self):
-        if isinstance(self.path_or_stream, BytesIO):
-            self.path_or_stream.close()
+class DeclarativeDocumentBackend(AbstractDocumentBackend):
+    """DeclarativeDocumentBackend.
-        self.path_or_stream = None
+    A declarative document backend is a backend that can transform to DoclingDocument
+    straight without a recognition pipeline.
+    """
+    @abstractmethod
+    def convert(self) -> DoclingDocument:
+        pass

docling/backend/docling_parse_backend.py CHANGED Viewed

@@ -5,12 +5,14 @@ from pathlib import Path
 from typing import Iterable, List, Optional, Union
 import pypdfium2 as pdfium
+from docling_core.types.doc import BoundingBox, CoordOrigin, Size
 from docling_parse.docling_parse import pdf_parser
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import Cell
+from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
@@ -177,8 +179,8 @@ class DoclingParsePageBackend(PdfPageBackend):
         return image
-    def get_size(self) -> PageSize:
-        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
+    def get_size(self) -> Size:
+        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
     def unload(self):
         self._ppage = None
@@ -186,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
 class DoclingParseDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        super().__init__(path_or_stream, document_hash)
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
-        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
         self.parser = pdf_parser()
         success = False
-        if isinstance(path_or_stream, BytesIO):
+        if isinstance(self.path_or_stream, BytesIO):
             success = self.parser.load_document_from_bytesio(
-                document_hash, path_or_stream
+                self.document_hash, self.path_or_stream
+            )
+        elif isinstance(self.path_or_stream, Path):
+            success = self.parser.load_document(
+                self.document_hash, str(self.path_or_stream)
             )
-        elif isinstance(path_or_stream, Path):
-            success = self.parser.load_document(document_hash, str(path_or_stream))
         if not success:
             raise RuntimeError(
-                f"docling-parse could not load document {document_hash}."
+                f"docling-parse could not load document with hash {self.document_hash}."
             )
     def page_count(self) -> int:

docling/backend/docling_parse_v2_backend.py ADDED Viewed

@@ -0,0 +1,240 @@
+import logging
+import random
+from io import BytesIO
+from pathlib import Path
+from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+import pypdfium2 as pdfium
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_parse.docling_parse import pdf_parser_v2
+from PIL import Image, ImageDraw
+from pypdfium2 import PdfPage
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import Cell, Size
+if TYPE_CHECKING:
+    from docling.datamodel.document import InputDocument
+_log = logging.getLogger(__name__)
+class DoclingParseV2PageBackend(PdfPageBackend):
+    def __init__(
+        self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
+    ):
+        self._ppage = page_obj
+        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
+        self.valid = "pages" in parsed_page
+        if self.valid:
+            self._dpage = parsed_page["pages"][page_no]
+        else:
+            _log.info(
+                f"An error occured when loading page {page_no} of document {document_hash}."
+            )
+    def is_valid(self) -> bool:
+        return self.valid
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        if not self.valid:
+            return ""
+        # Find intersecting cells on the page
+        text_piece = ""
+        page_size = self.get_size()
+        parser_width = self._dpage["sanitized"]["dimension"]["width"]
+        parser_height = self._dpage["sanitized"]["dimension"]["height"]
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+        cells_data = self._dpage["sanitized"]["cells"]["data"]
+        cells_header = self._dpage["sanitized"]["cells"]["header"]
+        for i, cell_data in enumerate(cells_data):
+            x0 = cell_data[cells_header.index("x0")]
+            y0 = cell_data[cells_header.index("y0")]
+            x1 = cell_data[cells_header.index("x1")]
+            y1 = cell_data[cells_header.index("y1")]
+            cell_bbox = BoundingBox(
+                l=x0 * scale * page_size.width / parser_width,
+                b=y0 * scale * page_size.height / parser_height,
+                r=x1 * scale * page_size.width / parser_width,
+                t=y1 * scale * page_size.height / parser_height,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            ).to_top_left_origin(page_height=page_size.height * scale)
+            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += cell_data[cells_header.index("text")]
+        return text_piece
+    def get_text_cells(self) -> Iterable[Cell]:
+        cells: List[Cell] = []
+        cell_counter = 0
+        if not self.valid:
+            return cells
+        page_size = self.get_size()
+        parser_width = self._dpage["sanitized"]["dimension"]["width"]
+        parser_height = self._dpage["sanitized"]["dimension"]["height"]
+        cells_data = self._dpage["sanitized"]["cells"]["data"]
+        cells_header = self._dpage["sanitized"]["cells"]["header"]
+        for i, cell_data in enumerate(cells_data):
+            x0 = cell_data[cells_header.index("x0")]
+            y0 = cell_data[cells_header.index("y0")]
+            x1 = cell_data[cells_header.index("x1")]
+            y1 = cell_data[cells_header.index("y1")]
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
+            text_piece = cell_data[cells_header.index("text")]
+            cells.append(
+                Cell(
+                    id=cell_counter,
+                    text=text_piece,
+                    bbox=BoundingBox(
+                        # l=x0, b=y0, r=x1, t=y1,
+                        l=x0 * page_size.width / parser_width,
+                        b=y0 * page_size.height / parser_height,
+                        r=x1 * page_size.width / parser_width,
+                        t=y1 * page_size.height / parser_height,
+                        coord_origin=CoordOrigin.BOTTOMLEFT,
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+            cell_counter += 1
+        def draw_clusters_and_cells():
+            image = (
+                self.get_page_image()
+            )  # make new image to avoid drawing on the saved ones
+            draw = ImageDraw.Draw(image)
+            for c in cells:
+                x0, y0, x1, y1 = c.bbox.as_tuple()
+                cell_color = (
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                )
+                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+            image.show()
+        # draw_clusters_and_cells()
+        return cells
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 32 * 32
+        images = self._dpage["sanitized"]["images"]["data"]
+        images_header = self._dpage["sanitized"]["images"]["header"]
+        for row in images:
+            x0 = row[images_header.index("x0")]
+            y0 = row[images_header.index("y0")]
+            x1 = row[images_header.index("x1")]
+            y1 = row[images_header.index("y1")]
+            cropbox = BoundingBox.from_tuple(
+                (x0, y0, x1, y1), origin=CoordOrigin.BOTTOMLEFT
+            ).to_top_left_origin(self.get_size().height)
+            if cropbox.area() > AREA_THRESHOLD:
+                cropbox = cropbox.scaled(scale=scale)
+                yield cropbox
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        page_size = self.get_size()
+        if not cropbox:
+            cropbox = BoundingBox(
+                l=0,
+                r=page_size.width,
+                t=0,
+                b=page_size.height,
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
+            padbox = BoundingBox(
+                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
+            )
+        else:
+            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox.r = page_size.width - padbox.r
+            padbox.t = page_size.height - padbox.t
+        image = (
+            self._ppage.render(
+                scale=scale * 1.5,
+                rotation=0,  # no additional rotation
+                crop=padbox.as_tuple(),
+            )
+            .to_pil()
+            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
+        )  # We resize the image from 1.5x the given scale to make it sharper.
+        return image
+    def get_size(self) -> Size:
+        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
+    def unload(self):
+        self._ppage = None
+        self._dpage = None
+class DoclingParseV2DocumentBackend(PdfDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+        self.parser = pdf_parser_v2("fatal")
+        success = False
+        if isinstance(path_or_stream, BytesIO):
+            success = self.parser.load_document_from_bytesio(
+                self.document_hash, path_or_stream
+            )
+        elif isinstance(path_or_stream, Path):
+            success = self.parser.load_document(self.document_hash, str(path_or_stream))
+        if not success:
+            raise RuntimeError(
+                f"docling-parse v2 could not load document {self.document_hash}."
+            )
+    def page_count(self) -> int:
+        return len(self._pdoc)  # To be replaced with docling-parse API
+    def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
+        return DoclingParseV2PageBackend(
+            self.parser, self.document_hash, page_no, self._pdoc[page_no]
+        )
+    def is_valid(self) -> bool:
+        return self.page_count() > 0
+    def unload(self):
+        super().unload()
+        self.parser.unload_document(self.document_hash)
+        self._pdoc.close()
+        self._pdoc = None

docling 1.19.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

docling 1.19.1__py3-none-any.whl → 2.1.0__py3-none-any.whl