PyPI - docling - Versions diffs - 2.36.0__tar.gz → 2.37.0__tar.gz - Mend

docling 2.36.0__tar.gz → 2.37.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (126) hide show

{docling-2.36.0 → docling-2.37.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.36.0
+Version: 2.37.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -37,7 +37,7 @@ Requires-Dist: requests<3.0.0,>=2.32.2
 Requires-Dist: easyocr<2.0,>=1.7
 Requires-Dist: certifi>=2024.7.4
 Requires-Dist: rtree<2.0.0,>=1.3.0
-Requires-Dist: typer<0.16.0,>=0.12.5
+Requires-Dist: typer<0.17.0,>=0.12.5
 Requires-Dist: python-docx<2.0.0,>=1.1.2
 Requires-Dist: python-pptx<2.0.0,>=1.0.2
 Requires-Dist: beautifulsoup4<5.0.0,>=4.12.3
@@ -49,7 +49,6 @@ Requires-Dist: pillow<12.0.0,>=10.0.0
 Requires-Dist: tqdm<5.0.0,>=4.65.0
 Requires-Dist: pluggy<2.0.0,>=1.0.0
 Requires-Dist: pylatexenc<3.0,>=2.10
-Requires-Dist: click<8.2.0
 Requires-Dist: scipy<2.0.0,>=1.6.0
 Provides-Extra: tesserocr
 Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"

{docling-2.36.0 → docling-2.37.0}/docling/backend/asciidoc_backend.py RENAMED Viewed

@@ -2,7 +2,7 @@ import logging
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import Final, Set, Union
 from docling_core.types.doc import (
     DocItemLabel,
@@ -22,6 +22,9 @@ from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
+DEFAULT_IMAGE_WIDTH: Final = 128
+DEFAULT_IMAGE_HEIGHT: Final = 128
 class AsciiDocBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
@@ -200,9 +203,11 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 item = self._parse_picture(line)
-                size = None
+                size: Size
                 if "width" in item and "height" in item:
                     size = Size(width=int(item["width"]), height=int(item["height"]))
+                else:
+                    size = Size(width=DEFAULT_IMAGE_WIDTH, height=DEFAULT_IMAGE_HEIGHT)
                 uri = None
                 if (
@@ -264,14 +269,16 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return doc
-    def _get_current_level(self, parents):
+    @staticmethod
+    def _get_current_level(parents):
         for k, v in parents.items():
             if v is None and k > 0:
                 return k - 1
         return 0
-    def _get_current_parent(self, parents):
+    @staticmethod
+    def _get_current_parent(parents):
         for k, v in parents.items():
             if v is None and k > 0:
                 return parents[k - 1]
@@ -279,17 +286,21 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return None
     #   =========   Title
-    def _is_title(self, line):
+    @staticmethod
+    def _is_title(line):
         return re.match(r"^= ", line)
-    def _parse_title(self, line):
+    @staticmethod
+    def _parse_title(line):
         return {"type": "title", "text": line[2:].strip(), "level": 0}
     #   =========   Section headers
-    def _is_section_header(self, line):
+    @staticmethod
+    def _is_section_header(line):
         return re.match(r"^==+\s+", line)
-    def _parse_section_header(self, line):
+    @staticmethod
+    def _parse_section_header(line):
         match = re.match(r"^(=+)\s+(.*)", line)
         marker = match.group(1)  # The list marker (e.g., "*", "-", "1.")
@@ -303,10 +314,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         }
     #   =========   Lists
-    def _is_list_item(self, line):
+    @staticmethod
+    def _is_list_item(line):
         return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
-    def _parse_list_item(self, line):
+    @staticmethod
+    def _parse_list_item(line):
         """Extract the item marker (number or bullet symbol) and the text of the item."""
         match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
@@ -342,14 +355,17 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             }
     #   =========   Tables
-    def _is_table_line(self, line):
+    @staticmethod
+    def _is_table_line(line):
         return re.match(r"^\|.*\|", line)
-    def _parse_table_line(self, line):
+    @staticmethod
+    def _parse_table_line(line):
         # Split table cells and trim extra spaces
         return [cell.strip() for cell in line.split("|") if cell.strip()]
-    def _populate_table_as_grid(self, table_data):
+    @staticmethod
+    def _populate_table_as_grid(table_data):
         num_rows = len(table_data)
         # Adjust the table data into a grid format
@@ -380,10 +396,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return data
     #   =========   Pictures
-    def _is_picture(self, line):
+    @staticmethod
+    def _is_picture(line):
         return re.match(r"^image::", line)
-    def _parse_picture(self, line):
+    @staticmethod
+    def _parse_picture(line):
         """
         Parse an image macro, extracting its path and attributes.
         Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
@@ -406,10 +424,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return {"type": "picture", "uri": line}
     #   =========   Captions
-    def _is_caption(self, line):
+    @staticmethod
+    def _is_caption(line):
         return re.match(r"^\.(.+)", line)
-    def _parse_caption(self, line):
+    @staticmethod
+    def _parse_caption(line):
         mtch = re.match(r"^\.(.+)", line)
         if mtch:
             text = mtch.group(1)
@@ -418,5 +438,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return {"type": "caption", "text": ""}
     #   =========   Plain text
-    def _parse_text(self, line):
+    @staticmethod
+    def _parse_text(line):
         return {"type": "text", "text": line.strip()}

{docling-2.36.0 → docling-2.37.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -7,12 +7,17 @@ from typing import List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    SegmentedPdfPage,
+    TextCell,
+)
 from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pypdfium2_backend import get_pdf_page_geometry
 from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
@@ -36,43 +41,8 @@ class DoclingParsePageBackend(PdfPageBackend):
     def is_valid(self) -> bool:
         return self.valid
-    def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        if not self.valid:
-            return ""
-        # Find intersecting cells on the page
-        text_piece = ""
-        page_size = self.get_size()
-        parser_width = self._dpage["width"]
-        parser_height = self._dpage["height"]
-        scale = (
-            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
-        )
-        for i in range(len(self._dpage["cells"])):
-            rect = self._dpage["cells"][i]["box"]["device"]
-            x0, y0, x1, y1 = rect
-            cell_bbox = BoundingBox(
-                l=x0 * scale * page_size.width / parser_width,
-                b=y0 * scale * page_size.height / parser_height,
-                r=x1 * scale * page_size.width / parser_width,
-                t=y1 * scale * page_size.height / parser_height,
-                coord_origin=CoordOrigin.BOTTOMLEFT,
-            ).to_top_left_origin(page_height=page_size.height * scale)
-            overlap_frac = cell_bbox.intersection_over_self(bbox)
-            if overlap_frac > 0.5:
-                if len(text_piece) > 0:
-                    text_piece += " "
-                text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
-        return text_piece
-    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
-    def get_text_cells(self) -> Iterable[TextCell]:
+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from docling-parse data."""
         cells: List[TextCell] = []
         cell_counter = 0
@@ -102,7 +72,6 @@ class DoclingParsePageBackend(PdfPageBackend):
                     from_ocr=False,
                     rect=BoundingRectangle.from_bounding_box(
                         BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
                             l=x0 * page_size.width / parser_width,
                             b=y0 * page_size.height / parser_height,
                             r=x1 * page_size.width / parser_width,
@@ -115,30 +84,63 @@ class DoclingParsePageBackend(PdfPageBackend):
             cell_counter += 1
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
+        return cells
-        # before merge:
-        # draw_clusters_and_cells()
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        if not self.valid:
+            return ""
+        # Find intersecting cells on the page
+        text_piece = ""
+        page_size = self.get_size()
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
-        # cells = merge_horizontal_cells(cells)
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
-        # after merge:
-        # draw_clusters_and_cells()
+        for i in range(len(self._dpage["cells"])):
+            rect = self._dpage["cells"][i]["box"]["device"]
+            x0, y0, x1, y1 = rect
+            cell_bbox = BoundingBox(
+                l=x0 * scale * page_size.width / parser_width,
+                b=y0 * scale * page_size.height / parser_height,
+                r=x1 * scale * page_size.width / parser_width,
+                t=y1 * scale * page_size.height / parser_height,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            ).to_top_left_origin(page_height=page_size.height * scale)
-        return cells
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
+        return text_piece
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        if not self.valid:
+            return None
+        text_cells = self._compute_text_cells()
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32

{docling-2.36.0 → docling-2.37.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

@@ -7,12 +7,19 @@ from typing import TYPE_CHECKING, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
 from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pypdfium2_backend import get_pdf_page_geometry
 from docling.datamodel.base_models import Size
 from docling.utils.locks import pypdfium2_lock
@@ -40,50 +47,8 @@ class DoclingParseV2PageBackend(PdfPageBackend):
     def is_valid(self) -> bool:
         return self.valid
-    def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        if not self.valid:
-            return ""
-        # Find intersecting cells on the page
-        text_piece = ""
-        page_size = self.get_size()
-        parser_width = self._dpage["sanitized"]["dimension"]["width"]
-        parser_height = self._dpage["sanitized"]["dimension"]["height"]
-        scale = (
-            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
-        )
-        cells_data = self._dpage["sanitized"]["cells"]["data"]
-        cells_header = self._dpage["sanitized"]["cells"]["header"]
-        for i, cell_data in enumerate(cells_data):
-            x0 = cell_data[cells_header.index("x0")]
-            y0 = cell_data[cells_header.index("y0")]
-            x1 = cell_data[cells_header.index("x1")]
-            y1 = cell_data[cells_header.index("y1")]
-            cell_bbox = BoundingBox(
-                l=x0 * scale * page_size.width / parser_width,
-                b=y0 * scale * page_size.height / parser_height,
-                r=x1 * scale * page_size.width / parser_width,
-                t=y1 * scale * page_size.height / parser_height,
-                coord_origin=CoordOrigin.BOTTOMLEFT,
-            ).to_top_left_origin(page_height=page_size.height * scale)
-            overlap_frac = cell_bbox.intersection_over_self(bbox)
-            if overlap_frac > 0.5:
-                if len(text_piece) > 0:
-                    text_piece += " "
-                text_piece += cell_data[cells_header.index("text")]
-        return text_piece
-    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
-    def get_text_cells(self) -> Iterable[TextCell]:
+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from docling-parse v2 data."""
         cells: List[TextCell] = []
         cell_counter = 0
@@ -118,7 +83,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                     from_ocr=False,
                     rect=BoundingRectangle.from_bounding_box(
                         BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
                             l=x0 * page_size.width / parser_width,
                             b=y0 * page_size.height / parser_height,
                             r=x1 * page_size.width / parser_width,
@@ -130,24 +94,70 @@ class DoclingParseV2PageBackend(PdfPageBackend):
             )
             cell_counter += 1
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
+        return cells
-        # draw_clusters_and_cells()
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        if not self.valid:
+            return ""
+        # Find intersecting cells on the page
+        text_piece = ""
+        page_size = self.get_size()
-        return cells
+        parser_width = self._dpage["sanitized"]["dimension"]["width"]
+        parser_height = self._dpage["sanitized"]["dimension"]["height"]
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+        cells_data = self._dpage["sanitized"]["cells"]["data"]
+        cells_header = self._dpage["sanitized"]["cells"]["header"]
+        for i, cell_data in enumerate(cells_data):
+            x0 = cell_data[cells_header.index("x0")]
+            y0 = cell_data[cells_header.index("y0")]
+            x1 = cell_data[cells_header.index("x1")]
+            y1 = cell_data[cells_header.index("y1")]
+            cell_bbox = BoundingBox(
+                l=x0 * scale * page_size.width / parser_width,
+                b=y0 * scale * page_size.height / parser_height,
+                r=x1 * scale * page_size.width / parser_width,
+                t=y1 * scale * page_size.height / parser_height,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            ).to_top_left_origin(page_height=page_size.height * scale)
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += cell_data[cells_header.index("text")]
+        return text_piece
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        if not self.valid:
+            return None
+        text_cells = self._compute_text_cells()
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_textlines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32

{docling-2.36.0 → docling-2.37.0}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

@@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
         return self._dpage
     def get_text_cells(self) -> Iterable[TextCell]:
-        page_size = self.get_size()
-        [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
-        # for cell in self._dpage.textline_cells:
-        #     rect = cell.rect
-        #
-        #     assert (
-        #         rect.to_bounding_box().l <= rect.to_bounding_box().r
-        #     ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
-        #     assert (
-        #         rect.to_bounding_box().t <= rect.to_bounding_box().b
-        #     ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
         return self._dpage.textline_cells
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
@@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
         self, page_no: int, create_words: bool = True, create_textlines: bool = True
     ) -> DoclingParseV4PageBackend:
         with pypdfium2_lock:
+            seg_page = self.dp_doc.get_page(
+                page_no + 1,
+                create_words=create_words,
+                create_textlines=create_textlines,
+            )
+            # In Docling, all TextCell instances are expected with top-left origin.
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.textline_cells
+            ]
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.char_cells
+            ]
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.word_cells
+            ]
             return DoclingParseV4PageBackend(
-                self.dp_doc.get_page(
-                    page_no + 1,
-                    create_words=create_words,
-                    create_textlines=create_textlines,
-                ),
+                seg_page,
                 self._pdoc[page_no],
             )

docling 2.36.0__tar.gz → 2.37.0__tar.gz

docling 2.36.0__tar.gz → 2.37.0__tar.gz