PyPI - docling - Versions diffs - 2.36.0__py3-none-any.whl → 2.37.0__py3-none-any.whl - Mend

docling 2.36.0__py3-none-any.whl → 2.37.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

docling/backend/asciidoc_backend.py +39 -18
docling/backend/docling_parse_backend.py +61 -59
docling/backend/docling_parse_v2_backend.py +72 -62
docling/backend/docling_parse_v4_backend.py +21 -19
docling/backend/mspowerpoint_backend.py +72 -113
docling/backend/msword_backend.py +28 -18
docling/backend/pypdfium2_backend.py +127 -53
docling/datamodel/base_models.py +10 -3
docling/datamodel/pipeline_options.py +3 -1
docling/datamodel/pipeline_options_vlm_model.py +2 -1
docling/models/base_ocr_model.py +33 -11
docling/models/easyocr_model.py +1 -1
docling/models/layout_model.py +2 -3
docling/models/ocr_mac_model.py +1 -1
docling/models/page_preprocessing_model.py +3 -6
docling/models/rapid_ocr_model.py +1 -1
docling/models/readingorder_model.py +2 -2
docling/models/tesseract_ocr_cli_model.py +4 -3
docling/models/tesseract_ocr_model.py +1 -1
docling/models/vlm_models_inline/hf_transformers_model.py +1 -0
docling/pipeline/standard_pdf_pipeline.py +0 -1
docling/utils/layout_postprocessor.py +11 -6
{docling-2.36.0.dist-info → docling-2.37.0.dist-info}/METADATA +2 -3
{docling-2.36.0.dist-info → docling-2.37.0.dist-info}/RECORD +28 -28
{docling-2.36.0.dist-info → docling-2.37.0.dist-info}/WHEEL +0 -0
{docling-2.36.0.dist-info → docling-2.37.0.dist-info}/entry_points.txt +0 -0
{docling-2.36.0.dist-info → docling-2.37.0.dist-info}/licenses/LICENSE +0 -0
{docling-2.36.0.dist-info → docling-2.37.0.dist-info}/top_level.txt +0 -0

docling/backend/mspowerpoint_backend.py CHANGED Viewed

@@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
 from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
+from pptx.oxml.text import CT_TextLineBreak
 from docling.backend.abstract_backend import (
     DeclarativeDocumentBackend,
@@ -120,136 +121,91 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         return prov
-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
-        is_a_list = False
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
         is_list_group_created = False
         enum_list_item_value = 0
         new_list = None
-        bullet_type = "None"
-        list_label = GroupLabel.LIST
         doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
-        # Identify if shape contains lists
-        for paragraph in shape.text_frame.paragraphs:
-            # Check if paragraph is a bullet point using the `element` XML
+        def is_list_item(paragraph):
+            """Check if the paragraph is a list item."""
             p = paragraph._element
             if (
                 p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
                 is not None
             ):
-                bullet_type = "Bullet"
-                is_a_list = True
+                return (True, "Bullet")
             elif (
                 p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
                 is not None
             ):
-                bullet_type = "Numbered"
-                is_a_list = True
-            else:
-                is_a_list = False
-            if paragraph.level > 0:
+                return (True, "Numbered")
+            elif paragraph.level > 0:
                 # Most likely a sub-list
-                is_a_list = True
-            if is_a_list:
-                # Determine if this is an unordered list or an ordered list.
-                # Set GroupLabel.ORDERED_LIST when it fits.
-                if bullet_type == "Numbered":
-                    list_label = GroupLabel.ORDERED_LIST
-            if is_a_list:
-                _log.debug("LIST DETECTED!")
+                return (True, "None")
             else:
-                _log.debug("No List")
-        # If there is a list inside of the shape, create a new docling list to assign list items to
-        # if is_a_list:
-        #     new_list = doc.add_group(
-        #         label=list_label, name=f"list", parent=parent_slide
-        #     )
+                return (False, "None")
         # Iterate through paragraphs to build up text
         for paragraph in shape.text_frame.paragraphs:
-            # p_text = paragraph.text.strip()
+            is_a_list, bullet_type = is_list_item(paragraph)
             p = paragraph._element
-            enum_list_item_value += 1
-            inline_paragraph_text = ""
-            inline_list_item_text = ""
-            for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
-                if len(e.text.strip()) > 0:
-                    e_is_a_list_item = False
-                    is_numbered = False
-                    if (
-                        p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
-                        is not None
-                    ):
-                        bullet_type = "Bullet"
-                        e_is_a_list_item = True
-                    elif (
-                        p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
-                        is not None
-                    ):
-                        bullet_type = "Numbered"
-                        is_numbered = True
-                        e_is_a_list_item = True
-                    else:
-                        e_is_a_list_item = False
-                    if e_is_a_list_item:
-                        if len(inline_paragraph_text) > 0:
-                            # output accumulated inline text:
-                            doc.add_text(
-                                label=doc_label,
-                                parent=parent_slide,
-                                text=inline_paragraph_text,
-                                prov=prov,
-                            )
-                        # Set marker and enumerated arguments if this is an enumeration element.
-                        inline_list_item_text += e.text
-                        # print(e.text)
-                    else:
-                        # Assign proper label to the text, depending if it's a Title or Section Header
-                        # For other types of text, assign - PARAGRAPH
-                        doc_label = DocItemLabel.PARAGRAPH
-                        if shape.is_placeholder:
-                            placeholder_type = shape.placeholder_format.type
-                            if placeholder_type in [
-                                PP_PLACEHOLDER.CENTER_TITLE,
-                                PP_PLACEHOLDER.TITLE,
-                            ]:
-                                # It's a title
-                                doc_label = DocItemLabel.TITLE
-                            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
-                                DocItemLabel.SECTION_HEADER
-                        enum_list_item_value = 0
-                        inline_paragraph_text += e.text
-            if len(inline_paragraph_text) > 0:
-                # output accumulated inline text:
-                doc.add_text(
-                    label=doc_label,
-                    parent=parent_slide,
-                    text=inline_paragraph_text,
-                    prov=prov,
-                )
-            if len(inline_list_item_text) > 0:
+            # Convert line breaks to spaces and accumulate text
+            p_text = ""
+            for e in p.content_children:
+                if isinstance(e, CT_TextLineBreak):
+                    p_text += " "
+                else:
+                    p_text += e.text
+            if is_a_list:
                 enum_marker = ""
-                if is_numbered:
-                    enum_marker = str(enum_list_item_value) + "."
+                enumerated = bullet_type == "Numbered"
                 if not is_list_group_created:
                     new_list = doc.add_group(
-                        label=list_label, name="list", parent=parent_slide
+                        label=GroupLabel.ORDERED_LIST
+                        if enumerated
+                        else GroupLabel.LIST,
+                        name="list",
+                        parent=parent_slide,
                     )
                     is_list_group_created = True
+                    enum_list_item_value = 0
+                if enumerated:
+                    enum_list_item_value += 1
+                    enum_marker = str(enum_list_item_value) + "."
                 doc.add_list_item(
                     marker=enum_marker,
-                    enumerated=is_numbered,
+                    enumerated=enumerated,
                     parent=new_list,
-                    text=inline_list_item_text,
+                    text=p_text,
+                    prov=prov,
+                )
+            else:  # is paragraph not a list item
+                # Assign proper label to the text, depending if it's a Title or Section Header
+                # For other types of text, assign - PARAGRAPH
+                doc_label = DocItemLabel.PARAGRAPH
+                if shape.is_placeholder:
+                    placeholder_type = shape.placeholder_format.type
+                    if placeholder_type in [
+                        PP_PLACEHOLDER.CENTER_TITLE,
+                        PP_PLACEHOLDER.TITLE,
+                    ]:
+                        # It's a title
+                        doc_label = DocItemLabel.TITLE
+                    elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                        DocItemLabel.SECTION_HEADER
+                # output accumulated inline text:
+                doc.add_text(
+                    label=doc_label,
+                    parent=parent_slide,
+                    text=p_text,
                     prov=prov,
                 )
         return
@@ -423,18 +379,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
             # Handle notes slide
             if slide.has_notes_slide:
                 notes_slide = slide.notes_slide
-                notes_text = notes_slide.notes_text_frame.text.strip()
-                if notes_text:
-                    bbox = BoundingBox(l=0, t=0, r=0, b=0)
-                    prov = ProvenanceItem(
-                        page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
-                    )
-                    doc.add_text(
-                        label=DocItemLabel.TEXT,
-                        parent=parent_slide,
-                        text=notes_text,
-                        prov=prov,
-                        content_layer=ContentLayer.FURNITURE,
-                    )
+                if notes_slide.notes_text_frame is not None:
+                    notes_text = notes_slide.notes_text_frame.text.strip()
+                    if notes_text:
+                        bbox = BoundingBox(l=0, t=0, r=0, b=0)
+                        prov = ProvenanceItem(
+                            page_no=slide_ind + 1,
+                            charspan=[0, len(notes_text)],
+                            bbox=bbox,
+                        )
+                        doc.add_text(
+                            label=DocItemLabel.TEXT,
+                            parent=parent_slide,
+                            text=notes_text,
+                            prov=prov,
+                            content_layer=ContentLayer.FURNITURE,
+                        )
         return doc

docling/backend/msword_backend.py CHANGED Viewed

@@ -60,8 +60,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.equation_bookends: str = "<eq>{EQ}</eq>"
         # Track processed textbox elements to avoid duplication
         self.processed_textbox_elements: List[int] = []
-        # Track content hash of processed paragraphs to avoid duplicate content
-        self.processed_paragraph_content: List[str] = []
         for i in range(-1, self.max_levels):
             self.parents[i] = None
@@ -593,9 +591,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             # Add the sorted paragraphs to our processing list
             all_paragraphs.extend(sorted_container_paragraphs)
+        # Track processed paragraphs to avoid duplicates (same content and position)
+        processed_paragraphs = set()
         # Process all the paragraphs
-        for p, _ in all_paragraphs:
-            self._handle_text_elements(p, docx_obj, doc, is_from_textbox=True)
+        for p, position in all_paragraphs:
+            # Create paragraph object to get text content
+            paragraph = Paragraph(p, docx_obj)
+            text_content = paragraph.text
+            # Create a unique identifier based on content and position
+            paragraph_id = (text_content, position)
+            # Skip if this paragraph (same content and position) was already processed
+            if paragraph_id in processed_paragraphs:
+                _log.debug(
+                    f"Skipping duplicate paragraph: content='{text_content[:50]}...', position={position}"
+                )
+                continue
+            # Mark this paragraph as processed
+            processed_paragraphs.add(paragraph_id)
+            self._handle_text_elements(p, docx_obj, doc)
         # Restore original parent
         self.parents[level] = original_parent
@@ -669,22 +687,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         element: BaseOxmlElement,
         docx_obj: DocxDocument,
         doc: DoclingDocument,
-        is_from_textbox: bool = False,
     ) -> None:
         paragraph = Paragraph(element, docx_obj)
-        # Skip if from a textbox and this exact paragraph content was already processed
-        # Skip if from a textbox and this exact paragraph content was already processed
-        raw_text = paragraph.text
-        if is_from_textbox and raw_text:
-            # Create a simple hash of content to detect duplicates
-            content_hash = f"{len(raw_text)}:{raw_text[:50]}"
-            if content_hash in self.processed_paragraph_content:
-                _log.debug(f"Skipping duplicate paragraph content: {content_hash}")
-                return
-            self.processed_paragraph_content.append(content_hash)
-        text, equations = self._handle_equations_in_text(element=element, text=raw_text)
+        text, equations = self._handle_equations_in_text(
+            element=element, text=paragraph.text
+        )
         if text is None:
             return
@@ -750,7 +758,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             self._add_header(doc, p_level, text, is_numbered_style)
         elif len(equations) > 0:
-            if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0:
+            if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
+                text
+            ) > 0:
                 # Standalone equation
                 level = self._get_level()
                 doc.add_text(

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError
@@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.utils.locks import pypdfium2_lock
+def get_pdf_page_geometry(
+    ppage: pdfium.PdfPage,
+    angle: float = 0.0,
+    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
+) -> PdfPageGeometry:
+    """
+    Create PdfPageGeometry from a pypdfium2 PdfPage object.
+    Args:
+        ppage: pypdfium2 PdfPage object
+        angle: Page rotation angle in degrees (default: 0.0)
+        boundary_type: The boundary type for the page (default: CROP_BOX)
+    Returns:
+        PdfPageGeometry with all the different bounding boxes properly set
+    """
+    with pypdfium2_lock:
+        # Get the main bounding box (intersection of crop_box and media_box)
+        bbox_tuple = ppage.get_bbox()
+        bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
+        # Get all the different page boxes from pypdfium2
+        media_box_tuple = ppage.get_mediabox()
+        crop_box_tuple = ppage.get_cropbox()
+        art_box_tuple = ppage.get_artbox()
+        bleed_box_tuple = ppage.get_bleedbox()
+        trim_box_tuple = ppage.get_trimbox()
+        # Convert to BoundingBox objects using existing from_tuple method
+        # pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
+        # Use bbox as fallback when specific box types are not defined
+        media_bbox = (
+            BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if media_box_tuple
+            else bbox
+        )
+        crop_bbox = (
+            BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if crop_box_tuple
+            else bbox
+        )
+        art_bbox = (
+            BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if art_box_tuple
+            else bbox
+        )
+        bleed_bbox = (
+            BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if bleed_box_tuple
+            else bbox
+        )
+        trim_bbox = (
+            BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if trim_box_tuple
+            else bbox
+        )
+        return PdfPageGeometry(
+            angle=angle,
+            rect=BoundingRectangle.from_bounding_box(bbox),
+            boundary_type=boundary_type,
+            art_bbox=art_bbox,
+            bleed_bbox=bleed_bbox,
+            crop_bbox=crop_bbox,
+            media_bbox=media_bbox,
+            trim_bbox=trim_bbox,
+        )
 if TYPE_CHECKING:
     from docling.datamodel.document import InputDocument
@@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def is_valid(self) -> bool:
         return self.valid
-    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 0  # 32 * 32
-        page_size = self.get_size()
-        with pypdfium2_lock:
-            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
-                pos = obj.get_pos()
-                cropbox = BoundingBox.from_tuple(
-                    pos, origin=CoordOrigin.BOTTOMLEFT
-                ).to_top_left_origin(page_height=page_size.height)
-                if cropbox.area() > AREA_THRESHOLD:
-                    cropbox = cropbox.scaled(scale=scale)
-                    yield cropbox
-    def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        with pypdfium2_lock:
-            if not self.text_page:
-                self.text_page = self._ppage.get_textpage()
-        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
-            bbox = bbox.to_bottom_left_origin(self.get_size().height)
-        with pypdfium2_lock:
-            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
-        return text_piece
-    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
-    def get_text_cells(self) -> Iterable[TextCell]:
+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from pypdfium."""
         with pypdfium2_lock:
             if not self.text_page:
                 self.text_page = self._ppage.get_textpage()
@@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
             return merged_cells
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
+        return merge_horizontal_cells(cells)
-        # before merge:
-        # draw_clusters_and_cells()
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+        page_size = self.get_size()
+        with pypdfium2_lock:
+            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
+                pos = obj.get_pos()
+                cropbox = BoundingBox.from_tuple(
+                    pos, origin=CoordOrigin.BOTTOMLEFT
+                ).to_top_left_origin(page_height=page_size.height)
-        cells = merge_horizontal_cells(cells)
+                if cropbox.area() > AREA_THRESHOLD:
+                    cropbox = cropbox.scaled(scale=scale)
-        # after merge:
-        # draw_clusters_and_cells()
+                    yield cropbox
-        return cells
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        with pypdfium2_lock:
+            if not self.text_page:
+                self.text_page = self._ppage.get_textpage()
+        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
+            bbox = bbox.to_bottom_left_origin(self.get_size().height)
+        with pypdfium2_lock:
+            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
+        return text_piece
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        if not self.valid:
+            return None
+        text_cells = self._compute_text_cells()
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_textlines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None

docling/datamodel/base_models.py CHANGED Viewed

@@ -67,10 +67,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.MD: ["md"],
     InputFormat.HTML: ["html", "htm", "xhtml"],
     InputFormat.XML_JATS: ["xml", "nxml"],
-    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
+    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.CSV: ["csv"],
-    InputFormat.XLSX: ["xlsx"],
+    InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.JSON_DOCLING: ["json"],
 }
@@ -232,7 +232,6 @@ class Page(BaseModel):
     page_no: int
     # page_hash: Optional[str] = None
     size: Optional[Size] = None
-    cells: List[TextCell] = []
     parsed_page: Optional[SegmentedPdfPage] = None
     predictions: PagePredictions = PagePredictions()
     assembled: Optional[AssembledUnit] = None
@@ -245,6 +244,14 @@ class Page(BaseModel):
         float, Image
     ] = {}  # Cache of images in different scales. By default it is cleared during assembling.
+    @property
+    def cells(self) -> List[TextCell]:
+        """Return text cells as a read-only view of parsed_page.textline_cells."""
+        if self.parsed_page is not None:
+            return self.parsed_page.textline_cells
+        else:
+            return []
     def get_image(
         self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
     ) -> Optional[Image]:

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -292,7 +292,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         ),
     )
-    generate_parsed_pages: bool = False
+    generate_parsed_pages: Literal[True] = (
+        True  # Always True since parsed_page is now mandatory
+    )
 class PdfPipeline(str, Enum):

docling/datamodel/pipeline_options_vlm_model.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal
+from typing import Any, Dict, List, Literal, Optional, Union
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
@@ -42,6 +42,7 @@ class InlineVlmOptions(BaseVlmOptions):
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
     response_format: ResponseFormat
+    torch_dtype: Optional[str] = None
     supported_devices: List[AcceleratorDevice] = [
         AcceleratorDevice.CPU,
         AcceleratorDevice.CUDA,

docling/models/base_ocr_model.py CHANGED Viewed

@@ -7,6 +7,7 @@ from typing import List, Optional, Type
 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label
@@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
             return []
     # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
-    def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
+    def _filter_ocr_cells(
+        self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
+    ) -> List[TextCell]:
         # Create R-tree index for programmatic cells
         p = index.Property()
         p.dimension = 2
@@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
         ]
         return filtered_ocr_cells
-    def post_process_cells(self, ocr_cells, programmatic_cells):
+    def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
         r"""
-        Post-process the ocr and programmatic cells and return the final list of of cells
+        Post-process the OCR cells and update the page object.
+        Updates parsed_page.textline_cells directly since page.cells is now read-only.
         """
+        # Get existing cells from the read-only property
+        existing_cells = page.cells
+        # Combine existing and OCR cells with overlap filtering
+        final_cells = self._combine_cells(existing_cells, ocr_cells)
+        assert page.parsed_page is not None
+        # Update parsed_page.textline_cells directly
+        page.parsed_page.textline_cells = final_cells
+        page.parsed_page.has_lines = len(final_cells) > 0
+    def _combine_cells(
+        self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
+    ) -> List[TextCell]:
+        """Combine existing and OCR cells with filtering and re-indexing."""
         if self.options.force_full_page_ocr:
-            # If a full page OCR is forced, use only the OCR cells
-            cells = ocr_cells
-            return cells
-        ## Remove OCR cells which overlap with programmatic cells.
-        filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
-        programmatic_cells.extend(filtered_ocr_cells)
-        return programmatic_cells
+            combined = ocr_cells
+        else:
+            filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
+            combined = list(existing_cells) + filtered_ocr_cells
+        # Re-index in-place
+        for i, cell in enumerate(combined):
+            cell.index = i
+        return combined
     def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
         image = copy.deepcopy(page.image)

docling/models/easyocr_model.py CHANGED Viewed

@@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
                         all_ocr_cells.extend(cells)
                     # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page)
                 # DEBUG code:
                 if settings.debug.visualize_ocr:

docling 2.36.0__py3-none-any.whl → 2.37.0__py3-none-any.whl

docling 2.36.0__py3-none-any.whl → 2.37.0__py3-none-any.whl