PyPI - docling - Versions diffs - 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl - Mend

docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

docling/backend/asciidoc_backend.py +39 -18
docling/backend/docling_parse_backend.py +61 -59
docling/backend/docling_parse_v2_backend.py +72 -62
docling/backend/docling_parse_v4_backend.py +21 -19
docling/backend/md_backend.py +101 -81
docling/backend/mspowerpoint_backend.py +72 -113
docling/backend/msword_backend.py +99 -80
docling/backend/noop_backend.py +51 -0
docling/backend/pypdfium2_backend.py +127 -53
docling/cli/main.py +82 -14
docling/datamodel/asr_model_specs.py +92 -0
docling/datamodel/base_models.py +21 -4
docling/datamodel/document.py +3 -1
docling/datamodel/pipeline_options.py +15 -2
docling/datamodel/pipeline_options_asr_model.py +57 -0
docling/datamodel/pipeline_options_vlm_model.py +4 -4
docling/document_converter.py +8 -0
docling/models/api_vlm_model.py +3 -1
docling/models/base_model.py +1 -1
docling/models/base_ocr_model.py +33 -11
docling/models/easyocr_model.py +1 -1
docling/models/layout_model.py +2 -3
docling/models/ocr_mac_model.py +1 -1
docling/models/page_preprocessing_model.py +3 -6
docling/models/rapid_ocr_model.py +1 -1
docling/models/readingorder_model.py +3 -3
docling/models/tesseract_ocr_cli_model.py +4 -3
docling/models/tesseract_ocr_model.py +1 -1
docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
docling/models/vlm_models_inline/mlx_model.py +3 -1
docling/pipeline/asr_pipeline.py +253 -0
docling/pipeline/base_pipeline.py +11 -0
docling/pipeline/standard_pdf_pipeline.py +0 -1
docling/utils/layout_postprocessor.py +11 -6
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0

docling/backend/msword_backend.py CHANGED Viewed

@@ -14,7 +14,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
-from docling_core.types.doc.document import Formatting
+from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
 from docx import Document
 from docx.document import Document as DocxDocument
 from docx.oxml.table import CT_Tc
@@ -60,8 +60,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.equation_bookends: str = "<eq>{EQ}</eq>"
         # Track processed textbox elements to avoid duplication
         self.processed_textbox_elements: List[int] = []
-        # Track content hash of processed paragraphs to avoid duplicate content
-        self.processed_paragraph_content: List[str] = []
         for i in range(-1, self.max_levels):
             self.parents[i] = None
@@ -86,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             self.valid = True
         except Exception as e:
             raise RuntimeError(
-                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
+                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
             ) from e
     @override
@@ -253,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     self._handle_tables(element, docx_obj, doc)
                 except Exception:
                     _log.debug("could not parse a table, broken docx table")
+            # Check for Image
             elif drawing_blip:
                 self._handle_pictures(docx_obj, drawing_blip, doc)
+                # Check for Text after the Image
+                if (
+                    tag_name in ["p"]
+                    and element.find(".//w:t", namespaces=namespaces) is not None
+                ):
+                    self._handle_text_elements(element, docx_obj, doc)
             # Check for the sdt containers, like table of contents
             elif tag_name in ["sdt"]:
                 sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -270,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 self._handle_text_elements(element, docx_obj, doc)
             else:
                 _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
         return doc
     def _str_to_int(
@@ -580,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         all_paragraphs = []
         # Sort paragraphs within each container, then process containers
-        for container_id, paragraphs in container_paragraphs.items():
+        for paragraphs in container_paragraphs.values():
             # Sort by vertical position within each container
             sorted_container_paragraphs = sorted(
                 paragraphs,
@@ -593,9 +598,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             # Add the sorted paragraphs to our processing list
             all_paragraphs.extend(sorted_container_paragraphs)
+        # Track processed paragraphs to avoid duplicates (same content and position)
+        processed_paragraphs = set()
         # Process all the paragraphs
-        for p, _ in all_paragraphs:
-            self._handle_text_elements(p, docx_obj, doc, is_from_textbox=True)
+        for p, position in all_paragraphs:
+            # Create paragraph object to get text content
+            paragraph = Paragraph(p, docx_obj)
+            text_content = paragraph.text
+            # Create a unique identifier based on content and position
+            paragraph_id = (text_content, position)
+            # Skip if this paragraph (same content and position) was already processed
+            if paragraph_id in processed_paragraphs:
+                _log.debug(
+                    f"Skipping duplicate paragraph: content='{text_content[:50]}...', position={position}"
+                )
+                continue
+            # Mark this paragraph as processed
+            processed_paragraphs.add(paragraph_id)
+            self._handle_text_elements(p, docx_obj, doc)
         # Restore original parent
         self.parents[level] = original_parent
@@ -669,26 +694,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         element: BaseOxmlElement,
         docx_obj: DocxDocument,
         doc: DoclingDocument,
-        is_from_textbox: bool = False,
     ) -> None:
         paragraph = Paragraph(element, docx_obj)
-        # Skip if from a textbox and this exact paragraph content was already processed
-        # Skip if from a textbox and this exact paragraph content was already processed
-        raw_text = paragraph.text
-        if is_from_textbox and raw_text:
-            # Create a simple hash of content to detect duplicates
-            content_hash = f"{len(raw_text)}:{raw_text[:50]}"
-            if content_hash in self.processed_paragraph_content:
-                _log.debug(f"Skipping duplicate paragraph content: {content_hash}")
-                return
-            self.processed_paragraph_content.append(content_hash)
-        text, equations = self._handle_equations_in_text(element=element, text=raw_text)
+        paragraph_elements = self._get_paragraph_elements(paragraph)
+        text, equations = self._handle_equations_in_text(
+            element=element, text=paragraph.text
+        )
         if text is None:
             return
-        paragraph_elements = self._get_paragraph_elements(paragraph)
         text = text.strip()
         # Common styles for bullet and numbered lists.
@@ -750,7 +764,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             self._add_header(doc, p_level, text, is_numbered_style)
         elif len(equations) > 0:
-            if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0:
+            if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
+                text
+            ) > 0:
                 # Standalone equation
                 level = self._get_level()
                 doc.add_text(
@@ -902,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         )
         return
+    def _add_formatted_list_item(
+        self,
+        doc: DoclingDocument,
+        elements: list,
+        marker: str,
+        enumerated: bool,
+        level: int,
+    ) -> None:
+        # This should not happen by construction
+        if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
+            return
+        if len(elements) == 1:
+            text, format, hyperlink = elements[0]
+            doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=self.parents[level],
+                text=text,
+                formatting=format,
+                hyperlink=hyperlink,
+            )
+        else:
+            new_item = doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=self.parents[level],
+                text="",
+            )
+            new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
+            for text, format, hyperlink in elements:
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=new_parent,
+                    text=text,
+                    formatting=format,
+                    hyperlink=hyperlink,
+                )
     def _add_list_item(
         self,
         *,
@@ -911,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         elements: list,
         is_numbered: bool = False,
     ) -> None:
+        # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
+        if not elements:
+            return None
         enum_marker = ""
         level = self._get_level()
@@ -927,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         elif (
             self._prev_numid() == numid
             and self.level_at_new_list is not None
@@ -971,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         elif (
             self._prev_numid() == numid
             and self.level_at_new_list is not None
             and prev_indent is not None
             and ilevel < prev_indent
         ):  # Close list
-            for k, v in self.parents.items():
+            for k in self.parents:
                 if k > self.level_at_new_list + ilevel:
                     self.parents[k] = None
@@ -1001,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
             self.listIter = 0
         elif self._prev_numid() == numid or prev_indent == ilevel:
@@ -1023,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level - 1],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level - 1
             )
-            for text, format, hyperlink in elements:
-                # Add the list item to the parent group
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         return
     def _handle_tables(

docling/backend/noop_backend.py ADDED Viewed

@@ -0,0 +1,51 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+_log = logging.getLogger(__name__)
+class NoOpBackend(AbstractDocumentBackend):
+    """
+    A no-op backend that only validates input existence.
+    Used e.g. for audio files where actual processing is handled by the ASR pipeline.
+    """
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+        _log.debug(f"NoOpBackend initialized for: {path_or_stream}")
+        # Validate input
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                # Check if stream has content
+                self.valid = len(self.path_or_stream.getvalue()) > 0
+                _log.debug(
+                    f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
+                )
+            elif isinstance(self.path_or_stream, Path):
+                # Check if file exists
+                self.valid = self.path_or_stream.exists()
+                _log.debug(f"File exists: {self.valid}")
+            else:
+                self.valid = False
+        except Exception as e:
+            _log.error(f"NoOpBackend validation failed: {e}")
+            self.valid = False
+    def is_valid(self) -> bool:
+        return self.valid
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return set(InputFormat)

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError
@@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.utils.locks import pypdfium2_lock
+def get_pdf_page_geometry(
+    ppage: pdfium.PdfPage,
+    angle: float = 0.0,
+    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
+) -> PdfPageGeometry:
+    """
+    Create PdfPageGeometry from a pypdfium2 PdfPage object.
+    Args:
+        ppage: pypdfium2 PdfPage object
+        angle: Page rotation angle in degrees (default: 0.0)
+        boundary_type: The boundary type for the page (default: CROP_BOX)
+    Returns:
+        PdfPageGeometry with all the different bounding boxes properly set
+    """
+    with pypdfium2_lock:
+        # Get the main bounding box (intersection of crop_box and media_box)
+        bbox_tuple = ppage.get_bbox()
+        bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
+        # Get all the different page boxes from pypdfium2
+        media_box_tuple = ppage.get_mediabox()
+        crop_box_tuple = ppage.get_cropbox()
+        art_box_tuple = ppage.get_artbox()
+        bleed_box_tuple = ppage.get_bleedbox()
+        trim_box_tuple = ppage.get_trimbox()
+        # Convert to BoundingBox objects using existing from_tuple method
+        # pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
+        # Use bbox as fallback when specific box types are not defined
+        media_bbox = (
+            BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if media_box_tuple
+            else bbox
+        )
+        crop_bbox = (
+            BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if crop_box_tuple
+            else bbox
+        )
+        art_bbox = (
+            BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if art_box_tuple
+            else bbox
+        )
+        bleed_bbox = (
+            BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if bleed_box_tuple
+            else bbox
+        )
+        trim_bbox = (
+            BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if trim_box_tuple
+            else bbox
+        )
+        return PdfPageGeometry(
+            angle=angle,
+            rect=BoundingRectangle.from_bounding_box(bbox),
+            boundary_type=boundary_type,
+            art_bbox=art_bbox,
+            bleed_bbox=bleed_bbox,
+            crop_bbox=crop_bbox,
+            media_bbox=media_bbox,
+            trim_bbox=trim_bbox,
+        )
 if TYPE_CHECKING:
     from docling.datamodel.document import InputDocument
@@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def is_valid(self) -> bool:
         return self.valid
-    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 0  # 32 * 32
-        page_size = self.get_size()
-        with pypdfium2_lock:
-            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
-                pos = obj.get_pos()
-                cropbox = BoundingBox.from_tuple(
-                    pos, origin=CoordOrigin.BOTTOMLEFT
-                ).to_top_left_origin(page_height=page_size.height)
-                if cropbox.area() > AREA_THRESHOLD:
-                    cropbox = cropbox.scaled(scale=scale)
-                    yield cropbox
-    def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        with pypdfium2_lock:
-            if not self.text_page:
-                self.text_page = self._ppage.get_textpage()
-        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
-            bbox = bbox.to_bottom_left_origin(self.get_size().height)
-        with pypdfium2_lock:
-            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
-        return text_piece
-    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
-    def get_text_cells(self) -> Iterable[TextCell]:
+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from pypdfium."""
         with pypdfium2_lock:
             if not self.text_page:
                 self.text_page = self._ppage.get_textpage()
@@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
             return merged_cells
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
+        return merge_horizontal_cells(cells)
-        # before merge:
-        # draw_clusters_and_cells()
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+        page_size = self.get_size()
+        with pypdfium2_lock:
+            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
+                pos = obj.get_pos()
+                cropbox = BoundingBox.from_tuple(
+                    pos, origin=CoordOrigin.BOTTOMLEFT
+                ).to_top_left_origin(page_height=page_size.height)
-        cells = merge_horizontal_cells(cells)
+                if cropbox.area() > AREA_THRESHOLD:
+                    cropbox = cropbox.scaled(scale=scale)
-        # after merge:
-        # draw_clusters_and_cells()
+                    yield cropbox
-        return cells
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        with pypdfium2_lock:
+            if not self.text_page:
+                self.text_page = self._ppage.get_textpage()
+        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
+            bbox = bbox.to_bottom_left_origin(self.get_size().height)
+        with pypdfium2_lock:
+            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
+        return text_piece
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        if not self.valid:
+            return None
+        text_cells = self._compute_text_cells()
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_textlines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None

docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl

docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl