PyPI - docling - Versions diffs - 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl - Mend

docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

docling/backend/asciidoc_backend.py +39 -18
docling/backend/docling_parse_backend.py +61 -59
docling/backend/docling_parse_v2_backend.py +72 -62
docling/backend/docling_parse_v4_backend.py +21 -19
docling/backend/md_backend.py +101 -81
docling/backend/mspowerpoint_backend.py +72 -113
docling/backend/msword_backend.py +99 -80
docling/backend/noop_backend.py +51 -0
docling/backend/pypdfium2_backend.py +127 -53
docling/cli/main.py +82 -14
docling/datamodel/asr_model_specs.py +92 -0
docling/datamodel/base_models.py +21 -4
docling/datamodel/document.py +3 -1
docling/datamodel/pipeline_options.py +15 -2
docling/datamodel/pipeline_options_asr_model.py +57 -0
docling/datamodel/pipeline_options_vlm_model.py +4 -4
docling/document_converter.py +8 -0
docling/models/api_vlm_model.py +3 -1
docling/models/base_model.py +1 -1
docling/models/base_ocr_model.py +33 -11
docling/models/easyocr_model.py +1 -1
docling/models/layout_model.py +2 -3
docling/models/ocr_mac_model.py +1 -1
docling/models/page_preprocessing_model.py +3 -6
docling/models/rapid_ocr_model.py +1 -1
docling/models/readingorder_model.py +3 -3
docling/models/tesseract_ocr_cli_model.py +4 -3
docling/models/tesseract_ocr_model.py +1 -1
docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
docling/models/vlm_models_inline/mlx_model.py +3 -1
docling/pipeline/asr_pipeline.py +253 -0
docling/pipeline/base_pipeline.py +11 -0
docling/pipeline/standard_pdf_pipeline.py +0 -1
docling/utils/layout_postprocessor.py +11 -6
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0

docling/backend/md_backend.py CHANGED Viewed

@@ -1,17 +1,15 @@
 import logging
 import re
 import warnings
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Set, Union
 import marko
 import marko.element
-import marko.ext
-import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
@@ -21,7 +19,9 @@ from docling_core.types.doc import (
     TableData,
     TextItem,
 )
+from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
 from marko import Markdown
+from pydantic import AnyUrl, TypeAdapter
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
@@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         self.in_table = False
         self.md_table_buffer: list[str] = []
-        self.inline_texts: list[str] = []
         self._html_blocks: int = 0
         try:
@@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 doc.add_table(data=table_data)
         return
-    def _process_inline_text(
-        self, parent_item: Optional[NodeItem], doc: DoclingDocument
-    ):
-        txt = " ".join(self.inline_texts)
-        if len(txt) > 0:
-            doc.add_text(
-                label=DocItemLabel.PARAGRAPH,
-                parent=parent_item,
-                text=txt,
-            )
-        self.inline_texts = []
     def _iterate_elements(  # noqa: C901
         self,
+        *,
         element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
         visited: Set[marko.element.Element],
         parent_item: Optional[NodeItem] = None,
+        formatting: Optional[Formatting] = None,
+        hyperlink: Optional[Union[AnyUrl, Path]] = None,
     ):
         if element in visited:
             return
@@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading) and len(element.children) > 0:
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(
                 f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
+            if len(element.children) == 1:
+                child = element.children[0]
+                snippet_text = str(child.children)  # type: ignore
+                visited.add(child)
+            else:
+                snippet_text = ""  # inline group will be created
             if element.level == 1:
-                doc_label = DocItemLabel.TITLE
+                parent_item = doc.add_title(
+                    text=snippet_text,
+                    parent=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
             else:
-                doc_label = DocItemLabel.SECTION_HEADER
-            # Header could have arbitrary inclusion of bold, italic or emphasis,
-            # hence we need to traverse the tree to get full text of a header
-            strings: List[str] = []
-            # Define a recursive function to traverse the tree
-            def traverse(node: marko.block.BlockElement):
-                # Check if the node has a "children" attribute
-                if hasattr(node, "children"):
-                    # If "children" is a list, continue traversal
-                    if isinstance(node.children, list):
-                        for child in node.children:
-                            traverse(child)
-                    # If "children" is text, add it to header text
-                    elif isinstance(node.children, str):
-                        strings.append(node.children)
-            traverse(element)
-            snippet_text = "".join(strings)
-            if len(snippet_text) > 0:
-                if doc_label == DocItemLabel.SECTION_HEADER:
-                    parent_item = doc.add_heading(
-                        text=snippet_text,
-                        level=element.level - 1,
-                        parent=parent_item,
-                    )
-                else:
-                    parent_item = doc.add_text(
-                        label=doc_label, parent=parent_item, text=snippet_text
-                    )
+                parent_item = doc.add_heading(
+                    text=snippet_text,
+                    level=element.level - 1,
+                    parent=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
         elif isinstance(element, marko.block.List):
             has_non_empty_list_items = False
@@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     break
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         elif (
             isinstance(element, marko.block.ListItem)
-            and len(element.children) > 0
-            and isinstance((first_child := element.children[0]), marko.block.Paragraph)
+            and len(element.children) == 1
+            and isinstance((child := element.children[0]), marko.block.Paragraph)
+            and len(child.children) > 0
         ):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(" - List item")
-            snippet_text = str(first_child.children[0].children)  # type: ignore
-            is_numbered = False
-            if (
-                parent_item is not None
-                and isinstance(parent_item, DocItem)
-                and parent_item.label == GroupLabel.ORDERED_LIST
-            ):
-                is_numbered = True
-            doc.add_list_item(
-                enumerated=is_numbered, parent=parent_item, text=snippet_text
+            if len(child.children) == 1:
+                snippet_text = str(child.children[0].children)  # type: ignore
+                visited.add(child)
+            else:
+                snippet_text = ""  # inline group will be created
+            is_numbered = isinstance(parent_item, OrderedList)
+            if not isinstance(parent_item, (OrderedList, UnorderedList)):
+                _log.warning("ListItem would have not had a list parent, adding one.")
+                parent_item = doc.add_unordered_list(parent=parent_item)
+            parent_item = doc.add_list_item(
+                enumerated=is_numbered,
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
             )
-            visited.add(first_child)
         elif isinstance(element, marko.inline.Image):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
             fig_caption: Optional[TextItem] = None
             if element.title is not None and element.title != "":
                 fig_caption = doc.add_text(
-                    label=DocItemLabel.CAPTION, text=element.title
+                    label=DocItemLabel.CAPTION,
+                    text=element.title,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
                 )
             doc.add_picture(parent=parent_item, caption=fig_caption)
-        elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
-            self._process_inline_text(parent_item, doc)
+        elif isinstance(element, marko.inline.Emphasis):
+            _log.debug(f" - Emphasis: {element.children}")
+            formatting = deepcopy(formatting) if formatting else Formatting()
+            formatting.italic = True
+        elif isinstance(element, marko.inline.StrongEmphasis):
+            _log.debug(f" - StrongEmphasis: {element.children}")
+            formatting = deepcopy(formatting) if formatting else Formatting()
+            formatting.bold = True
+        elif isinstance(element, marko.inline.Link):
+            _log.debug(f" - Link: {element.children}")
+            hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
+                element.dest
+            )
         elif isinstance(element, marko.inline.RawText):
             _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
                 else:
                     self.md_table_buffer.append(snippet_text)
-            else:
+            elif snippet_text:
                 self._close_table(doc)
-                # most likely just inline text
-                self.inline_texts.append(str(element.children))
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=parent_item,
+                    text=snippet_text,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
         elif isinstance(element, marko.inline.CodeSpan):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_code(parent=parent_item, text=snippet_text)
+            doc.add_code(
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )
         elif (
             isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
             and len(element.children) > 0
-            and isinstance((first_child := element.children[0]), marko.inline.RawText)
-            and len(snippet_text := (first_child.children.strip())) > 0
+            and isinstance((child := element.children[0]), marko.inline.RawText)
+            and len(snippet_text := (child.children.strip())) > 0
         ):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Block: {element.children}")
-            doc.add_code(parent=parent_item, text=snippet_text)
+            doc.add_code(
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )
         elif isinstance(element, marko.inline.LineBreak):
             if self.in_table:
@@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         elif isinstance(element, marko.block.HTMLBlock):
             self._html_blocks += 1
-            self._process_inline_text(parent_item, doc)
             self._close_table(doc)
             _log.debug(f"HTML Block: {element}")
             if (
@@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # wrap in markers to enable post-processing in convert()
                 text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
-                doc.add_code(parent=parent_item, text=text_to_add)
+                doc.add_code(
+                    parent=parent_item,
+                    text=text_to_add,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
         else:
             if not isinstance(element, str):
                 self._close_table(doc)
                 _log.debug(f"Some other element: {element}")
+        if (
+            isinstance(element, (marko.block.Paragraph, marko.block.Heading))
+            and len(element.children) > 1
+        ):
+            parent_item = doc.add_inline_group(parent=parent_item)
         processed_block_types = (
-            marko.block.Heading,
+            # marko.block.Heading,
             marko.block.CodeBlock,
             marko.block.FencedCode,
             marko.inline.RawText,
@@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     doc=doc,
                     visited=visited,
                     parent_item=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
                 )
     def is_valid(self) -> bool:
@@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 parent_item=None,
                 visited=set(),
             )
-            self._process_inline_text(None, doc)  # handle last hanging inline text
             self._close_table(doc=doc)  # handle any last hanging table
             # if HTML blocks were detected, export to HTML and delegate to HTML backend

docling/backend/mspowerpoint_backend.py CHANGED Viewed

@@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
 from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
+from pptx.oxml.text import CT_TextLineBreak
 from docling.backend.abstract_backend import (
     DeclarativeDocumentBackend,
@@ -120,136 +121,91 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         return prov
-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
-        is_a_list = False
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
         is_list_group_created = False
         enum_list_item_value = 0
         new_list = None
-        bullet_type = "None"
-        list_label = GroupLabel.LIST
         doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
-        # Identify if shape contains lists
-        for paragraph in shape.text_frame.paragraphs:
-            # Check if paragraph is a bullet point using the `element` XML
+        def is_list_item(paragraph):
+            """Check if the paragraph is a list item."""
             p = paragraph._element
             if (
                 p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
                 is not None
             ):
-                bullet_type = "Bullet"
-                is_a_list = True
+                return (True, "Bullet")
             elif (
                 p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
                 is not None
             ):
-                bullet_type = "Numbered"
-                is_a_list = True
-            else:
-                is_a_list = False
-            if paragraph.level > 0:
+                return (True, "Numbered")
+            elif paragraph.level > 0:
                 # Most likely a sub-list
-                is_a_list = True
-            if is_a_list:
-                # Determine if this is an unordered list or an ordered list.
-                # Set GroupLabel.ORDERED_LIST when it fits.
-                if bullet_type == "Numbered":
-                    list_label = GroupLabel.ORDERED_LIST
-            if is_a_list:
-                _log.debug("LIST DETECTED!")
+                return (True, "None")
             else:
-                _log.debug("No List")
-        # If there is a list inside of the shape, create a new docling list to assign list items to
-        # if is_a_list:
-        #     new_list = doc.add_group(
-        #         label=list_label, name=f"list", parent=parent_slide
-        #     )
+                return (False, "None")
         # Iterate through paragraphs to build up text
         for paragraph in shape.text_frame.paragraphs:
-            # p_text = paragraph.text.strip()
+            is_a_list, bullet_type = is_list_item(paragraph)
             p = paragraph._element
-            enum_list_item_value += 1
-            inline_paragraph_text = ""
-            inline_list_item_text = ""
-            for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
-                if len(e.text.strip()) > 0:
-                    e_is_a_list_item = False
-                    is_numbered = False
-                    if (
-                        p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
-                        is not None
-                    ):
-                        bullet_type = "Bullet"
-                        e_is_a_list_item = True
-                    elif (
-                        p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
-                        is not None
-                    ):
-                        bullet_type = "Numbered"
-                        is_numbered = True
-                        e_is_a_list_item = True
-                    else:
-                        e_is_a_list_item = False
-                    if e_is_a_list_item:
-                        if len(inline_paragraph_text) > 0:
-                            # output accumulated inline text:
-                            doc.add_text(
-                                label=doc_label,
-                                parent=parent_slide,
-                                text=inline_paragraph_text,
-                                prov=prov,
-                            )
-                        # Set marker and enumerated arguments if this is an enumeration element.
-                        inline_list_item_text += e.text
-                        # print(e.text)
-                    else:
-                        # Assign proper label to the text, depending if it's a Title or Section Header
-                        # For other types of text, assign - PARAGRAPH
-                        doc_label = DocItemLabel.PARAGRAPH
-                        if shape.is_placeholder:
-                            placeholder_type = shape.placeholder_format.type
-                            if placeholder_type in [
-                                PP_PLACEHOLDER.CENTER_TITLE,
-                                PP_PLACEHOLDER.TITLE,
-                            ]:
-                                # It's a title
-                                doc_label = DocItemLabel.TITLE
-                            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
-                                DocItemLabel.SECTION_HEADER
-                        enum_list_item_value = 0
-                        inline_paragraph_text += e.text
-            if len(inline_paragraph_text) > 0:
-                # output accumulated inline text:
-                doc.add_text(
-                    label=doc_label,
-                    parent=parent_slide,
-                    text=inline_paragraph_text,
-                    prov=prov,
-                )
-            if len(inline_list_item_text) > 0:
+            # Convert line breaks to spaces and accumulate text
+            p_text = ""
+            for e in p.content_children:
+                if isinstance(e, CT_TextLineBreak):
+                    p_text += " "
+                else:
+                    p_text += e.text
+            if is_a_list:
                 enum_marker = ""
-                if is_numbered:
-                    enum_marker = str(enum_list_item_value) + "."
+                enumerated = bullet_type == "Numbered"
                 if not is_list_group_created:
                     new_list = doc.add_group(
-                        label=list_label, name="list", parent=parent_slide
+                        label=GroupLabel.ORDERED_LIST
+                        if enumerated
+                        else GroupLabel.LIST,
+                        name="list",
+                        parent=parent_slide,
                     )
                     is_list_group_created = True
+                    enum_list_item_value = 0
+                if enumerated:
+                    enum_list_item_value += 1
+                    enum_marker = str(enum_list_item_value) + "."
                 doc.add_list_item(
                     marker=enum_marker,
-                    enumerated=is_numbered,
+                    enumerated=enumerated,
                     parent=new_list,
-                    text=inline_list_item_text,
+                    text=p_text,
+                    prov=prov,
+                )
+            else:  # is paragraph not a list item
+                # Assign proper label to the text, depending if it's a Title or Section Header
+                # For other types of text, assign - PARAGRAPH
+                doc_label = DocItemLabel.PARAGRAPH
+                if shape.is_placeholder:
+                    placeholder_type = shape.placeholder_format.type
+                    if placeholder_type in [
+                        PP_PLACEHOLDER.CENTER_TITLE,
+                        PP_PLACEHOLDER.TITLE,
+                    ]:
+                        # It's a title
+                        doc_label = DocItemLabel.TITLE
+                    elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                        DocItemLabel.SECTION_HEADER
+                # output accumulated inline text:
+                doc.add_text(
+                    label=doc_label,
+                    parent=parent_slide,
+                    text=p_text,
                     prov=prov,
                 )
         return
@@ -423,18 +379,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
             # Handle notes slide
             if slide.has_notes_slide:
                 notes_slide = slide.notes_slide
-                notes_text = notes_slide.notes_text_frame.text.strip()
-                if notes_text:
-                    bbox = BoundingBox(l=0, t=0, r=0, b=0)
-                    prov = ProvenanceItem(
-                        page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
-                    )
-                    doc.add_text(
-                        label=DocItemLabel.TEXT,
-                        parent=parent_slide,
-                        text=notes_text,
-                        prov=prov,
-                        content_layer=ContentLayer.FURNITURE,
-                    )
+                if notes_slide.notes_text_frame is not None:
+                    notes_text = notes_slide.notes_text_frame.text.strip()
+                    if notes_text:
+                        bbox = BoundingBox(l=0, t=0, r=0, b=0)
+                        prov = ProvenanceItem(
+                            page_no=slide_ind + 1,
+                            charspan=[0, len(notes_text)],
+                            bbox=bbox,
+                        )
+                        doc.add_text(
+                            label=DocItemLabel.TEXT,
+                            parent=parent_slide,
+                            text=notes_text,
+                            prov=prov,
+                            content_layer=ContentLayer.FURNITURE,
+                        )
         return doc

docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl

docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl