PyPI - docling - Versions diffs - 2.16.0__py3-none-any.whl → 2.18.0__py3-none-any.whl - Mend

docling 2.16.0__py3-none-any.whl → 2.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

docling/backend/html_backend.py +21 -20
docling/backend/md_backend.py +92 -43
docling/backend/mspowerpoint_backend.py +39 -27
docling/backend/msword_backend.py +172 -130
docling/backend/xml/uspto_backend.py +25 -25
docling/cli/main.py +18 -3
docling/datamodel/document.py +4 -0
docling/datamodel/pipeline_options.py +1 -0
docling/datamodel/settings.py +16 -1
docling/document_converter.py +12 -2
docling/models/rapid_ocr_model.py +1 -0
docling/models/table_structure_model.py +9 -5
docling/models/tesseract_ocr_cli_model.py +72 -4
docling/models/tesseract_ocr_model.py +37 -37
docling/pipeline/base_pipeline.py +3 -1
docling/utils/glm_utils.py +4 -0
docling/utils/ocr_utils.py +9 -0
{docling-2.16.0.dist-info → docling-2.18.0.dist-info}/METADATA +20 -12
{docling-2.16.0.dist-info → docling-2.18.0.dist-info}/RECORD +22 -21
{docling-2.16.0.dist-info → docling-2.18.0.dist-info}/WHEEL +1 -1
{docling-2.16.0.dist-info → docling-2.18.0.dist-info}/LICENSE +0 -0
{docling-2.16.0.dist-info → docling-2.18.0.dist-info}/entry_points.txt +0 -0

docling/backend/html_backend.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import Optional, Set, Union
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 from docling_core.types.doc import (
     DocItemLabel,
     DoclingDocument,
@@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
         _log.debug("About to init HTML backend...")
-        self.soup = None
+        self.soup: Optional[Tag] = None
         # HTML file:
         self.path_or_stream = path_or_stream
         # Initialise the parents for the hierarchy
@@ -78,17 +78,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if self.is_valid():
             assert self.soup is not None
+            content = self.soup.body or self.soup
             # Replace <br> tags with newline characters
-            for br in self.soup.body.find_all("br"):
+            for br in content.find_all("br"):
                 br.replace_with("\n")
-            doc = self.walk(self.soup.body, doc)
+            doc = self.walk(content, doc)
         else:
             raise RuntimeError(
                 f"Cannot convert doc with {self.document_hash} because the backend failed to init."
             )
         return doc
-    def walk(self, element, doc):
+    def walk(self, element: Tag, doc: DoclingDocument):
         try:
             # Iterate over elements in the body of the document
             for idx, element in enumerate(element.children):
@@ -105,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return doc
-    def analyse_element(self, element, idx, doc):
+    def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
         """
         if element.name!=None:
             _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
@@ -135,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         else:
             self.walk(element, doc)
-    def get_direct_text(self, item):
+    def get_direct_text(self, item: Tag):
         """Get the direct text of the <li> element (ignoring nested lists)."""
         text = item.find(string=True, recursive=False)
         if isinstance(text, str):
@@ -144,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return ""
     # Function to recursively extract text from all child nodes
-    def extract_text_recursively(self, item):
+    def extract_text_recursively(self, item: Tag):
         result = []
         if isinstance(item, str):
@@ -165,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return "".join(result) + " "
-    def handle_header(self, element, idx, doc):
+    def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles header tags (h1, h2, etc.)."""
         hlevel = int(element.name.replace("h", ""))
         slevel = hlevel - 1
@@ -207,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 level=hlevel,
             )
-    def handle_code(self, element, idx, doc):
+    def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles monospace code snippets (pre)."""
         if element.text is None:
             return
@@ -215,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         label = DocItemLabel.CODE
         if len(text) == 0:
             return
-        doc.add_code(parent=self.parents[self.level], label=label, text=text)
+        doc.add_code(parent=self.parents[self.level], text=text)
-    def handle_paragraph(self, element, idx, doc):
+    def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles paragraph tags (p)."""
         if element.text is None:
             return
@@ -227,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             return
         doc.add_text(parent=self.parents[self.level], label=label, text=text)
-    def handle_list(self, element, idx, doc):
+    def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles list tags (ul, ol) and their list items."""
         if element.name == "ul":
@@ -249,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.parents[self.level + 1] = None
         self.level -= 1
-    def handle_listitem(self, element, idx, doc):
+    def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles listitem tags (li)."""
         nested_lists = element.find(["ul", "ol"])
@@ -303,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         else:
             _log.warn("list-item has no text: ", element)
-    def handle_table(self, element, idx, doc):
+    def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles table tags."""
         nested_tables = element.find("table")
@@ -376,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         doc.add_table(data=data, parent=self.parents[self.level])
-    def get_list_text(self, list_element, level=0):
+    def get_list_text(self, list_element: Tag, level=0):
         """Recursively extract text from <ul> or <ol> with proper indentation."""
         result = []
         bullet_char = "*"  # Default bullet character for unordered lists
@@ -402,7 +403,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return result
-    def extract_table_cell_text(self, cell):
+    def extract_table_cell_text(self, cell: Tag):
         """Extract text from a table cell, including lists with indents."""
         contains_lists = cell.find(["ul", "ol"])
         if contains_lists is None:
@@ -413,7 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             )
             return cell.text
-    def handle_figure(self, element, idx, doc):
+    def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles image tags (img)."""
         # Extract the image URI from the <img> tag
@@ -436,6 +437,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 caption=fig_caption,
             )
-    def handle_image(self, element, idx, doc):
+    def handle_image(self, element: Tag, idx, doc: DoclingDocument):
         """Handles image tags (img)."""
         doc.add_picture(parent=self.parents[self.level], caption=None)

docling/backend/md_backend.py CHANGED Viewed

@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import List, Optional, Set, Union
 import marko
+import marko.element
 import marko.ext
 import marko.ext.gfm
 import marko.inline
@@ -23,11 +24,16 @@ from docling_core.types.doc import (
 from marko import Markdown
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
+_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
+_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
+_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
     def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
@@ -65,7 +71,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         self.in_table = False
         self.md_table_buffer: list[str] = []
-        self.inline_text_buffer = ""
+        self.inline_texts: list[str] = []
+        self._html_blocks: int = 0
         try:
             if isinstance(self.path_or_stream, BytesIO):
@@ -152,26 +159,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
     def process_inline_text(
         self, parent_element: Optional[NodeItem], doc: DoclingDocument
     ):
-        # self.inline_text_buffer += str(text_in)
-        txt = self.inline_text_buffer.strip()
+        txt = " ".join(self.inline_texts)
         if len(txt) > 0:
             doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 parent=parent_element,
                 text=txt,
             )
-        self.inline_text_buffer = ""
+        self.inline_texts = []
     def iterate_elements(
         self,
-        element: marko.block.Element,
+        element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
         parent_element: Optional[NodeItem] = None,
     ):
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
-        if isinstance(element, marko.block.Heading):
+        if isinstance(element, marko.block.Heading) and len(element.children) > 0:
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(
@@ -206,17 +212,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 )
         elif isinstance(element, marko.block.List):
+            has_non_empty_list_items = False
+            for child in element.children:
+                if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
+                    has_non_empty_list_items = True
+                    break
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
-            list_label = GroupLabel.LIST
-            if element.ordered:
-                list_label = GroupLabel.ORDERED_LIST
-            parent_element = doc.add_group(
-                label=list_label, name=f"list", parent=parent_element
-            )
+            if has_non_empty_list_items:
+                label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
+                parent_element = doc.add_group(
+                    label=label, name=f"list", parent=parent_element
+                )
-        elif isinstance(element, marko.block.ListItem):
+        elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(" - List item")
@@ -246,29 +257,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             doc.add_picture(parent=parent_element, caption=fig_caption)
-        elif isinstance(element, marko.block.Paragraph):
+        elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
             self.process_inline_text(parent_element, doc)
         elif isinstance(element, marko.inline.RawText):
             _log.debug(f" - Paragraph (raw text): {element.children}")
-            snippet_text = str(element.children).strip()
+            snippet_text = element.children.strip()
             # Detect start of the table:
             if "|" in snippet_text:
                 # most likely part of the markdown table
                 self.in_table = True
                 if len(self.md_table_buffer) > 0:
-                    self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
-                        snippet_text
-                    )
+                    self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
                 else:
                     self.md_table_buffer.append(snippet_text)
             else:
                 self.close_table(doc)
                 self.in_table = False
                 # most likely just inline text
-                self.inline_text_buffer += str(
-                    element.children
-                )  # do not strip an inline text, as it may contain important spaces
+                self.inline_texts.append(str(element.children))
         elif isinstance(element, marko.inline.CodeSpan):
             self.close_table(doc)
@@ -277,50 +284,55 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             snippet_text = str(element.children).strip()
             doc.add_code(parent=parent_element, text=snippet_text)
-        elif isinstance(element, marko.block.CodeBlock):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
-            _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()  # type: ignore
-            doc.add_code(parent=parent_element, text=snippet_text)
-        elif isinstance(element, marko.block.FencedCode):
+        elif (
+            isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
+            and len(element.children) > 0
+            and isinstance((first_child := element.children[0]), marko.inline.RawText)
+            and len(snippet_text := (first_child.children.strip())) > 0
+        ):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()  # type: ignore
             doc.add_code(parent=parent_element, text=snippet_text)
         elif isinstance(element, marko.inline.LineBreak):
-            self.process_inline_text(parent_element, doc)
             if self.in_table:
                 _log.debug("Line break in a table")
                 self.md_table_buffer.append("")
         elif isinstance(element, marko.block.HTMLBlock):
+            self._html_blocks += 1
             self.process_inline_text(parent_element, doc)
             self.close_table(doc)
             _log.debug("HTML Block: {}".format(element))
             if (
-                len(element.children) > 0
+                len(element.body) > 0
             ):  # If Marko doesn't return any content for HTML block, skip it
-                snippet_text = str(element.children).strip()
-                doc.add_text(
-                    label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-                )
+                html_block = element.body.strip()
+                # wrap in markers to enable post-processing in convert()
+                text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
+                doc.add_code(parent=parent_element, text=text_to_add)
         else:
             if not isinstance(element, str):
                 self.close_table(doc)
                 _log.debug("Some other element: {}".format(element))
+        processed_block_types = (
+            marko.block.ListItem,
+            marko.block.Heading,
+            marko.block.CodeBlock,
+            marko.block.FencedCode,
+            # marko.block.Paragraph,
+            marko.inline.RawText,
+        )
         # Iterate through the element's children (if any)
-        if not isinstance(element, marko.block.ListItem):
-            if not isinstance(element, marko.block.Heading):
-                if not isinstance(element, marko.block.FencedCode):
-                    # if not isinstance(element, marko.block.Paragraph):
-                    if hasattr(element, "children"):
-                        for child in element.children:
-                            self.iterate_elements(child, depth + 1, doc, parent_element)
+        if hasattr(element, "children") and not isinstance(
+            element, processed_block_types
+        ):
+            for child in element.children:
+                self.iterate_elements(child, depth + 1, doc, parent_element)
     def is_valid(self) -> bool:
         return self.valid
@@ -356,6 +368,43 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             # Start iterating from the root of the AST
             self.iterate_elements(parsed_ast, 0, doc, None)
             self.process_inline_text(None, doc)  # handle last hanging inline text
+            self.close_table(doc=doc)  # handle any last hanging table
+            # if HTML blocks were detected, export to HTML and delegate to HTML backend
+            if self._html_blocks > 0:
+                # export to HTML
+                html_backend_cls = HTMLDocumentBackend
+                html_str = doc.export_to_html()
+                def _restore_original_html(txt, regex):
+                    _txt, count = re.subn(regex, "", txt)
+                    if count != self._html_blocks:
+                        raise RuntimeError(
+                            "An internal error has occurred during Markdown conversion."
+                        )
+                    return _txt
+                # restore original HTML by removing previouly added markers
+                for regex in [
+                    rf"<pre>\s*<code>\s*{_START_MARKER}",
+                    rf"{_STOP_MARKER}\s*</code>\s*</pre>",
+                ]:
+                    html_str = _restore_original_html(txt=html_str, regex=regex)
+                self._html_blocks = 0
+                # delegate to HTML backend
+                stream = BytesIO(bytes(html_str, encoding="utf-8"))
+                in_doc = InputDocument(
+                    path_or_stream=stream,
+                    format=InputFormat.HTML,
+                    backend=html_backend_cls,
+                    filename=self.file.name,
+                )
+                html_backend_obj = html_backend_cls(
+                    in_doc=in_doc, path_or_stream=stream
+                )
+                doc = html_backend_obj.convert()
         else:
             raise RuntimeError(
                 f"Cannot convert md with {self.document_hash} because the backend failed to init."

docling/backend/mspowerpoint_backend.py CHANGED Viewed

@@ -98,21 +98,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         return doc
-    def generate_prov(self, shape, slide_ind, text=""):
-        left = shape.left
-        top = shape.top
-        width = shape.width
-        height = shape.height
+    def generate_prov(
+        self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
+    ):
+        if shape.left:
+            left = shape.left
+            top = shape.top
+            width = shape.width
+            height = shape.height
+        else:
+            left = 0
+            top = 0
+            width = slide_size.width
+            height = slide_size.height
         shape_bbox = [left, top, left + width, top + height]
         shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
-        # prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
         prov = ProvenanceItem(
             page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
         )
         return prov
-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
         is_a_list = False
         is_list_group_created = False
         enum_list_item_value = 0
@@ -121,7 +128,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         list_text = ""
         list_label = GroupLabel.LIST
         doc_label = DocItemLabel.LIST_ITEM
-        prov = self.generate_prov(shape, slide_ind, shape.text.strip())
+        prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
         # Identify if shape contains lists
         for paragraph in shape.text_frame.paragraphs:
@@ -270,18 +277,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                 )
         return
-    def handle_pictures(self, shape, parent_slide, slide_ind, doc):
-        # Get the image bytes
-        image = shape.image
-        image_bytes = image.blob
-        im_dpi, _ = image.dpi
+    def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
         # Open it with PIL
         try:
+            # Get the image bytes
+            image = shape.image
+            image_bytes = image.blob
+            im_dpi, _ = image.dpi
             pil_image = Image.open(BytesIO(image_bytes))
             # shape has picture
-            prov = self.generate_prov(shape, slide_ind, "")
+            prov = self.generate_prov(shape, slide_ind, "", slide_size)
             doc.add_picture(
                 parent=parent_slide,
                 image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
@@ -292,13 +298,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
             _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
         return
-    def handle_tables(self, shape, parent_slide, slide_ind, doc):
+    def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
         # Handling tables, images, charts
         if shape.has_table:
             table = shape.table
             table_xml = shape._element
-            prov = self.generate_prov(shape, slide_ind, "")
+            prov = self.generate_prov(shape, slide_ind, "", slide_size)
             num_cols = 0
             num_rows = len(table.rows)
@@ -375,17 +381,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                 name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
             )
-            size = Size(width=slide_width, height=slide_height)
-            parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
+            slide_size = Size(width=slide_width, height=slide_height)
+            parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
-            def handle_shapes(shape, parent_slide, slide_ind, doc):
-                handle_groups(shape, parent_slide, slide_ind, doc)
+            def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
+                handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
                 if shape.has_table:
                     # Handle Tables
-                    self.handle_tables(shape, parent_slide, slide_ind, doc)
+                    self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
                 if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                     # Handle Pictures
-                    self.handle_pictures(shape, parent_slide, slide_ind, doc)
+                    self.handle_pictures(
+                        shape, parent_slide, slide_ind, doc, slide_size
+                    )
                 # If shape doesn't have any text, move on to the next shape
                 if not hasattr(shape, "text"):
                     return
@@ -397,16 +405,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                     _log.warning("Warning: shape has text but not text_frame")
                     return
                 # Handle other text elements, including lists (bullet lists, numbered lists)
-                self.handle_text_elements(shape, parent_slide, slide_ind, doc)
+                self.handle_text_elements(
+                    shape, parent_slide, slide_ind, doc, slide_size
+                )
                 return
-            def handle_groups(shape, parent_slide, slide_ind, doc):
+            def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
                 if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                     for groupedshape in shape.shapes:
-                        handle_shapes(groupedshape, parent_slide, slide_ind, doc)
+                        handle_shapes(
+                            groupedshape, parent_slide, slide_ind, doc, slide_size
+                        )
             # Loop through each shape in the slide
             for shape in slide.shapes:
-                handle_shapes(shape, parent_slide, slide_ind, doc)
+                handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
         return doc

docling 2.16.0__py3-none-any.whl → 2.18.0__py3-none-any.whl

docling 2.16.0__py3-none-any.whl → 2.18.0__py3-none-any.whl