PyPI - docling - Versions diffs - 2.17.0__tar.gz → 2.19.0__tar.gz - Mend

docling 2.17.0__tar.gz → 2.19.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

{docling-2.17.0 → docling-2.19.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.17.0
+Version: 2.19.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -19,16 +19,17 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: ocrmac
 Provides-Extra: rapidocr
 Provides-Extra: tesserocr
-Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
+Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.17.2,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
-Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
+Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -48,8 +49,10 @@ Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
 Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
 Requires-Dist: requests (>=2.32.2,<3.0.0)
 Requires-Dist: rtree (>=1.3.0,<2.0.0)
-Requires-Dist: scipy (>=1.6.0,<2.0.0)
+Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
+Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
 Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
+Requires-Dist: tqdm (>=4.65.0,<5.0.0)
 Requires-Dist: typer (>=0.12.5,<0.13.0)
 Project-URL: Repository, https://github.com/DS4SD/docling
 Description-Content-Type: text/markdown
@@ -94,6 +97,9 @@ Docling simplifies document processing, parsing diverse formats — including ad
 ### Coming soon
 * 📝 Metadata extraction, including title, authors, references & language
+* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
+* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
+* 📝 Complex chemistry understanding (Molecular structures)
 ## Installation

{docling-2.17.0 → docling-2.19.0}/README.md RENAMED Viewed

@@ -38,6 +38,9 @@ Docling simplifies document processing, parsing diverse formats — including ad
 ### Coming soon
 * 📝 Metadata extraction, including title, authors, references & language
+* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
+* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
+* 📝 Complex chemistry understanding (Molecular structures)
 ## Installation

{docling-2.17.0 → docling-2.19.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -1,9 +1,9 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import Optional, Set, Union
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 from docling_core.types.doc import (
     DocItemLabel,
     DoclingDocument,
@@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
         _log.debug("About to init HTML backend...")
-        self.soup = None
+        self.soup: Optional[Tag] = None
         # HTML file:
         self.path_or_stream = path_or_stream
         # Initialise the parents for the hierarchy
@@ -89,7 +89,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             )
         return doc
-    def walk(self, element, doc):
+    def walk(self, element: Tag, doc: DoclingDocument):
         try:
             # Iterate over elements in the body of the document
             for idx, element in enumerate(element.children):
@@ -106,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return doc
-    def analyse_element(self, element, idx, doc):
+    def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
         """
         if element.name!=None:
             _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
@@ -136,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         else:
             self.walk(element, doc)
-    def get_direct_text(self, item):
+    def get_direct_text(self, item: Tag):
         """Get the direct text of the <li> element (ignoring nested lists)."""
         text = item.find(string=True, recursive=False)
         if isinstance(text, str):
@@ -145,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return ""
     # Function to recursively extract text from all child nodes
-    def extract_text_recursively(self, item):
+    def extract_text_recursively(self, item: Tag):
         result = []
         if isinstance(item, str):
@@ -166,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return "".join(result) + " "
-    def handle_header(self, element, idx, doc):
+    def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles header tags (h1, h2, etc.)."""
         hlevel = int(element.name.replace("h", ""))
         slevel = hlevel - 1
@@ -208,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 level=hlevel,
             )
-    def handle_code(self, element, idx, doc):
+    def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles monospace code snippets (pre)."""
         if element.text is None:
             return
@@ -216,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         label = DocItemLabel.CODE
         if len(text) == 0:
             return
-        doc.add_code(parent=self.parents[self.level], label=label, text=text)
+        doc.add_code(parent=self.parents[self.level], text=text)
-    def handle_paragraph(self, element, idx, doc):
+    def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles paragraph tags (p)."""
         if element.text is None:
             return
@@ -228,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             return
         doc.add_text(parent=self.parents[self.level], label=label, text=text)
-    def handle_list(self, element, idx, doc):
+    def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles list tags (ul, ol) and their list items."""
         if element.name == "ul":
@@ -250,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.parents[self.level + 1] = None
         self.level -= 1
-    def handle_listitem(self, element, idx, doc):
+    def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles listitem tags (li)."""
         nested_lists = element.find(["ul", "ol"])
@@ -304,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         else:
             _log.warn("list-item has no text: ", element)
-    def handle_table(self, element, idx, doc):
+    def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles table tags."""
         nested_tables = element.find("table")
@@ -377,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         doc.add_table(data=data, parent=self.parents[self.level])
-    def get_list_text(self, list_element, level=0):
+    def get_list_text(self, list_element: Tag, level=0):
         """Recursively extract text from <ul> or <ol> with proper indentation."""
         result = []
         bullet_char = "*"  # Default bullet character for unordered lists
@@ -403,7 +403,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return result
-    def extract_table_cell_text(self, cell):
+    def extract_table_cell_text(self, cell: Tag):
         """Extract text from a table cell, including lists with indents."""
         contains_lists = cell.find(["ul", "ol"])
         if contains_lists is None:
@@ -414,7 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             )
             return cell.text
-    def handle_figure(self, element, idx, doc):
+    def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
         """Handles image tags (img)."""
         # Extract the image URI from the <img> tag
@@ -437,6 +437,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 caption=fig_caption,
             )
-    def handle_image(self, element, idx, doc):
+    def handle_image(self, element: Tag, idx, doc: DoclingDocument):
         """Handles image tags (img)."""
         doc.add_picture(parent=self.parents[self.level], caption=None)

{docling-2.17.0 → docling-2.19.0}/docling/backend/md_backend.py RENAMED Viewed

@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import List, Optional, Set, Union
 import marko
+import marko.element
 import marko.ext
 import marko.ext.gfm
 import marko.inline
@@ -23,14 +24,19 @@ from docling_core.types.doc import (
 from marko import Markdown
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
+_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
+_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
+_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-    def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
+    def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
         pattern = r"_+"
@@ -66,6 +72,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         self.in_table = False
         self.md_table_buffer: list[str] = []
         self.inline_texts: list[str] = []
+        self._html_blocks: int = 0
         try:
             if isinstance(self.path_or_stream, BytesIO):
@@ -74,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # very long sequences of underscores will lead to unnecessary long processing times.
                 # In any proper Markdown files, underscores have to be escaped,
                 # otherwise they represent emphasis (bold or italic)
-                self.markdown = self.shorten_underscore_sequences(text_stream)
+                self.markdown = self._shorten_underscore_sequences(text_stream)
             if isinstance(self.path_or_stream, Path):
                 with open(self.path_or_stream, "r", encoding="utf-8") as f:
                     md_content = f.read()
@@ -82,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     # very long sequences of underscores will lead to unnecessary long processing times.
                     # In any proper Markdown files, underscores have to be escaped,
                     # otherwise they represent emphasis (bold or italic)
-                    self.markdown = self.shorten_underscore_sequences(md_content)
+                    self.markdown = self._shorten_underscore_sequences(md_content)
             self.valid = True
             _log.debug(self.markdown)
@@ -92,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             ) from e
         return
-    def close_table(self, doc: DoclingDocument):
+    def _close_table(self, doc: DoclingDocument):
         if self.in_table:
             _log.debug("=== TABLE START ===")
             for md_table_row in self.md_table_buffer:
@@ -149,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 doc.add_table(data=table_data)
         return
-    def process_inline_text(
-        self, parent_element: Optional[NodeItem], doc: DoclingDocument
+    def _process_inline_text(
+        self, parent_item: Optional[NodeItem], doc: DoclingDocument
     ):
         txt = " ".join(self.inline_texts)
         if len(txt) > 0:
             doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
-                parent=parent_element,
+                parent=parent_item,
                 text=txt,
             )
         self.inline_texts = []
-    def iterate_elements(
+    def _iterate_elements(
         self,
-        element: marko.block.Element,
+        element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
-        parent_element: Optional[NodeItem] = None,
+        visited: Set[marko.element.Element],
+        parent_item: Optional[NodeItem] = None,
     ):
+        if element in visited:
+            return
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
-        if isinstance(element, marko.block.Heading):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+        if isinstance(element, marko.block.Heading) and len(element.children) > 0:
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(
                 f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
@@ -200,41 +212,48 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             traverse(element)
             snippet_text = "".join(strings)
             if len(snippet_text) > 0:
-                parent_element = doc.add_text(
-                    label=doc_label, parent=parent_element, text=snippet_text
+                parent_item = doc.add_text(
+                    label=doc_label, parent=parent_item, text=snippet_text
                 )
         elif isinstance(element, marko.block.List):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            has_non_empty_list_items = False
+            for child in element.children:
+                if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
+                    has_non_empty_list_items = True
+                    break
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
-            list_label = GroupLabel.LIST
-            if element.ordered:
-                list_label = GroupLabel.ORDERED_LIST
-            parent_element = doc.add_group(
-                label=list_label, name=f"list", parent=parent_element
-            )
+            if has_non_empty_list_items:
+                label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
+                parent_item = doc.add_group(
+                    label=label, name=f"list", parent=parent_item
+                )
-        elif isinstance(element, marko.block.ListItem):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+        elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(" - List item")
-            snippet_text = str(element.children[0].children[0].children)  # type: ignore
+            first_child = element.children[0]
+            snippet_text = str(first_child.children[0].children)  # type: ignore
             is_numbered = False
             if (
-                parent_element is not None
-                and isinstance(parent_element, DocItem)
-                and parent_element.label == GroupLabel.ORDERED_LIST
+                parent_item is not None
+                and isinstance(parent_item, DocItem)
+                and parent_item.label == GroupLabel.ORDERED_LIST
             ):
                 is_numbered = True
             doc.add_list_item(
-                enumerated=is_numbered, parent=parent_element, text=snippet_text
+                enumerated=is_numbered, parent=parent_item, text=snippet_text
             )
+            visited.add(first_child)
         elif isinstance(element, marko.inline.Image):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
             fig_caption: Optional[TextItem] = None
@@ -243,50 +262,44 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     label=DocItemLabel.CAPTION, text=element.title
                 )
-            doc.add_picture(parent=parent_element, caption=fig_caption)
+            doc.add_picture(parent=parent_item, caption=fig_caption)
-        elif isinstance(element, marko.block.Paragraph):
-            self.process_inline_text(parent_element, doc)
+        elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
+            self._process_inline_text(parent_item, doc)
         elif isinstance(element, marko.inline.RawText):
             _log.debug(f" - Paragraph (raw text): {element.children}")
-            snippet_text = str(element.children).strip()
+            snippet_text = element.children.strip()
             # Detect start of the table:
             if "|" in snippet_text:
                 # most likely part of the markdown table
                 self.in_table = True
                 if len(self.md_table_buffer) > 0:
-                    self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
-                        snippet_text
-                    )
+                    self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
                 else:
                     self.md_table_buffer.append(snippet_text)
             else:
-                self.close_table(doc)
-                self.in_table = False
+                self._close_table(doc)
                 # most likely just inline text
                 self.inline_texts.append(str(element.children))
         elif isinstance(element, marko.inline.CodeSpan):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_code(parent=parent_element, text=snippet_text)
-        elif isinstance(element, marko.block.CodeBlock):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
-            _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()  # type: ignore
-            doc.add_code(parent=parent_element, text=snippet_text)
-        elif isinstance(element, marko.block.FencedCode):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            doc.add_code(parent=parent_item, text=snippet_text)
+        elif (
+            isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
+            and len(element.children) > 0
+            and isinstance((first_child := element.children[0]), marko.inline.RawText)
+            and len(snippet_text := (first_child.children.strip())) > 0
+        ):
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()  # type: ignore
-            doc.add_code(parent=parent_element, text=snippet_text)
+            doc.add_code(parent=parent_item, text=snippet_text)
         elif isinstance(element, marko.inline.LineBreak):
             if self.in_table:
@@ -294,29 +307,42 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 self.md_table_buffer.append("")
         elif isinstance(element, marko.block.HTMLBlock):
-            self.process_inline_text(parent_element, doc)
-            self.close_table(doc)
+            self._html_blocks += 1
+            self._process_inline_text(parent_item, doc)
+            self._close_table(doc)
             _log.debug("HTML Block: {}".format(element))
             if (
-                len(element.children) > 0
+                len(element.body) > 0
             ):  # If Marko doesn't return any content for HTML block, skip it
-                snippet_text = str(element.children).strip()
-                doc.add_text(
-                    label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-                )
+                html_block = element.body.strip()
+                # wrap in markers to enable post-processing in convert()
+                text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
+                doc.add_code(parent=parent_item, text=text_to_add)
         else:
             if not isinstance(element, str):
-                self.close_table(doc)
+                self._close_table(doc)
                 _log.debug("Some other element: {}".format(element))
+        processed_block_types = (
+            marko.block.Heading,
+            marko.block.CodeBlock,
+            marko.block.FencedCode,
+            marko.inline.RawText,
+        )
         # Iterate through the element's children (if any)
-        if not isinstance(element, marko.block.ListItem):
-            if not isinstance(element, marko.block.Heading):
-                if not isinstance(element, marko.block.FencedCode):
-                    # if not isinstance(element, marko.block.Paragraph):
-                    if hasattr(element, "children"):
-                        for child in element.children:
-                            self.iterate_elements(child, depth + 1, doc, parent_element)
+        if hasattr(element, "children") and not isinstance(
+            element, processed_block_types
+        ):
+            for child in element.children:
+                self._iterate_elements(
+                    element=child,
+                    depth=depth + 1,
+                    doc=doc,
+                    visited=visited,
+                    parent_item=parent_item,
+                )
     def is_valid(self) -> bool:
         return self.valid
@@ -350,8 +376,51 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             marko_parser = Markdown()
             parsed_ast = marko_parser.parse(self.markdown)
             # Start iterating from the root of the AST
-            self.iterate_elements(parsed_ast, 0, doc, None)
-            self.process_inline_text(None, doc)  # handle last hanging inline text
+            self._iterate_elements(
+                element=parsed_ast,
+                depth=0,
+                doc=doc,
+                parent_item=None,
+                visited=set(),
+            )
+            self._process_inline_text(None, doc)  # handle last hanging inline text
+            self._close_table(doc=doc)  # handle any last hanging table
+            # if HTML blocks were detected, export to HTML and delegate to HTML backend
+            if self._html_blocks > 0:
+                # export to HTML
+                html_backend_cls = HTMLDocumentBackend
+                html_str = doc.export_to_html()
+                def _restore_original_html(txt, regex):
+                    _txt, count = re.subn(regex, "", txt)
+                    if count != self._html_blocks:
+                        raise RuntimeError(
+                            "An internal error has occurred during Markdown conversion."
+                        )
+                    return _txt
+                # restore original HTML by removing previouly added markers
+                for regex in [
+                    rf"<pre>\s*<code>\s*{_START_MARKER}",
+                    rf"{_STOP_MARKER}\s*</code>\s*</pre>",
+                ]:
+                    html_str = _restore_original_html(txt=html_str, regex=regex)
+                self._html_blocks = 0
+                # delegate to HTML backend
+                stream = BytesIO(bytes(html_str, encoding="utf-8"))
+                in_doc = InputDocument(
+                    path_or_stream=stream,
+                    format=InputFormat.HTML,
+                    backend=html_backend_cls,
+                    filename=self.file.name,
+                )
+                html_backend_obj = html_backend_cls(
+                    in_doc=in_doc, path_or_stream=stream
+                )
+                doc = html_backend_obj.convert()
         else:
             raise RuntimeError(
                 f"Cannot convert md with {self.document_hash} because the backend failed to init."

docling 2.17.0__tar.gz → 2.19.0__tar.gz

docling 2.17.0__tar.gz → 2.19.0__tar.gz