PyPI - docling - Versions diffs - 2.2.0__py3-none-any.whl → 2.2.1__py3-none-any.whl - Mend

docling 2.2.0__py3-none-any.whl → 2.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

docling/backend/html_backend.py CHANGED Viewed

@@ -136,7 +136,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def get_direct_text(self, item):
         """Get the direct text of the <li> element (ignoring nested lists)."""
         text = item.find(string=True, recursive=False)
         if isinstance(text, str):
             return text.strip()
@@ -149,21 +148,20 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if isinstance(item, str):
             return [item]
-        result.append(self.get_direct_text(item))
-        try:
-            # Iterate over the children (and their text and tails)
-            for child in item:
-                try:
-                    # Recursively get the child's text content
-                    result.extend(self.extract_text_recursively(child))
-                except:
-                    pass
-        except:
-            _log.warn("item has no children")
-            pass
-        return " ".join(result)
+        if item.name not in ["ul", "ol"]:
+            try:
+                # Iterate over the children (and their text and tails)
+                for child in item:
+                    try:
+                        # Recursively get the child's text content
+                        result.extend(self.extract_text_recursively(child))
+                    except:
+                        pass
+            except:
+                _log.warn("item has no children")
+                pass
+        return "".join(result) + " "
     def handle_header(self, element, idx, doc):
         """Handles header tags (h1, h2, etc.)."""
@@ -182,11 +180,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 parent=self.parents[0], label=DocItemLabel.TITLE, text=text
             )
-        elif hlevel == self.level:
-            self.parents[hlevel] = doc.add_text(
-                parent=self.parents[hlevel - 1], label=label, text=text
-            )
         elif hlevel > self.level:
             # add invisible group
@@ -196,10 +189,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     label=GroupLabel.SECTION,
                     parent=self.parents[i - 1],
                 )
-            self.parents[hlevel] = doc.add_text(
-                parent=self.parents[hlevel - 1], label=label, text=text
-            )
             self.level = hlevel
         elif hlevel < self.level:
@@ -208,12 +197,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             for key, val in self.parents.items():
                 if key > hlevel:
                     self.parents[key] = None
-            self.parents[hlevel] = doc.add_text(
-                parent=self.parents[hlevel - 1], label=label, text=text
-            )
             self.level = hlevel
+        self.parents[hlevel] = doc.add_heading(
+            parent=self.parents[hlevel - 1],
+            text=text,
+            level=hlevel,
+        )
     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""
         if element.text is None:
@@ -255,7 +246,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if nested_lists:
             name = element.name
-            text = self.get_direct_text(element)
+            # Text in list item can be hidden within hierarchy, hence
+            # we need to extract it recursively
+            text = self.extract_text_recursively(element)
+            # Flatten text, remove break lines:
+            text = text.replace("\n", "").replace("\r", "")
+            text = " ".join(text.split()).strip()
             marker = ""
             enumerated = False
@@ -263,14 +259,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 marker = str(index_in_list)
                 enumerated = True
-            # create a list-item
-            self.parents[self.level + 1] = doc.add_list_item(
-                text=text,
-                enumerated=enumerated,
-                marker=marker,
-                parent=self.parents[self.level],
-            )
-            self.level += 1
+            if len(text) > 0:
+                # create a list-item
+                self.parents[self.level + 1] = doc.add_list_item(
+                    text=text,
+                    enumerated=enumerated,
+                    marker=marker,
+                    parent=self.parents[self.level],
+                )
+                self.level += 1
             self.walk(element, doc)

docling/backend/md_backend.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import logging
+import re
+import warnings
 from io import BytesIO
 from pathlib import Path
 from typing import Set, Union
@@ -25,6 +27,30 @@ _log = logging.getLogger(__name__)
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
+    def shorten_underscore_sequences(self, markdown_text, max_length=10):
+        # This regex will match any sequence of underscores
+        pattern = r"_+"
+        def replace_match(match):
+            underscore_sequence = match.group(
+                0
+            )  # Get the full match (sequence of underscores)
+            # Shorten the sequence if it exceeds max_length
+            if len(underscore_sequence) > max_length:
+                return "_" * max_length
+            else:
+                return underscore_sequence  # Leave it unchanged if it is shorter or equal to max_length
+        # Use re.sub to replace long underscore sequences
+        shortened_text = re.sub(pattern, replace_match, markdown_text)
+        if len(shortened_text) != len(markdown_text):
+            warnings.warn("Detected potentially incorrect Markdown, correcting...")
+        return shortened_text
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
@@ -42,11 +68,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         try:
             if isinstance(self.path_or_stream, BytesIO):
                 text_stream = self.path_or_stream.getvalue().decode("utf-8")
-                self.markdown = text_stream
+                # remove invalid sequences
+                # very long sequences of underscores will lead to unnecessary long processing times.
+                # In any proper Markdown files, underscores have to be escaped,
+                # otherwise they represent emphasis (bold or italic)
+                self.markdown = self.shorten_underscore_sequences(text_stream)
             if isinstance(self.path_or_stream, Path):
                 with open(self.path_or_stream, "r", encoding="utf-8") as f:
                     md_content = f.read()
-                    self.markdown = md_content
+                    # remove invalid sequences
+                    # very long sequences of underscores will lead to unnecessary long processing times.
+                    # In any proper Markdown files, underscores have to be escaped,
+                    # otherwise they represent emphasis (bold or italic)
+                    self.markdown = self.shorten_underscore_sequences(md_content)
             self.valid = True
             _log.debug(self.markdown)
@@ -135,11 +169,29 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 doc_label = DocItemLabel.TITLE
             else:
                 doc_label = DocItemLabel.SECTION_HEADER
-            snippet_text = element.children[0].children.strip()
-            parent_element = doc.add_text(
-                label=doc_label, parent=parent_element, text=snippet_text
-            )
+            # Header could have arbitrary inclusion of bold, italic or emphasis,
+            # hence we need to traverse the tree to get full text of a header
+            strings = []
+            # Define a recursive function to traverse the tree
+            def traverse(node):
+                # Check if the node has a "children" attribute
+                if hasattr(node, "children"):
+                    # If "children" is a list, continue traversal
+                    if isinstance(node.children, list):
+                        for child in node.children:
+                            traverse(child)
+                    # If "children" is text, add it to header text
+                    elif isinstance(node.children, str):
+                        strings.append(node.children)
+            traverse(element)
+            snippet_text = "".join(strings)
+            if len(snippet_text) > 0:
+                parent_element = doc.add_text(
+                    label=doc_label, parent=parent_element, text=snippet_text
+                )
         elif isinstance(element, marko.block.List):
             self.close_table(doc)
@@ -286,6 +338,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             parsed_ast = marko_parser.parse(self.markdown)
             # Start iterating from the root of the AST
             self.iterate_elements(parsed_ast, 0, doc, None)
+            self.process_inline_text(None, doc)  # handle last hanging inline text
         else:
             raise RuntimeError(
                 f"Cannot convert md with {self.document_hash} because the backend failed to init."

docling/backend/msword_backend.py CHANGED Viewed

@@ -294,13 +294,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         level = self.get_level()
         if isinstance(curr_level, int):
-            if curr_level == level:
-                self.parents[level] = doc.add_heading(
-                    parent=self.parents[level - 1], text=text
-                )
-            elif curr_level > level:
+            if curr_level > level:
                 # add invisible group
                 for i in range(level, curr_level):
@@ -310,10 +304,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                         name=f"header-{i}",
                     )
-                self.parents[curr_level] = doc.add_heading(
-                    parent=self.parents[curr_level - 1], text=text
-                )
             elif curr_level < level:
                 # remove the tail
@@ -321,13 +311,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     if key >= curr_level:
                         self.parents[key] = None
-                self.parents[curr_level] = doc.add_heading(
-                    parent=self.parents[curr_level - 1], text=text
-                )
+            self.parents[curr_level] = doc.add_heading(
+                parent=self.parents[curr_level - 1],
+                text=text,
+                level=curr_level,
+            )
         else:
             self.parents[self.level] = doc.add_heading(
-                parent=self.parents[self.level - 1], text=text
+                parent=self.parents[self.level - 1],
+                text=text,
+                level=1,
             )
         return

{docling-2.2.0.dist-info → docling-2.2.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.2.0
+Version: 2.2.1
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -23,7 +23,7 @@ Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
-Requires-Dist: docling-core (>=2.1.0,<3.0.0)
+Requires-Dist: docling-core (>=2.2.1,<3.0.0)
 Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
 Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)

{docling-2.2.0.dist-info → docling-2.2.1.dist-info}/RECORD RENAMED Viewed

@@ -4,10 +4,10 @@ docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq
 docling/backend/asciidoc_backend.py,sha256=WW0eIanPIObcg5ci9YcnqFxwipmqRFsRY8zjZDdKvJA,14116
 docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
 docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
-docling/backend/html_backend.py,sha256=wfh5PWEwoqsCXxFCQbFBdJvEtlqZhXgqfPfTYETWHfE,14974
-docling/backend/md_backend.py,sha256=osYiNLnep9UgLq8mUH9bmwG3kP9RXxt69I8LlyeJN6g,11505
+docling/backend/html_backend.py,sha256=TUY5EVv3bo28A_w5CvBgNW4ZqL1d-VxOQPh1_taPHgU,15070
+docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
 docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
-docling/backend/msword_backend.py,sha256=6bY0ebOaeSbpskUJY5t5pOf4a2VclWzeHeSo-vzsaO0,17470
+docling/backend/msword_backend.py,sha256=FAUdP74QxGKo2xMZQ4WQGYwtpIBCTJ_FG17PBpRwhxI,17230
 docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
 docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -37,8 +37,8 @@ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
 docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-2.2.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.2.0.dist-info/METADATA,sha256=TkaywA2l2ImdMc9WpUYWUQy3n50zG9Y9eC7ziElBlU0,6205
-docling-2.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-2.2.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.2.0.dist-info/RECORD,,
+docling-2.2.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.2.1.dist-info/METADATA,sha256=BOYg-5kaA2Fjxc2bwaJOuAd9LmrQerOzQLHCyaiQ1aE,6205
+docling-2.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.2.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.2.1.dist-info/RECORD,,

{docling-2.2.0.dist-info → docling-2.2.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-2.2.0.dist-info → docling-2.2.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.2.0.dist-info → docling-2.2.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

docling 2.2.0__py3-none-any.whl → 2.2.1__py3-none-any.whl

docling 2.2.0__py3-none-any.whl → 2.2.1__py3-none-any.whl