PyPI - docling - Versions diffs - 2.29.0__tar.gz → 2.31.0__tar.gz - Mend

docling 2.29.0__tar.gz → 2.31.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

{docling-2.29.0 → docling-2.31.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.29.0
+Version: 2.31.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/docling-project/docling
 License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
 Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: docling-core[chunking] (>=2.24.1,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
 Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
@@ -58,7 +58,7 @@ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
 Requires-Dist: tqdm (>=4.65.0,<5.0.0)
 Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
 Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
-Requires-Dist: typer (>=0.12.5,<0.13.0)
+Requires-Dist: typer (>=0.12.5,<0.16.0)
 Project-URL: Repository, https://github.com/docling-project/docling
 Description-Content-Type: text/markdown
@@ -86,6 +86,7 @@ Description-Content-Type: text/markdown
 [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
 [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
 [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
+[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
 [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
 Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.

{docling-2.29.0 → docling-2.31.0}/README.md RENAMED Viewed

@@ -22,6 +22,7 @@
 [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
 [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
 [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
+[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
 [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
 Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.

{docling-2.29.0 → docling-2.31.0}/docling/backend/asciidoc_backend.py RENAMED Viewed

@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 text_stream = self.path_or_stream.getvalue().decode("utf-8")
                 self.lines = text_stream.split("\n")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                     self.lines = f.readlines()
             self.valid = True
@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return doc
-    def _parse(self, doc: DoclingDocument):
+    def _parse(self, doc: DoclingDocument):  # noqa: C901
         """
         Main function that orchestrates the parsing by yielding components:
         title, section headers, text, lists, and tables.
         """
-        content = ""
         in_list = False
         in_table = False
@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
         indents: dict[int, Union[GroupItem, None]] = {}
-        for i in range(0, 10):
+        for i in range(10):
             parents[i] = None
             indents[i] = None
@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             # Lists
             elif self._is_list_item(line):
                 _log.debug(f"line: {line}")
                 item = self._parse_list_item(line)
                 _log.debug(f"parsed list-item: {item}")
@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     indents[level + 1] = item["indent"]
                 elif in_list and item["indent"] < indents[level]:
                     # print(item["indent"], " => ", indents[level])
                     while item["indent"] < indents[level]:
                         # print(item["indent"], " => ", indents[level])
@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             elif in_table and (
                 (not self._is_table_line(line)) or line.strip() == "|==="
             ):  # end of table
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(
@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             # Picture
             elif self._is_picture(line):
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(
@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 text_data = []
             elif len(line.strip()) > 0:  # allow multiline texts
                 item = self._parse_text(line)
                 text_data.append(item["text"])
@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
     def _get_current_level(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return k - 1
         return 0
     def _get_current_parent(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return parents[k - 1]
         return None
@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     "marker": marker,
                     "text": text.strip(),
                     "numbered": False,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                 }
             else:
                 return {
@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     "marker": marker,
                     "text": text.strip(),
                     "numbered": True,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                 }
         else:
             # Fallback if no match
@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return [cell.strip() for cell in line.split("|") if cell.strip()]
     def _populate_table_as_grid(self, table_data):
         num_rows = len(table_data)
         # Adjust the table data into a grid format

{docling-2.29.0 → docling-2.31.0}/docling/backend/csv_backend.py RENAMED Viewed

@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
         head = self.content.readline()
         dialect = csv.Sniffer().sniff(head, ",;\t|:")
         _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
-        if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
             raise RuntimeError(
                 f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
             )

{docling-2.29.0 → docling-2.31.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
         page_size = self.get_size()
         if not cropbox:

{docling-2.29.0 → docling-2.31.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
         page_size = self.get_size()
         if not cropbox:

{docling-2.29.0 → docling-2.31.0}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

@@ -1,14 +1,14 @@
 import logging
-import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
-from PIL import Image, ImageDraw
+from PIL import Image
 from pypdfium2 import PdfPage
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
         page_size = self.get_size()
         if not cropbox:

{docling-2.29.0 → docling-2.31.0}/docling/backend/docx/latex/latex_dict.py RENAMED Viewed

@@ -1,12 +1,8 @@
-# -*- coding: utf-8 -*-
 """
 Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
 On 23/01/2025
 """
-from __future__ import unicode_literals
 CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
 BLANK = ""
@@ -79,7 +75,6 @@ CHR_BO = {
 }
 T = {
-    "\u2192": "\\rightarrow ",
     # Greek letters
     "\U0001d6fc": "\\alpha ",
     "\U0001d6fd": "\\beta ",

{docling-2.29.0 → docling-2.31.0}/docling/backend/docx/latex/omml.py RENAMED Viewed

@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
         return default
-class Tag2Method(object):
+class Tag2Method:
     def call_method(self, elm, stag=None):
         getmethod = self.tag2meth.get
         if stag is None:
@@ -130,7 +129,6 @@ class Tag2Method(object):
 class Pr(Tag2Method):
     text = ""
     __val_tags = ("chr", "pos", "begChr", "endChr", "type")
@@ -159,7 +157,7 @@ class Pr(Tag2Method):
     def do_common(self, elm):
         stag = elm.tag.replace(OMML_NS, "")
         if stag in self.__val_tags:
-            t = elm.get("{0}val".format(OMML_NS))
+            t = elm.get(f"{OMML_NS}val")
             self.__innerdict[stag] = t
         return None
@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
         """
         the Pre-Sub-Superscript object -- Not support yet
         """
-        pass
     def do_sub(self, elm):
         text = self.process_children(elm)
@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
         t_dict = self.process_children_dict(elm, include=("e", "lim"))
         latex_s = LIM_FUNC.get(t_dict["e"])
         if not latex_s:
-            raise NotSupport("Not support lim %s" % t_dict["e"])
+            raise RuntimeError("Not support lim {}".format(t_dict["e"]))
         else:
             return latex_s.format(lim=t_dict.get("lim"))
@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
         """
         _str = []
         _base_str = []
-        found_text = elm.findtext("./{0}t".format(OMML_NS))
+        found_text = elm.findtext(f"./{OMML_NS}t")
         if found_text:
             for s in found_text:
                 out_latex_str = self.process_unicode(s)

{docling-2.29.0 → docling-2.31.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -26,6 +26,8 @@ _log = logging.getLogger(__name__)
 # tags that generate NodeItem elements
 TAGS_FOR_NODE_ITEMS: Final = [
+    "address",
+    "details",
     "h1",
     "h2",
     "h3",
@@ -38,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
     "ul",
     "ol",
     "li",
+    "summary",
     "table",
     "figure",
     "img",
@@ -55,7 +58,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.max_levels = 10
         self.level = 0
         self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
-        for i in range(0, self.max_levels):
+        for i in range(self.max_levels):
             self.parents[i] = None
         try:
@@ -126,7 +129,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return doc
     def walk(self, tag: Tag, doc: DoclingDocument) -> None:
         # Iterate over elements in the body of the document
         text: str = ""
         for element in tag.children:
@@ -135,7 +137,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     self.analyze_tag(cast(Tag, element), doc)
                 except Exception as exc_child:
                     _log.error(
-                        f"Error processing child from tag {tag.name}: {repr(exc_child)}"
+                        f"Error processing child from tag {tag.name}: {exc_child!r}"
                     )
                     raise exc_child
             elif isinstance(element, NavigableString) and not isinstance(
@@ -147,7 +149,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     item for item in element.next_siblings if isinstance(item, Tag)
                 ]
                 if element.next_sibling is None or any(
-                    [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                    item.name in TAGS_FOR_NODE_ITEMS for item in siblings
                 ):
                     text = text.strip()
                     if text and tag.name in ["div"]:
@@ -164,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
         if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
             self.handle_header(tag, doc)
-        elif tag.name in ["p"]:
+        elif tag.name in ["p", "address", "summary"]:
             self.handle_paragraph(tag, doc)
         elif tag.name in ["pre", "code"]:
             self.handle_code(tag, doc)
@@ -178,6 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.handle_figure(tag, doc)
         elif tag.name == "img":
             self.handle_image(tag, doc)
+        elif tag.name == "details":
+            self.handle_details(tag, doc)
         else:
             self.walk(tag, doc)
@@ -202,6 +206,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return ["".join(result) + " "]
+    def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
+        """Handle details tag (details) and its content."""
+        self.parents[self.level + 1] = doc.add_group(
+            name="details",
+            label=GroupLabel.SECTION,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+        self.level += 1
+        self.walk(element, doc)
+        self.parents[self.level + 1] = None
+        self.level -= 1
     def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles header tags (h1, h2, etc.)."""
         hlevel = int(element.name.replace("h", ""))
@@ -222,7 +241,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             )
         else:
             if hlevel > self.level:
                 # add invisible group
                 for i in range(self.level + 1, hlevel):
                     self.parents[i] = doc.add_group(
@@ -234,7 +252,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 self.level = hlevel
             elif hlevel < self.level:
                 # remove the tail
                 for key in self.parents.keys():
                     if key > hlevel:
@@ -261,7 +278,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             )
     def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles paragraph tags (p)."""
+        """Handles paragraph tags (p) or equivalent ones."""
         if element.text is None:
             return
         text = element.text.strip()
@@ -360,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             marker = ""
             enumerated = False
             if parent_label == GroupLabel.ORDERED_LIST:
-                marker = f"{str(index_in_list)}."
+                marker = f"{index_in_list!s}."
                 enumerated = True
             doc.add_list_item(
                 text=text,

{docling-2.29.0 → docling-2.31.0}/docling/backend/md_backend.py RENAMED Viewed

@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # otherwise they represent emphasis (bold or italic)
                 self.markdown = self._shorten_underscore_sequences(text_stream)
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                     md_content = f.read()
                     # remove invalid sequences
                     # very long sequences of underscores will lead to unnecessary long processing times.
@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
         self.inline_texts = []
-    def _iterate_elements(
+    def _iterate_elements(  # noqa: C901
         self,
         element: marko.element.Element,
         depth: int,
@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         visited: Set[marko.element.Element],
         parent_item: Optional[NodeItem] = None,
     ):
         if element in visited:
             return
@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
                 parent_item = doc.add_group(
-                    label=label, name=f"list", parent=parent_item
+                    label=label, name="list", parent=parent_item
                 )
         elif (
@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self._html_blocks += 1
             self._process_inline_text(parent_item, doc)
             self._close_table(doc)
-            _log.debug("HTML Block: {}".format(element))
+            _log.debug(f"HTML Block: {element}")
             if (
                 len(element.body) > 0
             ):  # If Marko doesn't return any content for HTML block, skip it
@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         else:
             if not isinstance(element, str):
                 self._close_table(doc)
-                _log.debug("Some other element: {}".format(element))
+                _log.debug(f"Some other element: {element}")
         processed_block_types = (
             marko.block.Heading,
@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             # if HTML blocks were detected, export to HTML and delegate to HTML backend
             if self._html_blocks > 0:
                 # export to HTML
                 html_backend_cls = HTMLDocumentBackend
                 html_str = doc.export_to_html()

docling 2.29.0__tar.gz → 2.31.0__tar.gz

docling 2.29.0__tar.gz → 2.31.0__tar.gz