PyPI - docling - Versions diffs - 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl - Mend

docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

docling/backend/abstract_backend.py +0 -1
docling/backend/asciidoc_backend.py +0 -1
docling/backend/docling_parse_backend.py +1 -1
docling/backend/docling_parse_v2_backend.py +1 -1
docling/backend/html_backend.py +4 -3
docling/backend/json/__init__.py +0 -0
docling/backend/json/docling_json_backend.py +58 -0
docling/backend/md_backend.py +49 -36
docling/backend/msexcel_backend.py +50 -38
docling/backend/msword_backend.py +0 -1
docling/backend/pdf_backend.py +0 -2
docling/backend/pypdfium2_backend.py +1 -1
docling/backend/xml/uspto_backend.py +25 -25
docling/cli/main.py +18 -3
docling/datamodel/base_models.py +30 -3
docling/datamodel/document.py +4 -0
docling/datamodel/pipeline_options.py +7 -9
docling/document_converter.py +4 -0
docling/models/base_model.py +62 -6
docling/models/code_formula_model.py +245 -0
docling/models/document_picture_classifier.py +187 -0
docling/models/layout_model.py +10 -86
docling/models/page_assemble_model.py +1 -33
docling/models/rapid_ocr_model.py +1 -0
docling/models/tesseract_ocr_cli_model.py +72 -5
docling/models/tesseract_ocr_model.py +68 -20
docling/pipeline/base_pipeline.py +40 -17
docling/pipeline/standard_pdf_pipeline.py +31 -2
docling/utils/glm_utils.py +4 -1
docling/utils/ocr_utils.py +9 -0
docling/utils/visualization.py +80 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/METADATA +17 -13
docling-2.17.0.dist-info/RECORD +62 -0
docling-2.15.1.dist-info/RECORD +0 -56
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/LICENSE +0 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/WHEEL +0 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/entry_points.txt +0 -0

docling/backend/abstract_backend.py CHANGED Viewed

@@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC):
     def supports_pagination(cls) -> bool:
         pass
-    @abstractmethod
     def unload(self):
         if isinstance(self.path_or_stream, BytesIO):
             self.path_or_stream.close()

docling/backend/asciidoc_backend.py CHANGED Viewed

@@ -24,7 +24,6 @@ _log = logging.getLogger(__name__)
 class AsciiDocBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)

docling/backend/docling_parse_backend.py CHANGED Viewed

@@ -163,7 +163,7 @@ class DoclingParsePageBackend(PdfPageBackend):
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t

docling/backend/docling_parse_v2_backend.py CHANGED Viewed

@@ -178,7 +178,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t

docling/backend/html_backend.py CHANGED Viewed

@@ -78,10 +78,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if self.is_valid():
             assert self.soup is not None
+            content = self.soup.body or self.soup
             # Replace <br> tags with newline characters
-            for br in self.soup.body.find_all("br"):
+            for br in content.find_all("br"):
                 br.replace_with("\n")
-            doc = self.walk(self.soup.body, doc)
+            doc = self.walk(content, doc)
         else:
             raise RuntimeError(
                 f"Cannot convert doc with {self.document_hash} because the backend failed to init."
@@ -215,7 +216,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         label = DocItemLabel.CODE
         if len(text) == 0:
             return
-        doc.add_text(parent=self.parents[self.level], label=label, text=text)
+        doc.add_code(parent=self.parents[self.level], label=label, text=text)
     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""

docling/backend/json/__init__.py ADDED Viewed

File without changes

docling/backend/json/docling_json_backend.py ADDED Viewed

@@ -0,0 +1,58 @@
+from io import BytesIO
+from pathlib import Path
+from typing import Union
+from docling_core.types.doc import DoclingDocument
+from typing_extensions import override
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+class DoclingJSONBackend(DeclarativeDocumentBackend):
+    @override
+    def __init__(
+        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
+    ) -> None:
+        super().__init__(in_doc, path_or_stream)
+        # given we need to store any actual conversion exception for raising it from
+        # convert(), this captures the successful result or the actual error in a
+        # mutually exclusive way:
+        self._doc_or_err = self._get_doc_or_err()
+    @override
+    def is_valid(self) -> bool:
+        return isinstance(self._doc_or_err, DoclingDocument)
+    @classmethod
+    @override
+    def supports_pagination(cls) -> bool:
+        return False
+    @classmethod
+    @override
+    def supported_formats(cls) -> set[InputFormat]:
+        return {InputFormat.JSON_DOCLING}
+    def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
+        try:
+            json_data: Union[str, bytes]
+            if isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, encoding="utf-8") as f:
+                    json_data = f.read()
+            elif isinstance(self.path_or_stream, BytesIO):
+                json_data = self.path_or_stream.getvalue()
+            else:
+                raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
+            return DoclingDocument.model_validate_json(json_data=json_data)
+        except Exception as e:
+            return e
+    @override
+    def convert(self) -> DoclingDocument:
+        if isinstance(self._doc_or_err, DoclingDocument):
+            return self._doc_or_err
+        else:
+            raise self._doc_or_err

docling/backend/md_backend.py CHANGED Viewed

@@ -3,19 +3,22 @@ import re
 import warnings
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import List, Optional, Set, Union
 import marko
 import marko.ext
 import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
+    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
+    NodeItem,
     TableCell,
     TableData,
+    TextItem,
 )
 from marko import Markdown
@@ -27,8 +30,7 @@ _log = logging.getLogger(__name__)
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-    def shorten_underscore_sequences(self, markdown_text, max_length=10):
+    def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
         pattern = r"_+"
@@ -63,7 +65,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         self.in_table = False
         self.md_table_buffer: list[str] = []
-        self.inline_text_buffer = ""
+        self.inline_texts: list[str] = []
         try:
             if isinstance(self.path_or_stream, BytesIO):
@@ -90,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             ) from e
         return
-    def close_table(self, doc=None):
+    def close_table(self, doc: DoclingDocument):
         if self.in_table:
             _log.debug("=== TABLE START ===")
             for md_table_row in self.md_table_buffer:
                 _log.debug(md_table_row)
             _log.debug("=== TABLE END ===")
-            tcells = []
+            tcells: List[TableCell] = []
             result_table = []
             for n, md_table_row in enumerate(self.md_table_buffer):
                 data = []
@@ -137,33 +139,42 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self.in_table = False
             self.md_table_buffer = []  # clean table markdown buffer
             # Initialize Docling TableData
-            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
+            table_data = TableData(
+                num_rows=num_rows, num_cols=num_cols, table_cells=tcells
+            )
             # Populate
             for tcell in tcells:
-                data.table_cells.append(tcell)
+                table_data.table_cells.append(tcell)
             if len(tcells) > 0:
-                doc.add_table(data=data)
+                doc.add_table(data=table_data)
         return
-    def process_inline_text(self, parent_element, doc=None):
-        # self.inline_text_buffer += str(text_in)
-        txt = self.inline_text_buffer.strip()
+    def process_inline_text(
+        self, parent_element: Optional[NodeItem], doc: DoclingDocument
+    ):
+        txt = " ".join(self.inline_texts)
         if len(txt) > 0:
             doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 parent=parent_element,
                 text=txt,
             )
-        self.inline_text_buffer = ""
-    def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
+        self.inline_texts = []
+    def iterate_elements(
+        self,
+        element: marko.block.Element,
+        depth: int,
+        doc: DoclingDocument,
+        parent_element: Optional[NodeItem] = None,
+    ):
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(
-                f" - Heading level {element.level}, content: {element.children[0].children}"
+                f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
             if element.level == 1:
                 doc_label = DocItemLabel.TITLE
@@ -172,10 +183,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             # Header could have arbitrary inclusion of bold, italic or emphasis,
             # hence we need to traverse the tree to get full text of a header
-            strings = []
+            strings: List[str] = []
             # Define a recursive function to traverse the tree
-            def traverse(node):
+            def traverse(node: marko.block.BlockElement):
                 # Check if the node has a "children" attribute
                 if hasattr(node, "children"):
                     # If "children" is a list, continue traversal
@@ -209,9 +220,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self.process_inline_text(parent_element, doc)
             _log.debug(" - List item")
-            snippet_text = str(element.children[0].children[0].children)
+            snippet_text = str(element.children[0].children[0].children)  # type: ignore
             is_numbered = False
-            if parent_element.label == GroupLabel.ORDERED_LIST:
+            if (
+                parent_element is not None
+                and isinstance(parent_element, DocItem)
+                and parent_element.label == GroupLabel.ORDERED_LIST
+            ):
                 is_numbered = True
             doc.add_list_item(
                 enumerated=is_numbered, parent=parent_element, text=snippet_text
@@ -221,7 +236,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
-            doc.add_picture(parent=parent_element, caption=element.title)
+            fig_caption: Optional[TextItem] = None
+            if element.title is not None and element.title != "":
+                fig_caption = doc.add_text(
+                    label=DocItemLabel.CAPTION, text=element.title
+                )
+            doc.add_picture(parent=parent_element, caption=fig_caption)
         elif isinstance(element, marko.block.Paragraph):
             self.process_inline_text(parent_element, doc)
@@ -243,39 +265,30 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 self.close_table(doc)
                 self.in_table = False
                 # most likely just inline text
-                self.inline_text_buffer += str(
-                    element.children
-                )  # do not strip an inline text, as it may contain important spaces
+                self.inline_texts.append(str(element.children))
         elif isinstance(element, marko.inline.CodeSpan):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            doc.add_code(parent=parent_element, text=snippet_text)
         elif isinstance(element, marko.block.CodeBlock):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)
         elif isinstance(element, marko.block.FencedCode):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)
         elif isinstance(element, marko.inline.LineBreak):
-            self.process_inline_text(parent_element, doc)
             if self.in_table:
                 _log.debug("Line break in a table")
                 self.md_table_buffer.append("")

docling/backend/msexcel_backend.py CHANGED Viewed

@@ -26,6 +26,7 @@ _log = logging.getLogger(__name__)
 from typing import Any, List
+from PIL import Image as PILImage
 from pydantic import BaseModel
@@ -44,7 +45,6 @@ class ExcelTable(BaseModel):
 class MsExcelDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
@@ -326,49 +326,61 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
         self, doc: DoclingDocument, sheet: Worksheet
     ) -> DoclingDocument:
-        # FIXME: mypy does not agree with _images ...
-        """
-        # Iterate over images in the sheet
-        for idx, image in enumerate(sheet._images):  # Access embedded images
+        # Iterate over byte images in the sheet
+        for idx, image in enumerate(sheet._images):  # type: ignore
-            image_bytes = BytesIO(image.ref.blob)
-            pil_image = Image.open(image_bytes)
+            try:
+                pil_image = PILImage.open(image.ref)
-            doc.add_picture(
-                parent=self.parents[0],
-                image=ImageRef.from_pil(image=pil_image, dpi=72),
-                caption=None,
-            )
-        """
+                doc.add_picture(
+                    parent=self.parents[0],
+                    image=ImageRef.from_pil(image=pil_image, dpi=72),
+                    caption=None,
+                )
+            except:
+                _log.error("could not extract the image from excel sheets")
-        # FIXME: mypy does not agree with _charts ...
         """
-        for idx, chart in enumerate(sheet._charts):  # Access embedded charts
-            chart_path = f"chart_{idx + 1}.png"
-            _log.info(
-                f"Chart found, but dynamic rendering is required for: {chart_path}"
-            )
+        for idx, chart in enumerate(sheet._charts):  # type: ignore
+            try:
+                chart_path = f"chart_{idx + 1}.png"
+                _log.info(
+                    f"Chart found, but dynamic rendering is required for: {chart_path}"
+                )
-            _log.info(f"Chart {idx + 1}:")
-            # Chart type
-            _log.info(f"Type: {type(chart).__name__}")
-            # Title
-            if chart.title:
-                _log.info(f"Title: {chart.title}")
-            else:
-                _log.info("No title")
-            # Data series
-            for series in chart.series:
-                _log.info(" => series ...")
-                _log.info(f"Data Series: {series.title}")
-                _log.info(f"Values: {series.values}")
-                _log.info(f"Categories: {series.categories}")
+                _log.info(f"Chart {idx + 1}:")
-            # Position
-            # _log.info(f"Anchor Cell: {chart.anchor}")
+                # Chart type
+                # _log.info(f"Type: {type(chart).__name__}")
+                print(f"Type: {type(chart).__name__}")
+                # Extract series data
+                for series_idx, series in enumerate(chart.series):
+                    #_log.info(f"Series {series_idx + 1}:")
+                    print(f"Series {series_idx + 1} type: {type(series).__name__}")
+                    #print(f"x-values: {series.xVal}")
+                    #print(f"y-values: {series.yVal}")
+                    print(f"xval type: {type(series.xVal).__name__}")
+                    xvals = []
+                    for _ in series.xVal.numLit.pt:
+                        print(f"xval type: {type(_).__name__}")
+                        if hasattr(_, 'v'):
+                            xvals.append(_.v)
+                    print(f"x-values: {xvals}")
+                    yvals = []
+                    for _ in series.yVal:
+                        if hasattr(_, 'v'):
+                            yvals.append(_.v)
+                    print(f"y-values: {yvals}")
+            except Exception as exc:
+                print(exc)
+                continue
         """
         return doc

docling/backend/msword_backend.py CHANGED Viewed

@@ -26,7 +26,6 @@ _log = logging.getLogger(__name__)
 class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
         self.XML_KEY = (

docling/backend/pdf_backend.py CHANGED Viewed

@@ -12,7 +12,6 @@ from docling.datamodel.document import InputDocument
 class PdfPageBackend(ABC):
     @abstractmethod
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         pass
@@ -45,7 +44,6 @@ class PdfPageBackend(ABC):
 class PdfDocumentBackend(PaginatedDocumentBackend):
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -210,7 +210,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t

docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl

docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl