PyPI - docling - Versions diffs - 2.17.0__py3-none-any.whl → 2.19.0__py3-none-any.whl - Mend

docling 2.17.0py3-none-any.whl → 2.19.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

docling/backend/html_backend.py +18 -18
docling/backend/md_backend.py +144 -75
docling/backend/mspowerpoint_backend.py +39 -27
docling/backend/msword_backend.py +173 -131
docling/cli/main.py +8 -0
docling/cli/models.py +105 -0
docling/cli/tools.py +17 -0
docling/datamodel/document.py +2 -0
docling/datamodel/settings.py +18 -1
docling/document_converter.py +12 -2
docling/models/base_model.py +3 -0
docling/models/code_formula_model.py +15 -9
docling/models/document_picture_classifier.py +11 -8
docling/models/easyocr_model.py +50 -3
docling/models/layout_model.py +49 -3
docling/models/table_structure_model.py +53 -7
docling/pipeline/base_pipeline.py +4 -2
docling/pipeline/standard_pdf_pipeline.py +25 -24
docling/utils/glm_utils.py +4 -0
docling/utils/model_downloader.py +72 -0
docling/utils/utils.py +24 -0
{docling-2.17.0.dist-info → docling-2.19.0.dist-info}/METADATA +11 -5
{docling-2.17.0.dist-info → docling-2.19.0.dist-info}/RECORD +26 -23
{docling-2.17.0.dist-info → docling-2.19.0.dist-info}/WHEEL +1 -1
{docling-2.17.0.dist-info → docling-2.19.0.dist-info}/entry_points.txt +1 -0
{docling-2.17.0.dist-info → docling-2.19.0.dist-info}/LICENSE +0 -0

docling/backend/msword_backend.py CHANGED Viewed

@@ -2,21 +2,28 @@ import logging
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import Any, Optional, Union
-import docx
 from docling_core.types.doc import (
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
     ImageRef,
+    NodeItem,
     TableCell,
     TableData,
 )
+from docx import Document
+from docx.document import Document as DocxDocument
+from docx.oxml.table import CT_Tc
+from docx.oxml.xmlchemy import BaseOxmlElement
+from docx.table import Table, _Cell
+from docx.text.paragraph import Paragraph
 from lxml import etree
 from lxml.etree import XPath
 from PIL import Image, UnidentifiedImageError
+from typing_extensions import override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -26,7 +33,10 @@ _log = logging.getLogger(__name__)
 class MsWordDocumentBackend(DeclarativeDocumentBackend):
-    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+    @override
+    def __init__(
+        self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
+    ) -> None:
         super().__init__(in_doc, path_or_stream)
         self.XML_KEY = (
             "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
@@ -36,19 +46,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         }
         # self.initialise(path_or_stream)
         # Word file:
-        self.path_or_stream = path_or_stream
-        self.valid = False
+        self.path_or_stream: Union[BytesIO, Path] = path_or_stream
+        self.valid: bool = False
         # Initialise the parents for the hierarchy
-        self.max_levels = 10
-        self.level_at_new_list = None
-        self.parents = {}  # type: ignore
+        self.max_levels: int = 10
+        self.level_at_new_list: Optional[int] = None
+        self.parents: dict[int, Optional[NodeItem]] = {}
         for i in range(-1, self.max_levels):
             self.parents[i] = None
         self.level = 0
         self.listIter = 0
-        self.history = {
+        self.history: dict[str, Any] = {
             "names": [None],
             "levels": [None],
             "numids": [None],
@@ -58,9 +68,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.docx_obj = None
         try:
             if isinstance(self.path_or_stream, BytesIO):
-                self.docx_obj = docx.Document(self.path_or_stream)
+                self.docx_obj = Document(self.path_or_stream)
             elif isinstance(self.path_or_stream, Path):
-                self.docx_obj = docx.Document(str(self.path_or_stream))
+                self.docx_obj = Document(str(self.path_or_stream))
             self.valid = True
         except Exception as e:
@@ -68,13 +78,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
             ) from e
+    @override
     def is_valid(self) -> bool:
         return self.valid
     @classmethod
+    @override
     def supports_pagination(cls) -> bool:
         return False
+    @override
     def unload(self):
         if isinstance(self.path_or_stream, BytesIO):
             self.path_or_stream.close()
@@ -82,11 +95,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.path_or_stream = None
     @classmethod
-    def supported_formats(cls) -> Set[InputFormat]:
+    @override
+    def supported_formats(cls) -> set[InputFormat]:
         return {InputFormat.DOCX}
+    @override
     def convert(self) -> DoclingDocument:
-        # Parses the DOCX into a structured document model.
+        """Parses the DOCX into a structured document model.
+        Returns:
+            The parsed document.
+        """
         origin = DocumentOrigin(
             filename=self.file.name or "file",
@@ -104,23 +123,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 f"Cannot convert doc with {self.document_hash} because the backend failed to init."
             )
-    def update_history(self, name, level, numid, ilevel):
+    def update_history(
+        self,
+        name: str,
+        level: Optional[int],
+        numid: Optional[int],
+        ilevel: Optional[int],
+    ):
         self.history["names"].append(name)
         self.history["levels"].append(level)
         self.history["numids"].append(numid)
         self.history["indents"].append(ilevel)
-    def prev_name(self):
+    def prev_name(self) -> Optional[str]:
         return self.history["names"][-1]
-    def prev_level(self):
+    def prev_level(self) -> Optional[int]:
         return self.history["levels"][-1]
-    def prev_numid(self):
+    def prev_numid(self) -> Optional[int]:
         return self.history["numids"][-1]
-    def prev_indent(self):
+    def prev_indent(self) -> Optional[int]:
         return self.history["indents"][-1]
     def get_level(self) -> int:
@@ -130,13 +155,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 return k
         return 0
-    def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
+    def walk_linear(
+        self,
+        body: BaseOxmlElement,
+        docx_obj: DocxDocument,
+        doc: DoclingDocument,
+    ) -> DoclingDocument:
         for element in body:
             tag_name = etree.QName(element).localname
             # Check for Inline Images (blip elements)
             namespaces = {
                 "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
                 "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+                "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
             }
             xpath_expr = XPath(".//a:blip", namespaces=namespaces)
             drawing_blip = xpath_expr(element)
@@ -149,7 +180,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     _log.debug("could not parse a table, broken docx table")
             elif drawing_blip:
-                self.handle_pictures(element, docx_obj, drawing_blip, doc)
+                self.handle_pictures(docx_obj, drawing_blip, doc)
+            # Check for the sdt containers, like table of contents
+            elif tag_name in ["sdt"]:
+                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
+                if sdt_content is not None:
+                    # Iterate paragraphs, runs, or text inside <w:sdtContent>.
+                    paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
+                    for p in paragraphs:
+                        self.handle_text_elements(p, docx_obj, doc)
             # Check for Text
             elif tag_name in ["p"]:
                 # "tcPr", "sectPr"
@@ -158,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
         return doc
-    def str_to_int(self, s, default=0):
+    def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
         if s is None:
             return None
         try:
@@ -166,7 +205,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         except ValueError:
             return default
-    def split_text_and_number(self, input_string):
+    def split_text_and_number(self, input_string: str) -> list[str]:
         match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
         if match:
             parts = list(filter(None, match.groups()))
@@ -174,7 +213,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         else:
             return [input_string]
-    def get_numId_and_ilvl(self, paragraph):
+    def get_numId_and_ilvl(
+        self, paragraph: Paragraph
+    ) -> tuple[Optional[int], Optional[int]]:
         # Access the XML element of the paragraph
         numPr = paragraph._element.find(
             ".//w:numPr", namespaces=paragraph._element.nsmap
@@ -187,13 +228,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
             ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
-            return self.str_to_int(numId, default=None), self.str_to_int(
-                ilvl, default=None
-            )
+            return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
         return None, None  # If the paragraph is not part of a list
-    def get_label_and_level(self, paragraph):
+    def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
         if paragraph.style is None:
             return "Normal", None
         label = paragraph.style.style_id
@@ -203,26 +242,31 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             parts = label.split(":")
             if len(parts) == 2:
-                return parts[0], int(parts[1])
+                return parts[0], self.str_to_int(parts[1], None)
         parts = self.split_text_and_number(label)
         if "Heading" in label and len(parts) == 2:
             parts.sort()
-            label_str = ""
-            label_level = 0
+            label_str: str = ""
+            label_level: Optional[int] = 0
             if parts[0] == "Heading":
                 label_str = parts[0]
-                label_level = self.str_to_int(parts[1], default=None)
+                label_level = self.str_to_int(parts[1], None)
             if parts[1] == "Heading":
                 label_str = parts[1]
-                label_level = self.str_to_int(parts[0], default=None)
+                label_level = self.str_to_int(parts[0], None)
             return label_str, label_level
         else:
             return label, None
-    def handle_text_elements(self, element, docx_obj, doc):
-        paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
+    def handle_text_elements(
+        self,
+        element: BaseOxmlElement,
+        docx_obj: DocxDocument,
+        doc: DoclingDocument,
+    ) -> None:
+        paragraph = Paragraph(element, docx_obj)
         if paragraph.text is None:
             return
@@ -240,13 +284,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             numid = None
         # Handle lists
-        if numid is not None and ilevel is not None:
+        if (
+            numid is not None
+            and ilevel is not None
+            and p_style_id not in ["Title", "Heading"]
+        ):
             self.add_listitem(
-                element,
-                docx_obj,
                 doc,
-                p_style_id,
-                p_level,
                 numid,
                 ilevel,
                 text,
@@ -254,20 +298,30 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             )
             self.update_history(p_style_id, p_level, numid, ilevel)
             return
-        elif numid is None and self.prev_numid() is not None:  # Close list
-            for key, val in self.parents.items():
-                if key >= self.level_at_new_list:
+        elif (
+            numid is None
+            and self.prev_numid() is not None
+            and p_style_id not in ["Title", "Heading"]
+        ):  # Close list
+            if self.level_at_new_list:
+                for key in range(len(self.parents)):
+                    if key >= self.level_at_new_list:
+                        self.parents[key] = None
+                self.level = self.level_at_new_list - 1
+                self.level_at_new_list = None
+            else:
+                for key in range(len(self.parents)):
                     self.parents[key] = None
-            self.level = self.level_at_new_list - 1
-            self.level_at_new_list = None
+                self.level = 0
         if p_style_id in ["Title"]:
-            for key, val in self.parents.items():
+            for key in range(len(self.parents)):
                 self.parents[key] = None
             self.parents[0] = doc.add_text(
                 parent=None, label=DocItemLabel.TITLE, text=text
             )
         elif "Heading" in p_style_id:
-            self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
+            self.add_header(doc, p_level, text)
         elif p_style_id in [
             "Paragraph",
@@ -295,7 +349,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.update_history(p_style_id, p_level, numid, ilevel)
         return
-    def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
+    def add_header(
+        self, doc: DoclingDocument, curr_level: Optional[int], text: str
+    ) -> None:
         level = self.get_level()
         if isinstance(curr_level, int):
             if curr_level > level:
@@ -308,7 +364,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     )
             elif curr_level < level:
                 # remove the tail
-                for key, val in self.parents.items():
+                for key in range(len(self.parents)):
                     if key >= curr_level:
                         self.parents[key] = None
@@ -327,22 +383,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def add_listitem(
         self,
-        element,
-        docx_obj,
-        doc,
-        p_style_id,
-        p_level,
-        numid,
-        ilevel,
+        doc: DoclingDocument,
+        numid: int,
+        ilevel: int,
         text: str,
-        is_numbered=False,
-    ):
-        # is_numbered = is_numbered
+        is_numbered: bool = False,
+    ) -> None:
         enum_marker = ""
         level = self.get_level()
+        prev_indent = self.prev_indent()
         if self.prev_numid() is None:  # Open new list
-            self.level_at_new_list = level  # type: ignore
+            self.level_at_new_list = level
             self.parents[level] = doc.add_group(
                 label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
@@ -361,10 +413,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             )
         elif (
-            self.prev_numid() == numid and self.prev_indent() < ilevel
+            self.prev_numid() == numid
+            and self.level_at_new_list is not None
+            and prev_indent is not None
+            and prev_indent < ilevel
         ):  # Open indented list
             for i in range(
-                self.level_at_new_list + self.prev_indent() + 1,
+                self.level_at_new_list + prev_indent + 1,
                 self.level_at_new_list + ilevel + 1,
             ):
                 # Determine if this is an unordered list or an ordered list.
@@ -393,7 +448,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 text=text,
             )
-        elif self.prev_numid() == numid and ilevel < self.prev_indent():  # Close list
+        elif (
+            self.prev_numid() == numid
+            and self.level_at_new_list is not None
+            and prev_indent is not None
+            and ilevel < prev_indent
+        ):  # Close list
             for k, v in self.parents.items():
                 if k > self.level_at_new_list + ilevel:
                     self.parents[k] = None
@@ -411,7 +471,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             )
             self.listIter = 0
-        elif self.prev_numid() == numid or self.prev_indent() == ilevel:
+        elif self.prev_numid() == numid or prev_indent == ilevel:
             # TODO: Set marker and enumerated arguments if this is an enumeration element.
             self.listIter += 1
             if is_numbered:
@@ -425,31 +485,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             )
         return
-    def handle_tables(self, element, docx_obj, doc):
-        # Function to check if a cell has a colspan (gridSpan)
-        def get_colspan(cell):
-            grid_span = cell._element.xpath("@w:gridSpan")
-            if grid_span:
-                return int(grid_span[0])  # Return the number of columns spanned
-            return 1  # Default is 1 (no colspan)
-        # Function to check if a cell has a rowspan (vMerge)
-        def get_rowspan(cell):
-            v_merge = cell._element.xpath("@w:vMerge")
-            if v_merge:
-                return v_merge[
-                    0
-                ]  # 'restart' indicates the beginning of a rowspan, others are continuation
-            return 1
-        table = docx.table.Table(element, docx_obj)
+    def handle_tables(
+        self,
+        element: BaseOxmlElement,
+        docx_obj: DocxDocument,
+        doc: DoclingDocument,
+    ) -> None:
+        table: Table = Table(element, docx_obj)
         num_rows = len(table.rows)
-        num_cols = 0
-        for row in table.rows:
-            # Calculate the max number of columns
-            num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
+        num_cols = len(table.columns)
+        _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
         if num_rows == 1 and num_cols == 1:
             cell_element = table.rows[0].cells[0]
@@ -458,59 +503,56 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             self.walk_linear(cell_element._element, docx_obj, doc)
             return
-        # Initialize the table grid
-        table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
-        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
+        data = TableData(num_rows=num_rows, num_cols=num_cols)
+        cell_set: set[CT_Tc] = set()
         for row_idx, row in enumerate(table.rows):
+            _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
             col_idx = 0
-            for c, cell in enumerate(row.cells):
-                row_span = get_rowspan(cell)
-                col_span = get_colspan(cell)
-                cell_text = cell.text
-                # In case cell doesn't return text via docx library:
-                if len(cell_text) == 0:
-                    cell_xml = cell._element
-                    texts = [""]
-                    for elem in cell_xml.iter():
-                        if elem.tag.endswith("t"):  # <w:t> tags that contain text
-                            if elem.text:
-                                texts.append(elem.text)
-                    # Join the collected text
-                    cell_text = " ".join(texts).strip()
-                # Find the next available column in the grid
-                while table_grid[row_idx][col_idx] is not None:
-                    col_idx += 1
-                # Fill the grid with the cell value, considering rowspan and colspan
-                for i in range(row_span if row_span == "restart" else 1):
-                    for j in range(col_span):
-                        table_grid[row_idx + i][col_idx + j] = ""
-                cell = TableCell(
-                    text=cell_text,
-                    row_span=row_span,
-                    col_span=col_span,
-                    start_row_offset_idx=row_idx,
-                    end_row_offset_idx=row_idx + row_span,
+            while col_idx < num_cols:
+                cell: _Cell = row.cells[col_idx]
+                _log.debug(
+                    f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
+                )
+                if cell is None or cell._tc in cell_set:
+                    _log.debug(f"  skipped since repeated content")
+                    col_idx += cell.grid_span
+                    continue
+                else:
+                    cell_set.add(cell._tc)
+                spanned_idx = row_idx
+                spanned_tc: Optional[CT_Tc] = cell._tc
+                while spanned_tc == cell._tc:
+                    spanned_idx += 1
+                    spanned_tc = (
+                        table.rows[spanned_idx].cells[col_idx]._tc
+                        if spanned_idx < num_rows
+                        else None
+                    )
+                _log.debug(f"  spanned before row {spanned_idx}")
+                table_cell = TableCell(
+                    text=cell.text,
+                    row_span=spanned_idx - row_idx,
+                    col_span=cell.grid_span,
+                    start_row_offset_idx=row.grid_cols_before + row_idx,
+                    end_row_offset_idx=row.grid_cols_before + spanned_idx,
                     start_col_offset_idx=col_idx,
-                    end_col_offset_idx=col_idx + col_span,
+                    end_col_offset_idx=col_idx + cell.grid_span,
                     col_header=False,
                     row_header=False,
                 )
-                data.table_cells.append(cell)
+                data.table_cells.append(table_cell)
+                col_idx += cell.grid_span
         level = self.get_level()
         doc.add_table(data=data, parent=self.parents[level - 1])
         return
-    def handle_pictures(self, element, docx_obj, drawing_blip, doc):
-        def get_docx_image(element, drawing_blip):
+    def handle_pictures(
+        self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
+    ) -> None:
+        def get_docx_image(drawing_blip):
             rId = drawing_blip[0].get(
                 "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
             )
@@ -520,11 +562,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 image_data = image_part.blob  # Get the binary image data
             return image_data
-        image_data = get_docx_image(element, drawing_blip)
-        image_bytes = BytesIO(image_data)
         level = self.get_level()
         # Open the BytesIO object with PIL to create an Image
         try:
+            image_data = get_docx_image(drawing_blip)
+            image_bytes = BytesIO(image_data)
             pil_image = Image.open(image_bytes)
             doc.add_picture(
                 parent=self.parents[level - 1],

docling/cli/main.py CHANGED Viewed

@@ -219,6 +219,13 @@ def convert(
         bool,
         typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
     ] = False,
+    enrich_picture_classes: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="Enable the picture classification enrichment model in the pipeline.",
+        ),
+    ] = False,
     artifacts_path: Annotated[
         Optional[Path],
         typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -375,6 +382,7 @@ def convert(
             do_table_structure=True,
             do_code_enrichment=enrich_code,
             do_formula_enrichment=enrich_formula,
+            do_picture_classification=enrich_picture_classes,
             document_timeout=document_timeout,
         )
         pipeline_options.table_structure_options.do_cell_matching = (

docling 2.17.0__py3-none-any.whl → 2.19.0__py3-none-any.whl

docling 2.17.0py3-none-any.whl → 2.19.0py3-none-any.whl