PyPI - docling - Versions diffs - 2.57.0__py3-none-any.whl → 2.59.0__py3-none-any.whl - Mend

docling 2.57.0__py3-none-any.whl → 2.59.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling might be problematic. Click here for more details.

Files changed (35)

docling/backend/abstract_backend.py +24 -3
docling/backend/asciidoc_backend.py +3 -3
docling/backend/docling_parse_v4_backend.py +15 -4
docling/backend/html_backend.py +130 -20
docling/backend/md_backend.py +27 -5
docling/backend/msexcel_backend.py +121 -29
docling/backend/mspowerpoint_backend.py +2 -2
docling/backend/msword_backend.py +18 -18
docling/backend/pdf_backend.py +9 -2
docling/backend/pypdfium2_backend.py +12 -3
docling/cli/main.py +104 -38
docling/datamodel/asr_model_specs.py +408 -6
docling/datamodel/backend_options.py +82 -0
docling/datamodel/base_models.py +19 -2
docling/datamodel/document.py +81 -48
docling/datamodel/pipeline_options_asr_model.py +21 -1
docling/datamodel/pipeline_options_vlm_model.py +1 -0
docling/document_converter.py +37 -45
docling/document_extractor.py +12 -11
docling/models/api_vlm_model.py +5 -3
docling/models/picture_description_vlm_model.py +5 -1
docling/models/readingorder_model.py +6 -7
docling/models/vlm_models_inline/hf_transformers_model.py +13 -3
docling/models/vlm_models_inline/mlx_model.py +9 -3
docling/models/vlm_models_inline/nuextract_transformers_model.py +13 -3
docling/models/vlm_models_inline/vllm_model.py +42 -8
docling/pipeline/asr_pipeline.py +149 -6
docling/utils/api_image_request.py +20 -9
docling/utils/layout_postprocessor.py +23 -24
{docling-2.57.0.dist-info → docling-2.59.0.dist-info}/METADATA +11 -8
{docling-2.57.0.dist-info → docling-2.59.0.dist-info}/RECORD +35 -34
{docling-2.57.0.dist-info → docling-2.59.0.dist-info}/WHEEL +0 -0
{docling-2.57.0.dist-info → docling-2.59.0.dist-info}/entry_points.txt +0 -0
{docling-2.57.0.dist-info → docling-2.59.0.dist-info}/licenses/LICENSE +0 -0
{docling-2.57.0.dist-info → docling-2.59.0.dist-info}/top_level.txt +0 -0

docling/backend/msexcel_backend.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Optional, Union, cast
+from typing import Annotated, Any, Optional, Union, cast
 from docling_core.types.doc import (
     BoundingBox,
@@ -23,7 +23,8 @@ from openpyxl.drawing.image import Image
 from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
 from openpyxl.worksheet.worksheet import Worksheet
 from PIL import Image as PILImage
-from pydantic import BaseModel, NonNegativeInt, PositiveInt
+from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
+from pydantic.dataclasses import dataclass
 from typing_extensions import override
 from docling.backend.abstract_backend import (
@@ -36,6 +37,32 @@ from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
+@dataclass
+class DataRegion:
+    """Represents the bounding rectangle of non-empty cells in a worksheet."""
+    min_row: Annotated[
+        PositiveInt, Field(description="Smallest row index (1-based index).")
+    ]
+    max_row: Annotated[
+        PositiveInt, Field(description="Largest row index (1-based index).")
+    ]
+    min_col: Annotated[
+        PositiveInt, Field(description="Smallest column index (1-based index).")
+    ]
+    max_col: Annotated[
+        PositiveInt, Field(description="Largest column index (1-based index).")
+    ]
+    def width(self) -> PositiveInt:
+        """Number of columns in the data region."""
+        return self.max_col - self.min_col + 1
+    def height(self) -> PositiveInt:
+        """Number of rows in the data region."""
+        return self.max_row - self.min_row + 1
 class ExcelCell(BaseModel):
     """Represents an Excel cell.
@@ -112,10 +139,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         self.workbook = None
         try:
             if isinstance(self.path_or_stream, BytesIO):
-                self.workbook = load_workbook(filename=self.path_or_stream)
+                self.workbook = load_workbook(
+                    filename=self.path_or_stream, data_only=True
+                )
             elif isinstance(self.path_or_stream, Path):
-                self.workbook = load_workbook(filename=str(self.path_or_stream))
+                self.workbook = load_workbook(
+                    filename=str(self.path_or_stream), data_only=True
+                )
             self.valid = self.workbook is not None
         except Exception as e:
@@ -294,6 +325,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         return doc
+    def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
+        """Find the true data boundaries (min/max rows and columns) in a worksheet.
+        This function scans all cells to find the smallest rectangular region that contains
+        all non-empty cells or merged cell ranges. It returns the minimal and maximal
+        row/column indices that bound the actual data region.
+        Args:
+            sheet: The worksheet to analyze.
+        Returns:
+            A data region representing the smallest rectangle that covers all data and merged cells.
+            If the sheet is empty, returns (1, 1, 1, 1) by default.
+        """
+        min_row, min_col = None, None
+        max_row, max_col = 0, 0
+        for cell in sheet._cells.values():
+            if cell.value is not None:
+                r, c = cell.row, cell.column
+                min_row = r if min_row is None else min(min_row, r)
+                min_col = c if min_col is None else min(min_col, c)
+                max_row = max(max_row, r)
+                max_col = max(max_col, c)
+        # Expand bounds to include merged cells
+        for merged in sheet.merged_cells.ranges:
+            min_row = (
+                merged.min_row if min_row is None else min(min_row, merged.min_row)
+            )
+            min_col = (
+                merged.min_col if min_col is None else min(min_col, merged.min_col)
+            )
+            max_row = max(max_row, merged.max_row)
+            max_col = max(max_col, merged.max_col)
+        # If no data found, default to (1, 1, 1, 1)
+        if min_row is None or min_col is None:
+            min_row = min_col = max_row = max_col = 1
+        return DataRegion(min_row, max_row, min_col, max_col)
     def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
         """Find all compact rectangular data tables in an Excel worksheet.
@@ -303,18 +376,31 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         Returns:
             A list of ExcelTable objects representing the data tables.
         """
+        bounds: DataRegion = self._find_true_data_bounds(
+            sheet
+        )  # The true data boundaries
         tables: list[ExcelTable] = []  # List to store found tables
         visited: set[tuple[int, int]] = set()  # Track already visited cells
-        # Iterate over all cells in the sheet
-        for ri, row in enumerate(sheet.iter_rows(values_only=False)):
-            for rj, cell in enumerate(row):
-                # Skip empty or already visited cells
+        # Limit scan to actual data bounds
+        for ri, row in enumerate(
+            sheet.iter_rows(
+                min_row=bounds.min_row,
+                max_row=bounds.max_row,
+                min_col=bounds.min_col,
+                max_col=bounds.max_col,
+                values_only=False,
+            ),
+            start=bounds.min_row - 1,
+        ):
+            for rj, cell in enumerate(row, start=bounds.min_col - 1):
                 if cell.value is None or (ri, rj) in visited:
                     continue
                 # If the cell starts a new table, find its bounds
-                table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
+                table_bounds, visited_cells = self._find_table_bounds(
+                    sheet, ri, rj, bounds.max_row, bounds.max_col
+                )
                 visited.update(visited_cells)  # Mark these cells as visited
                 tables.append(table_bounds)
@@ -326,6 +412,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         sheet: Worksheet,
         start_row: int,
         start_col: int,
+        max_row: int,
+        max_col: int,
     ) -> tuple[ExcelTable, set[tuple[int, int]]]:
         """Determine the bounds of a compact rectangular table.
@@ -333,14 +421,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             sheet: The Excel worksheet to be parsed.
             start_row: The row number of the starting cell.
             start_col: The column number of the starting cell.
+            max_row: Maximum row boundary from true data bounds.
+            max_col: Maximum column boundary from true data bounds.
         Returns:
             A tuple with an Excel table and a set of cell coordinates.
         """
         _log.debug("find_table_bounds")
-        max_row = self._find_table_bottom(sheet, start_row, start_col)
-        max_col = self._find_table_right(sheet, start_row, start_col)
+        table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
+        table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)
         # Collect the data within the bounds
         data = []
@@ -348,9 +438,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         for ri, row in enumerate(
             sheet.iter_rows(
                 min_row=start_row + 1,  # start_row is 0-based but iter_rows is 1-based
-                max_row=max_row + 1,
+                max_row=table_max_row + 1,
                 min_col=start_col + 1,
-                max_col=max_col + 1,
+                max_col=table_max_col + 1,
                 values_only=False,
             ),
             start_row,
@@ -390,15 +480,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         return (
             ExcelTable(
                 anchor=(start_col, start_row),
-                num_rows=max_row + 1 - start_row,
-                num_cols=max_col + 1 - start_col,
+                num_rows=table_max_row + 1 - start_row,
+                num_cols=table_max_col + 1 - start_col,
                 data=data,
             ),
             visited_cells,
         )
     def _find_table_bottom(
-        self, sheet: Worksheet, start_row: int, start_col: int
+        self, sheet: Worksheet, start_row: int, start_col: int, max_row: int
     ) -> int:
         """Find the bottom boundary of a table.
@@ -406,16 +496,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             sheet: The Excel worksheet to be parsed.
             start_row: The starting row of the table.
             start_col: The starting column of the table.
+            max_row: Maximum row boundary from true data bounds.
         Returns:
             The row index representing the bottom boundary of the table.
         """
-        max_row: int = start_row
+        table_max_row: int = start_row
         for ri, (cell,) in enumerate(
             sheet.iter_rows(
                 min_row=start_row + 2,
-                max_row=sheet.max_row,
+                max_row=max_row,
                 min_col=start_col + 1,
                 max_col=start_col + 1,
                 values_only=False,
@@ -431,16 +522,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             if cell.value is None and not merged_range:
                 break  # Stop if the cell is empty and not merged
-            # Expand max_row to include the merged range if applicable
+            # Expand table_max_row to include the merged range if applicable
             if merged_range:
-                max_row = max(max_row, merged_range.max_row - 1)
+                table_max_row = max(table_max_row, merged_range.max_row - 1)
             else:
-                max_row = ri
+                table_max_row = ri
-        return max_row
+        return table_max_row
     def _find_table_right(
-        self, sheet: Worksheet, start_row: int, start_col: int
+        self, sheet: Worksheet, start_row: int, start_col: int, max_col: int
     ) -> int:
         """Find the right boundary of a table.
@@ -448,18 +539,19 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             sheet: The Excel worksheet to be parsed.
             start_row: The starting row of the table.
             start_col: The starting column of the table.
+            max_col: The actual max column of the table.
         Returns:
             The column index representing the right boundary of the table."
         """
-        max_col: int = start_col
+        table_max_col: int = start_col
         for rj, (cell,) in enumerate(
             sheet.iter_cols(
                 min_row=start_row + 1,
                 max_row=start_row + 1,
                 min_col=start_col + 2,
-                max_col=sheet.max_column,
+                max_col=max_col,
                 values_only=False,
             ),
             start_col + 1,
@@ -473,13 +565,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             if cell.value is None and not merged_range:
                 break  # Stop if the cell is empty and not merged
-            # Expand max_col to include the merged range if applicable
+            # Expand table_max_col to include the merged range if applicable
             if merged_range:
-                max_col = max(max_col, merged_range.max_col - 1)
+                table_max_col = max(table_max_col, merged_range.max_col - 1)
             else:
-                max_col = rj
+                table_max_col = rj
-        return max_col
+        return table_max_col
     def _find_images_in_sheet(
         self, doc: DoclingDocument, sheet: Worksheet

docling/backend/mspowerpoint_backend.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import Union
 from docling_core.types.doc import (
     BoundingBox,
@@ -80,7 +80,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         self.path_or_stream = None
     @classmethod
-    def supported_formats(cls) -> Set[InputFormat]:
+    def supported_formats(cls) -> set[InputFormat]:
         return {InputFormat.PPTX}
     def convert(self) -> DoclingDocument:

docling/backend/msword_backend.py CHANGED Viewed

@@ -3,7 +3,7 @@ import re
 from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Callable, List, Optional, Union
+from typing import Any, Callable, Optional, Union
 from docling_core.types.doc import (
     DocItemLabel,
@@ -69,7 +69,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.numbered_headers: dict[int, int] = {}
         self.equation_bookends: str = "<eq>{EQ}</eq>"
         # Track processed textbox elements to avoid duplication
-        self.processed_textbox_elements: List[int] = []
+        self.processed_textbox_elements: list[int] = []
         self.docx_to_pdf_converter: Optional[Callable] = None
         self.docx_to_pdf_converter_init = False
         self.display_drawingml_warning = True
@@ -726,8 +726,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         textbox_elements: list,
         docx_obj: DocxDocument,
         doc: DoclingDocument,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
         """Process textbox content and add it to the document structure."""
         level = self._get_level()
         # Create a textbox group to contain all text from the textbox
@@ -856,8 +856,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         element: BaseOxmlElement,
         docx_obj: DocxDocument,
         doc: DoclingDocument,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
         paragraph = Paragraph(element, docx_obj)
         paragraph_elements = self._get_paragraph_elements(paragraph)
         text, equations = self._handle_equations_in_text(
@@ -1032,8 +1032,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         curr_level: Optional[int],
         text: str,
         is_numbered_style: bool = False,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
         level = self._get_level()
         if isinstance(curr_level, int):
             if curr_level > level:
@@ -1102,8 +1102,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         marker: str,
         enumerated: bool,
         level: int,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
         # This should not happen by construction
         if not isinstance(self.parents[level], ListGroup):
             return elem_ref
@@ -1148,8 +1148,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         ilevel: int,
         elements: list,
         is_numbered: bool = False,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
         # this method is always called with is_numbered. Numbered lists should be properly addressed.
         if not elements:
             return elem_ref
@@ -1244,8 +1244,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         element: BaseOxmlElement,
         docx_obj: DocxDocument,
         doc: DoclingDocument,
-    ) -> List[RefItem]:
-        elem_ref: List[RefItem] = []
+    ) -> list[RefItem]:
+        elem_ref: list[RefItem] = []
         table: Table = Table(element, docx_obj)
         num_rows = len(table.rows)
         num_cols = len(table.columns)
@@ -1299,13 +1299,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 else:
                     text = text.replace("<eq>", "$").replace("</eq>", "$")
-                provs_in_cell: List[RefItem] = []
+                provs_in_cell: list[RefItem] = []
                 _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
                 ref_for_rich_cell = provs_in_cell[0]
                 rich_table_cell = False
                 def group_cell_elements(
-                    group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
+                    group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
                 ) -> RefItem:
                     group_element = doc.add_group(
                         label=GroupLabel.UNSPECIFIED,
@@ -1379,7 +1379,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def _handle_pictures(
         self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
-    ) -> List[RefItem]:
+    ) -> list[RefItem]:
         def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
             image_data: Optional[bytes] = None
             rId = drawing_blip[0].get(
@@ -1391,7 +1391,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 image_data = image_part.blob  # Get the binary image data
             return image_data
-        elem_ref: List[RefItem] = []
+        elem_ref: list[RefItem] = []
         level = self._get_level()
         # Open the BytesIO object with PIL to create an Image
         image_data: Optional[bytes] = get_docx_image(drawing_blip)

docling/backend/pdf_backend.py CHANGED Viewed

@@ -9,6 +9,7 @@ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from PIL import Image
 from docling.backend.abstract_backend import PaginatedDocumentBackend
+from docling.datamodel.backend_options import PdfBackendOptions
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
@@ -50,8 +51,14 @@ class PdfPageBackend(ABC):
 class PdfDocumentBackend(PaginatedDocumentBackend):
-    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
-        super().__init__(in_doc, path_or_stream)
+    def __init__(
+        self,
+        in_doc: InputDocument,
+        path_or_stream: Union[BytesIO, Path],
+        options: PdfBackendOptions = PdfBackendOptions(),
+    ):
+        super().__init__(in_doc, path_or_stream, options)
+        self.options: PdfBackendOptions
         if self.input_format is not InputFormat.PDF:
             if self.input_format is InputFormat.IMAGE:

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -20,6 +20,7 @@ from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.backend_options import PdfBackendOptions
 from docling.utils.locks import pypdfium2_lock
@@ -370,12 +371,20 @@ class PyPdfiumPageBackend(PdfPageBackend):
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
-        super().__init__(in_doc, path_or_stream)
+    def __init__(
+        self,
+        in_doc: "InputDocument",
+        path_or_stream: Union[BytesIO, Path],
+        options: PdfBackendOptions = PdfBackendOptions(),
+    ):
+        super().__init__(in_doc, path_or_stream, options)
+        password = (
+            self.options.password.get_secret_value() if self.options.password else None
+        )
         try:
             with pypdfium2_lock:
-                self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+                self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
         except PdfiumError as e:
             raise RuntimeError(
                 f"pypdfium could not load document with hash {self.document_hash}"

docling 2.57.0__py3-none-any.whl → 2.59.0__py3-none-any.whl

Potentially problematic release.

docling 2.57.0__py3-none-any.whl → 2.59.0__py3-none-any.whl