PyPI - docling - Versions diffs - 2.5.2__py3-none-any.whl → 2.7.0__py3-none-any.whl - Mend

docling: 2.5.2-py3-none-any.whl → 2.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

docling/backend/msexcel_backend.py +374 -0
docling/backend/mspowerpoint_backend.py +16 -1
docling/backend/msword_backend.py +26 -11
docling/cli/main.py +55 -19
docling/datamodel/base_models.py +6 -0
docling/datamodel/pipeline_options.py +23 -4
docling/document_converter.py +13 -4
docling/models/ocr_mac_model.py +118 -0
docling/pipeline/standard_pdf_pipeline.py +12 -0
{docling-2.5.2.dist-info → docling-2.7.0.dist-info}/METADATA +11 -7
{docling-2.5.2.dist-info → docling-2.7.0.dist-info}/RECORD +14 -12
{docling-2.5.2.dist-info → docling-2.7.0.dist-info}/LICENSE +0 -0
{docling-2.5.2.dist-info → docling-2.7.0.dist-info}/WHEEL +0 -0
{docling-2.5.2.dist-info → docling-2.7.0.dist-info}/entry_points.txt +0 -0

docling/backend/msexcel_backend.py ADDED Viewed

@@ -0,0 +1,374 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Dict, Set, Tuple, Union
+from docling_core.types.doc import (
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    ImageRef,
+    TableCell,
+    TableData,
+)
+# from lxml import etree
+from openpyxl import Workbook, load_workbook
+from openpyxl.cell.cell import Cell
+from openpyxl.drawing.image import Image
+from openpyxl.worksheet.worksheet import Worksheet
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+_log = logging.getLogger(__name__)
+from typing import Any, List
+from pydantic import BaseModel
+class ExcelCell(BaseModel):
+    # One cell of a detected table. `row`/`col` are zero-based offsets from the
+    # table's top-left cell (the backend stores `ri - start_row`, `rj - start_col`).
+    row: int
+    col: int
+    # `str()` of the openpyxl cell value.
+    text: str
+    # Extent of the merged range containing the cell; 1 when the cell is not merged.
+    row_span: int
+    col_span: int
+class ExcelTable(BaseModel):
+    # A rectangular block of cells found in a worksheet.
+    # `num_rows`/`num_cols` are the inclusive extent of the block.
+    num_rows: int
+    num_cols: int
+    # Cells of the block in row-major visiting order; positions are table-relative.
+    data: List[ExcelCell]
+class MsExcelDocumentBackend(DeclarativeDocumentBackend):
+    """Backend that turns an XLSX workbook into a DoclingDocument.
+    Every sheet becomes a section group, and each compact rectangular block of
+    non-empty cells found in the sheet is added to that group as a table.
+    """
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Load the workbook from a filesystem path or an in-memory stream.
+        Raises:
+            RuntimeError: if openpyxl fails to load the document.
+        """
+        super().__init__(in_doc, path_or_stream)
+        # Initialise the parents for the hierarchy
+        self.max_levels = 10
+        self.parents: Dict[int, Any] = {
+            level: None for level in range(-1, self.max_levels)
+        }
+        self.workbook: Union[Workbook, None] = None
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                self.workbook = load_workbook(filename=self.path_or_stream)
+            elif isinstance(self.path_or_stream, Path):
+                self.workbook = load_workbook(filename=str(self.path_or_stream))
+            self.valid = True
+        except Exception as e:
+            self.valid = False
+            # The message previously named the PowerPoint backend (copy-paste slip).
+            raise RuntimeError(
+                f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
+            ) from e
+    def is_valid(self) -> bool:
+        """Return True when the workbook was loaded without raising."""
+        _log.info(f"valid: {self.valid}")
+        return self.valid
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return True
+    def unload(self):
+        """Close the input stream (if any) and drop the reference to the input."""
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.XLSX}
+    def convert(self) -> DoclingDocument:
+        """Parse the XLSX into a structured document model.
+        Raises:
+            RuntimeError: if the backend failed to initialise.
+        """
+        origin = DocumentOrigin(
+            filename=self.file.name or "file.xlsx",
+            mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            binary_hash=self.document_hash,
+        )
+        doc = DoclingDocument(name=self.file.stem or "file.xlsx", origin=origin)
+        if self.is_valid():
+            doc = self._convert_workbook(doc)
+        else:
+            raise RuntimeError(
+                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+            )
+        return doc
+    def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
+        """Add one section group per sheet and convert each sheet into it."""
+        if self.workbook is not None:
+            # Iterate over all sheets
+            for sheet_name in self.workbook.sheetnames:
+                _log.info(f"Processing sheet: {sheet_name}")
+                # Access the sheet by name
+                sheet = self.workbook[sheet_name]
+                # The group of the current sheet is the parent of everything found in it.
+                self.parents[0] = doc.add_group(
+                    parent=None,
+                    label=GroupLabel.SECTION,
+                    name=f"sheet: {sheet_name}",
+                )
+                doc = self._convert_sheet(doc, sheet)
+        else:
+            _log.error("Workbook is not initialized.")
+        return doc
+    def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
+        """Convert the tables, then the images, of a single sheet."""
+        doc = self._find_tables_in_sheet(doc, sheet)
+        doc = self._find_images_in_sheet(doc, sheet)
+        return doc
+    def _find_tables_in_sheet(
+        self, doc: DoclingDocument, sheet: Worksheet
+    ) -> DoclingDocument:
+        """Add every table detected in ``sheet`` to ``doc`` under the sheet group."""
+        tables = self._find_data_tables(sheet)
+        for excel_table in tables:
+            table_data = TableData(
+                num_rows=excel_table.num_rows,
+                num_cols=excel_table.num_cols,
+                table_cells=[],
+            )
+            for excel_cell in excel_table.data:
+                cell = TableCell(
+                    text=excel_cell.text,
+                    row_span=excel_cell.row_span,
+                    col_span=excel_cell.col_span,
+                    start_row_offset_idx=excel_cell.row,
+                    end_row_offset_idx=excel_cell.row + excel_cell.row_span,
+                    start_col_offset_idx=excel_cell.col,
+                    end_col_offset_idx=excel_cell.col + excel_cell.col_span,
+                    col_header=False,
+                    row_header=False,
+                )
+                table_data.table_cells.append(cell)
+            doc.add_table(data=table_data, parent=self.parents[0])
+        return doc
+    def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
+        """Find all compact rectangular data tables in a sheet."""
+        tables: List[ExcelTable] = []  # List to store found tables
+        visited: Set[Tuple[int, int]] = set()  # Track already visited cells
+        # Iterate over all cells in the sheet (ri/rj are zero-based)
+        for ri, row in enumerate(sheet.iter_rows(values_only=False)):
+            for rj, cell in enumerate(row):
+                # Skip empty or already visited cells
+                if cell.value is None or (ri, rj) in visited:
+                    continue
+                # If the cell starts a new table, find its bounds
+                table_bounds, visited_cells = self._find_table_bounds(
+                    sheet, ri, rj, visited
+                )
+                visited.update(visited_cells)  # Mark these cells as visited
+                tables.append(table_bounds)
+        return tables
+    def _find_table_bounds(
+        self,
+        sheet: Worksheet,
+        start_row: int,
+        start_col: int,
+        visited: Set[Tuple[int, int]],
+    ) -> Tuple[ExcelTable, Set[Tuple[int, int]]]:
+        """Determine the bounds of the compact rectangular table starting at a cell.
+        ``start_row``/``start_col`` are zero-based. ``visited`` is accepted for
+        interface compatibility and is not read here.
+        Returns:
+        - An ExcelTable with the extent and the cells of the table.
+        - The set of zero-based (row, col) coordinates covered by the table.
+        """
+        _log.info("find_table_bounds")
+        max_row = self._find_table_bottom(sheet, start_row, start_col)
+        max_col = self._find_table_right(sheet, start_row, start_col)
+        # Looked up once; the ranges do not change while the table is scanned.
+        merged_ranges = sheet.merged_cells.ranges
+        # Collect the data within the bounds
+        data = []
+        visited_cells: Set[Tuple[int, int]] = set()
+        for ri in range(start_row, max_row + 1):
+            for rj in range(start_col, max_col + 1):
+                # Cells already covered by an earlier merged span emit nothing.
+                if (ri, rj) in visited_cells:
+                    continue
+                cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing
+                # Check if the cell belongs to a merged range
+                row_span = 1
+                col_span = 1
+                for merged_range in merged_ranges:
+                    if (
+                        merged_range.min_row <= ri + 1 <= merged_range.max_row
+                        and merged_range.min_col <= rj + 1 <= merged_range.max_col
+                    ):
+                        row_span = merged_range.max_row - merged_range.min_row + 1
+                        col_span = merged_range.max_col - merged_range.min_col + 1
+                        break
+                # NOTE(review): empty cells inside the bounds are emitted with the
+                # text "None" because of str(); confirm whether "" is preferable.
+                data.append(
+                    ExcelCell(
+                        row=ri - start_row,
+                        col=rj - start_col,
+                        text=str(cell.value),
+                        row_span=row_span,
+                        col_span=col_span,
+                    )
+                )
+                # Mark all cells in the span as visited
+                for span_row in range(ri, ri + row_span):
+                    for span_col in range(rj, rj + col_span):
+                        visited_cells.add((span_row, span_col))
+        return (
+            ExcelTable(
+                num_rows=max_row + 1 - start_row,
+                num_cols=max_col + 1 - start_col,
+                data=data,
+            ),
+            visited_cells,
+        )
+    def _find_table_bottom(
+        self, sheet: Worksheet, start_row: int, start_col: int
+    ) -> int:
+        """Return the zero-based index of the last table row, scanning down ``start_col``."""
+        max_row = start_row
+        while max_row < sheet.max_row - 1:
+            # Look at the cell directly below the current bottom (1-based: max_row + 2)
+            cell = sheet.cell(row=max_row + 2, column=start_col + 1)
+            # Check if the cell is part of a merged range
+            merged_range = next(
+                (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
+                None,
+            )
+            if cell.value is None and not merged_range:
+                break  # Stop if the cell is empty and not merged
+            # Expand max_row to include the merged range if applicable
+            if merged_range:
+                max_row = max(max_row, merged_range.max_row - 1)
+            else:
+                max_row += 1
+        return max_row
+    def _find_table_right(
+        self, sheet: Worksheet, start_row: int, start_col: int
+    ) -> int:
+        """Return the zero-based index of the last table column, scanning along ``start_row``."""
+        max_col = start_col
+        while max_col < sheet.max_column - 1:
+            # Look at the cell directly right of the current edge (1-based: max_col + 2)
+            cell = sheet.cell(row=start_row + 1, column=max_col + 2)
+            # Check if the cell is part of a merged range
+            merged_range = next(
+                (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
+                None,
+            )
+            if cell.value is None and not merged_range:
+                break  # Stop if the cell is empty and not merged
+            # Expand max_col to include the merged range if applicable
+            if merged_range:
+                max_col = max(max_col, merged_range.max_col - 1)
+            else:
+                max_col += 1
+        return max_col
+    def _find_images_in_sheet(
+        self, doc: DoclingDocument, sheet: Worksheet
+    ) -> DoclingDocument:
+        """Placeholder for image and chart extraction; returns ``doc`` unchanged."""
+        # The disabled snippets below are kept as comments (not string literals)
+        # so that they cannot be picked up as the method's docstring.
+        # FIXME: mypy does not agree with _images ...
+        # # Iterate over images in the sheet
+        # for idx, image in enumerate(sheet._images):  # Access embedded images
+        #     image_bytes = BytesIO(image.ref.blob)
+        #     pil_image = Image.open(image_bytes)
+        #     doc.add_picture(
+        #         parent=self.parents[0],
+        #         image=ImageRef.from_pil(image=pil_image, dpi=72),
+        #         caption=None,
+        #     )
+        # FIXME: mypy does not agree with _charts ...
+        # for idx, chart in enumerate(sheet._charts):  # Access embedded charts
+        #     chart_path = f"chart_{idx + 1}.png"
+        #     _log.info(
+        #         f"Chart found, but dynamic rendering is required for: {chart_path}"
+        #     )
+        #     _log.info(f"Chart {idx + 1}:")
+        #     # Chart type
+        #     _log.info(f"Type: {type(chart).__name__}")
+        #     # Title
+        #     if chart.title:
+        #         _log.info(f"Title: {chart.title}")
+        #     else:
+        #         _log.info("No title")
+        #     # Data series
+        #     for series in chart.series:
+        #         _log.info(" => series ...")
+        #         _log.info(f"Data Series: {series.title}")
+        #         _log.info(f"Values: {series.values}")
+        #         _log.info(f"Categories: {series.categories}")
+        #     # Position
+        #     # _log.info(f"Anchor Cell: {chart.anchor}")
+        return doc

docling/backend/mspowerpoint_backend.py CHANGED Viewed

@@ -10,11 +10,13 @@ from docling_core.types.doc import (
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
+    ImageRef,
     ProvenanceItem,
     Size,
     TableCell,
     TableData,
 )
+from PIL import Image
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -268,9 +270,22 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         return
     def handle_pictures(self, shape, parent_slide, slide_ind, doc):
+        # Get the image bytes
+        image = shape.image
+        image_bytes = image.blob
+        im_dpi, _ = image.dpi
+        # Open it with PIL
+        pil_image = Image.open(BytesIO(image_bytes))
         # shape has picture
         prov = self.generate_prov(shape, slide_ind, "")
-        doc.add_picture(parent=parent_slide, caption=None, prov=prov)
+        doc.add_picture(
+            parent=parent_slide,
+            image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
+            caption=None,
+            prov=prov,
+        )
         return
     def handle_tables(self, shape, parent_slide, slide_ind, doc):

docling/backend/msword_backend.py CHANGED Viewed

@@ -9,10 +9,12 @@ from docling_core.types.doc import (
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
+    ImageRef,
     TableCell,
     TableData,
 )
 from lxml import etree
+from PIL import Image
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -130,13 +132,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
         for element in body:
             tag_name = etree.QName(element).localname
-            # Check for Inline Images (drawings or blip elements)
-            found_drawing = etree.ElementBase.xpath(
-                element, ".//w:drawing", namespaces=self.xml_namespaces
-            )
-            found_pict = etree.ElementBase.xpath(
-                element, ".//w:pict", namespaces=self.xml_namespaces
-            )
+            # Check for Inline Images (blip elements)
+            drawing_blip = element.xpath(".//a:blip")
             # Check for Tables
             if element.tag.endswith("tbl"):
@@ -145,8 +142,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 except Exception:
                     _log.debug("could not parse a table, broken docx table")
-            elif found_drawing or found_pict:
-                self.handle_pictures(element, docx_obj, doc)
+            elif drawing_blip:
+                self.handle_pictures(element, docx_obj, drawing_blip, doc)
             # Check for Text
             elif tag_name in ["p"]:
                 self.handle_text_elements(element, docx_obj, doc)
@@ -491,6 +488,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         doc.add_table(data=data, parent=self.parents[level - 1])
         return
-    def handle_pictures(self, element, docx_obj, doc):
-        doc.add_picture(parent=self.parents[self.level], caption=None)
+    def handle_pictures(self, element, docx_obj, drawing_blip, doc):
+        """Add the image referenced by the first blip of ``drawing_blip`` to ``doc``.
+        When the blip's relationship id cannot be resolved in the document part,
+        a picture without image data is added instead of failing.
+        """
+        def get_docx_image(element, drawing_blip):
+            # Return the bytes of the image part the first blip points to, or
+            # None when its relationship id is not known to the document part
+            # (previously this path raised UnboundLocalError).
+            rId = drawing_blip[0].get(
+                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
+            )
+            if rId in docx_obj.part.rels:
+                # Access the image part using the relationship ID
+                image_part = docx_obj.part.rels[rId].target_part
+                return image_part.blob  # Get the binary image data
+            return None
+        image_data = get_docx_image(element, drawing_blip)
+        if image_data is None:
+            _log.warning("could not resolve the embedded image of a docx picture")
+            doc.add_picture(parent=self.parents[self.level], caption=None)
+            return
+        image_bytes = BytesIO(image_data)
+        # Open the BytesIO object with PIL to create an Image
+        pil_image = Image.open(image_bytes)
+        doc.add_picture(
+            parent=self.parents[self.level],
+            image=ImageRef.from_pil(image=pil_image, dpi=72),
+            caption=None,
+        )
         return

docling/cli/main.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import importlib
 import json
 import logging
+import re
 import time
 import warnings
 from enum import Enum
@@ -23,6 +24,7 @@ from docling.datamodel.base_models import (
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
+    OcrMacOptions,
     OcrOptions,
     PdfPipelineOptions,
     TableFormerMode,
@@ -73,6 +75,7 @@ class OcrEngine(str, Enum):
     EASYOCR = "easyocr"
     TESSERACT_CLI = "tesseract_cli"
     TESSERACT = "tesseract"
+    OCRMAC = "ocrmac"
 def export_documents(
@@ -129,6 +132,12 @@ def export_documents(
     )
+def _split_list(raw: Optional[str]) -> Optional[List[str]]:
+    """Split a comma- or semicolon-separated option value into its items.
+    Whitespace around each item is removed and empty items are dropped, so
+    ``"en, de;"`` yields ``["en", "de"]``. ``None`` is passed through so the
+    caller can tell "option not given" from an explicit value.
+    """
+    if raw is None:
+        return None
+    stripped = (item.strip() for item in re.split(r"[;,]", raw))
+    return [item for item in stripped if item]
 @app.command(no_args_is_help=True)
 def convert(
     input_sources: Annotated[
@@ -163,6 +172,13 @@ def convert(
     ocr_engine: Annotated[
         OcrEngine, typer.Option(..., help="The OCR engine to use.")
     ] = OcrEngine.EASYOCR,
+    ocr_lang: Annotated[
+        Optional[str],
+        typer.Option(
+            ...,
+            help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
+        ),
+    ] = None,
     pdf_backend: Annotated[
         PdfBackend, typer.Option(..., help="The PDF backend to use.")
     ] = PdfBackend.DLPARSE_V1,
@@ -185,6 +201,15 @@ def convert(
     output: Annotated[
         Path, typer.Option(..., help="Output directory where results are saved.")
     ] = Path("."),
+    verbose: Annotated[
+        int,
+        typer.Option(
+            "--verbose",
+            "-v",
+            count=True,
+            help="Set the verbosity level. -v for info logging, -vv for debug logging.",
+        ),
+    ] = 0,
     version: Annotated[
         Optional[bool],
         typer.Option(
@@ -195,7 +220,12 @@ def convert(
         ),
     ] = None,
 ):
-    logging.basicConfig(level=logging.INFO)
+    if verbose == 0:
+        logging.basicConfig(level=logging.WARNING)
+    elif verbose == 1:
+        logging.basicConfig(level=logging.INFO)
+    elif verbose == 2:
+        logging.basicConfig(level=logging.DEBUG)
     if from_formats is None:
         from_formats = [e for e in InputFormat]
@@ -224,15 +254,20 @@ def convert(
     export_txt = OutputFormat.TEXT in to_formats
     export_doctags = OutputFormat.DOCTAGS in to_formats
-    match ocr_engine:
-        case OcrEngine.EASYOCR:
-            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
-        case OcrEngine.TESSERACT_CLI:
-            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
-        case OcrEngine.TESSERACT:
-            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
-        case _:
-            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+    if ocr_engine == OcrEngine.EASYOCR:
+        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
+    elif ocr_engine == OcrEngine.TESSERACT_CLI:
+        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
+    elif ocr_engine == OcrEngine.TESSERACT:
+        ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
+    elif ocr_engine == OcrEngine.OCRMAC:
+        ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
+    else:
+        raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+    ocr_lang_list = _split_list(ocr_lang)
+    if ocr_lang_list is not None:
+        ocr_options.lang = ocr_lang_list
     pipeline_options = PdfPipelineOptions(
         do_ocr=ocr,
@@ -245,15 +280,14 @@ def convert(
     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
-    match pdf_backend:
-        case PdfBackend.DLPARSE_V1:
-            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
-        case PdfBackend.DLPARSE_V2:
-            backend = DoclingParseV2DocumentBackend
-        case PdfBackend.PYPDFIUM2:
-            backend = PyPdfiumDocumentBackend
-        case _:
-            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+    if pdf_backend == PdfBackend.DLPARSE_V1:
+        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+    elif pdf_backend == PdfBackend.DLPARSE_V2:
+        backend = DoclingParseV2DocumentBackend
+    elif pdf_backend == PdfBackend.PYPDFIUM2:
+        backend = PyPdfiumDocumentBackend
+    else:
+        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
     format_options: Dict[InputFormat, FormatOption] = {
         InputFormat.PDF: PdfFormatOption(
@@ -287,5 +321,7 @@ def convert(
     _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+click_app = typer.main.get_command(app)
 if __name__ == "__main__":
     app()

docling/datamodel/base_models.py CHANGED Viewed

@@ -32,6 +32,7 @@ class InputFormat(str, Enum):
     PDF = "pdf"
     ASCIIDOC = "asciidoc"
     MD = "md"
+    XLSX = "xlsx"
 class OutputFormat(str, Enum):
@@ -49,6 +50,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.HTML: ["html", "htm", "xhtml"],
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
+    InputFormat.XLSX: ["xlsx"],
 }
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -72,7 +74,11 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     InputFormat.PDF: ["application/pdf"],
     InputFormat.ASCIIDOC: ["text/asciidoc"],
     InputFormat.MD: ["text/markdown", "text/x-markdown"],
+    InputFormat.XLSX: [
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    ],
 }
 MimeTypeToFormat = {
     mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
 }

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
 class OcrOptions(BaseModel):
     kind: str
+    lang: List[str]
     force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
     bitmap_area_threshold: float = (
         0.05  # percentage of the area for a bitmap to processed with OCR
@@ -62,6 +63,17 @@ class TesseractOcrOptions(OcrOptions):
     )
+class OcrMacOptions(OcrOptions):
+    # Options for the `ocrmac` OCR engine.
+    # `kind` is the tag that selects this class in the `ocr_options` union.
+    kind: Literal["ocrmac"] = "ocrmac"
+    # Language preference handed to the engine (locale-style codes).
+    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
+    # Passed to the engine as its recognition level.
+    recognition: str = "accurate"
+    # Passed to the engine as its framework.
+    framework: str = "vision"
+    # pydantic config: fields not declared on the model are rejected.
+    model_config = ConfigDict(
+        extra="forbid",
+    )
 class PipelineOptions(BaseModel):
     create_legacy_output: bool = (
         True  # This defautl will be set to False on a future version of docling
@@ -74,11 +86,18 @@ class PdfPipelineOptions(PipelineOptions):
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
     table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
-        Field(EasyOcrOptions(), discriminator="kind")
-    )
+    ocr_options: Union[
+        EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
+    ] = Field(EasyOcrOptions(), discriminator="kind")
     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False
-    generate_table_images: bool = False
+    generate_table_images: bool = Field(
+        default=False,
+        deprecated=(
+            "Field `generate_table_images` is deprecated. "
+            "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
+            "before conversion and then use the `TableItem.get_image` function."
+        ),
+    )

docling/document_converter.py CHANGED Viewed

@@ -3,7 +3,7 @@ import sys
 import time
 from functools import partial
 from pathlib import Path
-from typing import Dict, Iterable, Iterator, List, Optional, Type
+from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
 from pydantic import BaseModel, ConfigDict, model_validator, validate_call
@@ -12,6 +12,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
@@ -44,6 +45,11 @@ class FormatOption(BaseModel):
         return self
+class ExcelFormatOption(FormatOption):
+    # Format option preset for XLSX input: the simple pipeline driven by the
+    # Excel backend.
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
 class WordFormatOption(FormatOption):
     pipeline_cls: Type = SimplePipeline
     backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
@@ -80,6 +86,9 @@ class ImageFormatOption(FormatOption):
 _format_to_default_options = {
+    InputFormat.XLSX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
+    ),
     InputFormat.DOCX: FormatOption(
         pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
     ),
@@ -146,7 +155,7 @@ class DocumentConverter:
     @validate_call(config=ConfigDict(strict=True))
     def convert(
         self,
-        source: Path | str | DocumentStream,  # TODO review naming
+        source: Union[Path, str, DocumentStream],  # TODO review naming
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
@@ -163,7 +172,7 @@ class DocumentConverter:
     @validate_call(config=ConfigDict(strict=True))
     def convert_all(
         self,
-        source: Iterable[Path | str | DocumentStream],  # TODO review naming
+        source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
@@ -174,7 +183,7 @@ class DocumentConverter:
         )
         conv_input = _DocumentConversionInput(
             path_or_stream_iterator=source,
-            limit=limits,
+            limits=limits,
         )
         conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
         for conv_res in conv_res_iter:

docling/models/ocr_mac_model.py ADDED Viewed

@@ -0,0 +1,118 @@
+import logging
+import tempfile
+from typing import Iterable, Optional, Tuple
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import OcrMacOptions
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
+_log = logging.getLogger(__name__)
+class OcrMacModel(BaseOcrModel):
+    """OCR model that runs the ``ocrmac`` package on the page regions needing OCR."""
+    def __init__(self, enabled: bool, options: OcrMacOptions):
+        """Store the options and, when enabled, bind the ``ocrmac.OCR`` entry point.
+        Raises:
+            ImportError: if the model is enabled and ``ocrmac`` cannot be imported.
+        """
+        super().__init__(enabled=enabled, options=options)
+        self.options: OcrMacOptions
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        if not self.enabled:
+            return
+        try:
+            from ocrmac import ocrmac
+        except ImportError:
+            raise ImportError(
+                "ocrmac is not correctly installed. "
+                "Please install it via `pip install ocrmac` to use this OCR engine. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation: "
+                "https://ds4sd.github.io/docling/installation/"
+            )
+        self.reader_RIL = ocrmac.OCR
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        """Yield every page of ``page_batch``, with OCR cells merged into valid pages."""
+        if not self.enabled:
+            yield from page_batch
+            return
+        for page in page_batch:
+            assert page._backend is not None
+            # Pages whose backend is invalid pass through untouched.
+            if not page._backend.is_valid():
+                yield page
+                continue
+            with TimeRecorder(conv_res, "ocr"):
+                ocr_rects = self.get_ocr_rects(page)
+                collected = []
+                for region in ocr_rects:
+                    # Skip zero area boxes
+                    if region.area() == 0:
+                        continue
+                    crop = page._backend.get_page_image(
+                        scale=self.scale, cropbox=region
+                    )
+                    # The engine is given a file name, so the crop is written to a
+                    # temporary PNG that only lives for the recognition call.
+                    with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as tmp:
+                        png_path = tmp.name
+                        crop.save(png_path)
+                        detections = self.reader_RIL(
+                            png_path,
+                            recognition_level=self.options.recognition,
+                            framework=self.options.framework,
+                            language_preference=self.options.lang,
+                        ).recognize()
+                    width_px, height_px = crop.size
+                    for ix, (text, confidence, box) in enumerate(detections):
+                        # box is read as (x, y, w, h) fractions of the crop size;
+                        # y is flipped (1 - y) to obtain top-left coordinates.
+                        x = float(box[0])
+                        y = float(box[1])
+                        w = float(box[2])
+                        h = float(box[3])
+                        x1 = x * width_px
+                        y2 = (1 - y) * height_px
+                        x2 = x1 + w * width_px
+                        y1 = y2 - h * height_px
+                        # Dividing by the render scale maps pixels back to page units.
+                        # NOTE(review): coordinates stay relative to the crop; confirm
+                        # whether the region offset should be added.
+                        collected.append(
+                            OcrCell(
+                                id=ix,
+                                text=text,
+                                confidence=confidence,
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(
+                                        x1 / self.scale,
+                                        y1 / self.scale,
+                                        x2 / self.scale,
+                                        y2 / self.scale,
+                                    ),
+                                    origin=CoordOrigin.TOPLEFT,
+                                ),
+                            )
+                        )
+                # Post-process the cells
+                page.cells = self.post_process_cells(collected, page.cells)
+            # DEBUG code:
+            if settings.debug.visualize_ocr:
+                self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+            yield page

docling/pipeline/standard_pdf_pipeline.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import logging
+import sys
 from pathlib import Path
 from typing import Optional
@@ -10,6 +11,7 @@ from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
+    OcrMacOptions,
     PdfPipelineOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
@@ -18,6 +20,7 @@ from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.ds_glm_model import GlmModel, GlmOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
+from docling.models.ocr_mac_model import OcrMacModel
 from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
 from docling.models.page_preprocessing_model import (
     PagePreprocessingModel,
@@ -118,6 +121,15 @@ class StandardPdfPipeline(PaginatedPipeline):
                 enabled=self.pipeline_options.do_ocr,
                 options=self.pipeline_options.ocr_options,
             )
+        elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
+            if "darwin" != sys.platform:
+                raise RuntimeError(
+                    f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
+                )
+            return OcrMacModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
         return None
     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:

{docling-2.5.2.dist-info → docling-2.7.0.dist-info}/METADATA RENAMED Viewed

@@ -1,13 +1,13 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.5.2
+Version: 2.7.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
 Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
 Author: Christoph Auer
 Author-email: cau@zurich.ibm.com
-Requires-Python: >=3.10,<4.0
+Requires-Python: >=3.9,<4.0
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -15,21 +15,25 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: MacOS :: MacOS X
 Classifier: Operating System :: POSIX :: Linux
 Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Provides-Extra: ocrmac
 Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
-Requires-Dist: docling-core (>=2.3.0,<3.0.0)
-Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
-Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
+Requires-Dist: docling-core (>=2.4.0,<3.0.0)
+Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
+Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
 Requires-Dist: marko (>=2.1.2,<3.0.0)
+Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
+Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
 Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
@@ -39,7 +43,7 @@ Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
 Requires-Dist: requests (>=2.32.3,<3.0.0)
 Requires-Dist: rtree (>=1.3.0,<2.0.0)
-Requires-Dist: scipy (>=1.14.1,<2.0.0)
+Requires-Dist: scipy (>=1.6.0,<2.0.0)
 Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
 Requires-Dist: typer (>=0.12.5,<0.13.0)
 Project-URL: Repository, https://github.com/DS4SD/docling
@@ -60,7 +64,7 @@ Description-Content-Type: text/markdown
 [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
 [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
 [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
-![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
 [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)

{docling-2.5.2.dist-info → docling-2.7.0.dist-info}/RECORD RENAMED Viewed

@@ -6,24 +6,26 @@ docling/backend/docling_parse_backend.py,sha256=csWy6ZGxDuZfNr0YTrUU40DXqelN_TJk
 docling/backend/docling_parse_v2_backend.py,sha256=gUr9_fwHbkj238oYQPJ9AxpjFL2jGvhjBlBQPblmSAg,8589
 docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaodAQ,15593
 docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
-docling/backend/mspowerpoint_backend.py,sha256=YaVJc6RXWmM1EPTp0TzAiXpGxu6K-MZdPNsmR_64LSg,15358
-docling/backend/msword_backend.py,sha256=IEqGz-lUrQw0tgBly_gv_mYGC0X0iNnGhkwnDWaDtBY,17341
+docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
+docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
+docling/backend/msword_backend.py,sha256=-cCEh4EhdGknHrxiVGFE4GDo_iYpAqP2QxRaeqrJHUE,17939
 docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
 docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=7stF4dMjGVp5R0Gvcawm21rff5RbEQnWj8ZzoAHvV9k,9619
+docling/cli/main.py,sha256=MpjbAXhOlbGnAnl5_OaKCdub61YPQBy1NOqroXQtNYE,10722
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=fmkS6iTxGZCTtNCo2zsgMmBC11Ogf2Ht-mNIlZ9GP-o,5375
+docling/datamodel/base_models.py,sha256=6qlwPamDZ3XUsE2kTAyGKG6O2IJClVjCqaE7DZ74KHU,5533
 docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
-docling/datamodel/pipeline_options.py,sha256=-PXwqkdwSpWjIMCxyqwB8Q453szVNR1zVM-7d0PAOWQ,2530
+docling/datamodel/pipeline_options.py,sha256=aC_CmtEhNLIbn9n3JuYhL_aA8UA0vFgw7HcGMUuOI4o,3117
 docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
-docling/document_converter.py,sha256=U52_rZQDm2wzrnsuUrvsfX2MnmOWFFhjBzfS8tEvt6Y,10595
+docling/document_converter.py,sha256=L0A3g7IQBaKIK7dWpUFC72ZqKywIPYkyh71Qd6DiNPE,10940
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
 docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
 docling/models/ds_glm_model.py,sha256=hBRCx6oFGhxBbKEJlRSWVndDwFtB5IpeLOowFAVqFM0,12033
 docling/models/easyocr_model.py,sha256=c2m4x9dZpSc-cMgeEdFBRVBlB78uMGlYD8Q_2gzRuMU,3734
 docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
+docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
 docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
 docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
 docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
@@ -32,14 +34,14 @@ docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUs
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
 docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
-docling/pipeline/standard_pdf_pipeline.py,sha256=h59eA0CLMYuuJoH-0SyCRkYEregNs6i0pa46Ioqf8kU,7947
+docling/pipeline/standard_pdf_pipeline.py,sha256=btm_y1ZsjUrtWvMbF6RA8BVM0ENrK4z_rqF0jjdeZmU,8473
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
 docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-2.5.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.5.2.dist-info/METADATA,sha256=oEAVaoncnXpewHqwn3rbOuszNifzG8s-TtWxhcnufzs,6530
-docling-2.5.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-2.5.2.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.5.2.dist-info/RECORD,,
+docling-2.7.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.7.0.dist-info/METADATA,sha256=6cpEQMbjK1tKCQ3kkzeOD7URm41HPx2xUSs-gxvlsM4,6761
+docling-2.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.7.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.7.0.dist-info/RECORD,,

{docling-2.5.2.dist-info → docling-2.7.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-2.5.2.dist-info → docling-2.7.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.5.2.dist-info → docling-2.7.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

docling-2.5.2-py3-none-any.whl → docling-2.7.0-py3-none-any.whl

docling-2.5.2-py3-none-any.whl → docling-2.7.0-py3-none-any.whl