PyPI - docling - Versions diffs - 2.55.1__tar.gz → 2.56.1__tar.gz - Mend

docling 2.55.1__tar.gz → 2.56.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling might be problematic. Click here for more details.

Files changed (148) hide show

{docling-2.55.1 → docling-2.56.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.55.1
+Version: 2.56.1
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -34,7 +34,8 @@ Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
 Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
 Requires-Dist: huggingface_hub<1,>=0.23
 Requires-Dist: requests<3.0.0,>=2.32.2
-Requires-Dist: easyocr<2.0,>=1.7
+Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin"
+Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14"
 Requires-Dist: certifi>=2024.7.4
 Requires-Dist: rtree<2.0.0,>=1.3.0
 Requires-Dist: typer<0.20.0,>=0.12.5
@@ -52,6 +53,8 @@ Requires-Dist: pylatexenc<3.0,>=2.10
 Requires-Dist: scipy<2.0.0,>=1.6.0
 Requires-Dist: accelerate<2,>=1.0.0
 Requires-Dist: polyfactory>=2.22.2
+Provides-Extra: easyocr
+Requires-Dist: easyocr<2.0,>=1.7; extra == "easyocr"
 Provides-Extra: tesserocr
 Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
 Provides-Extra: ocrmac
@@ -65,7 +68,6 @@ Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
 Provides-Extra: rapidocr
 Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
-Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
 Provides-Extra: asr
 Requires-Dist: openai-whisper>=20250625; extra == "asr"
 Dynamic: license-file

{docling-2.55.1 → docling-2.56.1}/docling/backend/html_backend.py RENAMED Viewed

@@ -272,9 +272,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         for br in content("br"):
             br.replace_with(NavigableString("\n"))
         # set default content layer
-        headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
+        # Furniture before the first heading rule, except for headers in tables
+        header = None
+        # Find all headers first
+        all_headers = content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
+        # Keep only those that do NOT have a <table> in a parent chain
+        clean_headers = [h for h in all_headers if not h.find_parent("table")]
+        # Pick the first header from the remaining
+        if len(clean_headers):
+            header = clean_headers[0]
+        # Set starting content layer
         self.content_layer = (
-            ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
+            ContentLayer.BODY if header is None else ContentLayer.FURNITURE
         )
         # reset context
         self.ctx = _Context()
@@ -309,9 +319,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         group_name: str,
         doc: DoclingDocument,
         docling_table: TableItem,
-    ) -> tuple[bool, RefItem]:
+    ) -> tuple[bool, Union[RefItem, None]]:
         rich_table_cell = False
-        ref_for_rich_cell = provs_in_cell[0]
+        ref_for_rich_cell = None
+        if len(provs_in_cell) > 0:
+            ref_for_rich_cell = provs_in_cell[0]
         if len(provs_in_cell) > 1:
             # Cell has multiple elements, we need to group them
             rich_table_cell = True
@@ -324,7 +336,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             if isinstance(pr_item, TextItem):
                 # Cell has only one element and it's just a text
                 rich_table_cell = False
-                doc.delete_items(node_items=[pr_item])
+                try:
+                    doc.delete_items(node_items=[pr_item])
+                except Exception as e:
+                    _log.error(f"Error while making rich table: {e}.")
             else:
                 rich_table_cell = True
                 ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
@@ -391,17 +406,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 provs_in_cell: list[RefItem] = []
                 # Parse table cell sub-tree for Rich Cells content:
+                table_level = self.level
                 provs_in_cell = self._walk(html_cell, doc)
+                # After walking sub-tree in cell, restore previously set level
+                self.level = table_level
                 rich_table_cell = False
                 ref_for_rich_cell = None
-                if len(provs_in_cell) > 0:
-                    group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
-                    rich_table_cell, ref_for_rich_cell = (
-                        HTMLDocumentBackend.process_rich_table_cells(
-                            provs_in_cell, group_name, doc, docling_table
-                        )
+                group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
+                rich_table_cell, ref_for_rich_cell = (
+                    HTMLDocumentBackend.process_rich_table_cells(
+                        provs_in_cell, group_name, doc, docling_table
                     )
+                )
                 # Extracting text
                 text = self.get_text(html_cell).strip()
@@ -774,13 +791,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             for key in self.parents.keys():
                 self.parents[key] = None
             self.level = 0
-            docling_title = self.parents[self.level + 1] = doc.add_title(
+            self.parents[self.level + 1] = doc.add_title(
                 text_clean,
                 content_layer=self.content_layer,
                 formatting=annotated_text.formatting,
                 hyperlink=annotated_text.hyperlink,
             )
-            added_ref = [docling_title.get_ref()]
+            p1 = self.parents[self.level + 1]
+            if p1 is not None:
+                added_ref = [p1.get_ref()]
         # the other levels need to be lowered by 1 if a title was set
         else:
             level -= 1
@@ -802,7 +821,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         _log.debug(f"Remove the tail of level {key}")
                         self.parents[key] = None
                 self.level = level
-            docling_heading = self.parents[self.level + 1] = doc.add_heading(
+            self.parents[self.level + 1] = doc.add_heading(
                 parent=self.parents[self.level],
                 text=text_clean,
                 orig=annotated_text.text,
@@ -811,7 +830,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 formatting=annotated_text.formatting,
                 hyperlink=annotated_text.hyperlink,
             )
-            added_ref = [docling_heading.get_ref()]
+            p2 = self.parents[self.level + 1]
+            if p2 is not None:
+                added_ref = [p2.get_ref()]
         self.level += 1
         for img_tag in tag("img"):
             if isinstance(img_tag, Tag):

{docling-2.55.1 → docling-2.56.1}/docling/backend/msexcel_backend.py RENAMED Viewed

@@ -18,6 +18,7 @@ from docling_core.types.doc import (
     TableData,
 )
 from openpyxl import load_workbook
+from openpyxl.chartsheet.chartsheet import Chartsheet
 from openpyxl.drawing.image import Image
 from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
 from openpyxl.worksheet.worksheet import Worksheet
@@ -186,18 +187,18 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         if self.workbook is not None:
             # Iterate over all sheets
-            for sheet_name in self.workbook.sheetnames:
-                _log.info(f"Processing sheet: {sheet_name}")
+            for idx, name in enumerate(self.workbook.sheetnames):
+                _log.info(f"Processing sheet {idx}: {name}")
-                sheet = self.workbook[sheet_name]
-                page_no = self.workbook.index(sheet) + 1
+                sheet = self.workbook[name]
+                page_no = idx + 1
                 # do not rely on sheet.max_column, sheet.max_row if there are images
                 page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
                 self.parents[0] = doc.add_group(
                     parent=None,
                     label=GroupLabel.SECTION,
-                    name=f"sheet: {sheet_name}",
+                    name=f"sheet: {name}",
                     content_layer=self._get_sheet_content_layer(sheet),
                 )
                 doc = self._convert_sheet(doc, sheet)
@@ -208,7 +209,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         return doc
-    def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
+    def _convert_sheet(
+        self, doc: DoclingDocument, sheet: Union[Worksheet, Chartsheet]
+    ) -> DoclingDocument:
         """Parse an Excel worksheet and attach its structure to a DoclingDocument
         Args:
@@ -218,10 +221,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         Returns:
             The updated DoclingDocument.
         """
+        if isinstance(sheet, Worksheet):
+            doc = self._find_tables_in_sheet(doc, sheet)
+            doc = self._find_images_in_sheet(doc, sheet)
-        doc = self._find_tables_in_sheet(doc, sheet)
-        doc = self._find_images_in_sheet(doc, sheet)
+        # TODO: parse charts in sheet
         return doc

{docling-2.55.1 → docling-2.56.1}/docling/cli/main.py RENAMED Viewed

@@ -49,7 +49,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     AsrPipelineOptions,
     ConvertPipelineOptions,
-    EasyOcrOptions,
+    OcrAutoOptions,
     OcrOptions,
     PaginatedPipelineOptions,
     PdfBackend,
@@ -57,6 +57,8 @@ from docling.datamodel.pipeline_options import (
     PipelineOptions,
     ProcessingPipeline,
     TableFormerMode,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
     VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
@@ -372,7 +374,7 @@ def convert(  # noqa: C901
                 f"Use the option --show-external-plugins to see the options allowed with external plugins."
             ),
         ),
-    ] = EasyOcrOptions.kind,
+    ] = OcrAutoOptions.kind,
     ocr_lang: Annotated[
         Optional[str],
         typer.Option(
@@ -380,6 +382,13 @@ def convert(  # noqa: C901
             help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
         ),
     ] = None,
+    psm: Annotated[
+        Optional[int],
+        typer.Option(
+            ...,
+            help="Page Segmentation Mode for the OCR engine (0-13).",
+        ),
+    ] = None,
     pdf_backend: Annotated[
         PdfBackend, typer.Option(..., help="The PDF backend to use.")
     ] = PdfBackend.DLPARSE_V2,
@@ -547,13 +556,25 @@ def convert(  # noqa: C901
                     if local_path.exists() and local_path.is_dir():
                         for fmt in from_formats:
                             for ext in FormatToExtensions[fmt]:
-                                input_doc_paths.extend(
-                                    list(local_path.glob(f"**/*.{ext}"))
-                                )
-                                input_doc_paths.extend(
-                                    list(local_path.glob(f"**/*.{ext.upper()}"))
-                                )
+                                for path in local_path.glob(f"**/*.{ext}"):
+                                    if path.name.startswith("~$") and ext == "docx":
+                                        _log.info(
+                                            f"Ignoring temporary Word file: {path}"
+                                        )
+                                        continue
+                                    input_doc_paths.append(path)
+                                for path in local_path.glob(f"**/*.{ext.upper()}"):
+                                    if path.name.startswith("~$") and ext == "docx":
+                                        _log.info(
+                                            f"Ignoring temporary Word file: {path}"
+                                        )
+                                        continue
+                                    input_doc_paths.append(path)
                     elif local_path.exists():
+                        if not local_path.name.startswith("~$") and ext == "docx":
+                            _log.info(f"Ignoring temporary Word file: {path}")
+                            continue
                         input_doc_paths.append(local_path)
                     else:
                         err_console.print(
@@ -584,6 +605,10 @@ def convert(  # noqa: C901
         ocr_lang_list = _split_list(ocr_lang)
         if ocr_lang_list is not None:
             ocr_options.lang = ocr_lang_list
+        if psm is not None and isinstance(
+            ocr_options, (TesseractOcrOptions, TesseractCliOcrOptions)
+        ):
+            ocr_options.psm = psm
         accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
         # pipeline_options: PaginatedPipelineOptions

{docling-2.55.1 → docling-2.56.1}/docling/cli/models.py RENAMED Viewed

@@ -38,6 +38,7 @@ class _AvailableModels(str, Enum):
     SMOLDOCLING = "smoldocling"
     SMOLDOCLING_MLX = "smoldocling_mlx"
     GRANITE_VISION = "granite_vision"
+    RAPIDOCR = "rapidocr"
     EASYOCR = "easyocr"
@@ -46,7 +47,7 @@ _default_models = [
     _AvailableModels.TABLEFORMER,
     _AvailableModels.CODE_FORMULA,
     _AvailableModels.PICTURE_CLASSIFIER,
-    _AvailableModels.EASYOCR,
+    _AvailableModels.RAPIDOCR,
 ]
@@ -115,6 +116,7 @@ def download(
         with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
         with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
         with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
+        with_rapidocr=_AvailableModels.RAPIDOCR in to_download,
         with_easyocr=_AvailableModels.EASYOCR in to_download,
     )

{docling-2.55.1 → docling-2.56.1}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -81,6 +81,13 @@ class OcrOptions(BaseOptions):
     )
+class OcrAutoOptions(OcrOptions):
+    """Options for pick OCR engine automatically."""
+    kind: ClassVar[Literal["auto"]] = "auto"
+    lang: List[str] = []
 class RapidOcrOptions(OcrOptions):
     """Options for the RapidOCR engine."""
@@ -154,6 +161,9 @@ class TesseractCliOcrOptions(OcrOptions):
     lang: List[str] = ["fra", "deu", "spa", "eng"]
     tesseract_cmd: str = "tesseract"
     path: Optional[str] = None
+    psm: Optional[int] = (
+        None  # Page Segmentation Mode (0-13), defaults to tesseract's default
+    )
     model_config = ConfigDict(
         extra="forbid",
@@ -166,6 +176,9 @@ class TesseractOcrOptions(OcrOptions):
     kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
     lang: List[str] = ["fra", "deu", "spa", "eng"]
     path: Optional[str] = None
+    psm: Optional[int] = (
+        None  # Page Segmentation Mode (0-13), defaults to tesseract's default
+    )
     model_config = ConfigDict(
         extra="forbid",
@@ -249,6 +262,7 @@ class PdfBackend(str, Enum):
 class OcrEngine(str, Enum):
     """Enum of valid OCR engines."""
+    AUTO = "auto"
     EASYOCR = "easyocr"
     TESSERACT_CLI = "tesseract_cli"
     TESSERACT = "tesseract"
@@ -330,7 +344,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     # If True, text from backend will be used instead of generated text
     table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: OcrOptions = EasyOcrOptions()
+    ocr_options: OcrOptions = OcrAutoOptions()
     layout_options: LayoutOptions = LayoutOptions()
     images_scale: float = 1.0

docling-2.56.1/docling/models/auto_ocr_model.py ADDED Viewed

@@ -0,0 +1,132 @@
+import logging
+import sys
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Optional, Type
+from docling.datamodel.accelerator_options import AcceleratorOptions
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrAutoOptions,
+    OcrMacOptions,
+    OcrOptions,
+    RapidOcrOptions,
+)
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.models.easyocr_model import EasyOcrModel
+from docling.models.ocr_mac_model import OcrMacModel
+from docling.models.rapid_ocr_model import RapidOcrModel
+_log = logging.getLogger(__name__)
+class OcrAutoModel(BaseOcrModel):
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        options: OcrAutoOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        super().__init__(
+            enabled=enabled,
+            artifacts_path=artifacts_path,
+            options=options,
+            accelerator_options=accelerator_options,
+        )
+        self.options: OcrAutoOptions
+        self._engine: Optional[BaseOcrModel] = None
+        if self.enabled:
+            if "darwin" == sys.platform:
+                try:
+                    from ocrmac import ocrmac
+                    self._engine = OcrMacModel(
+                        enabled=self.enabled,
+                        artifacts_path=artifacts_path,
+                        options=OcrMacOptions(
+                            bitmap_area_threshold=self.options.bitmap_area_threshold,
+                            force_full_page_ocr=self.options.force_full_page_ocr,
+                        ),
+                        accelerator_options=accelerator_options,
+                    )
+                    _log.info("Auto OCR model selected ocrmac.")
+                except ImportError:
+                    _log.info("ocrmac cannot be used because ocrmac is not installed.")
+            if self._engine is None:
+                try:
+                    import onnxruntime
+                    from rapidocr import EngineType, RapidOCR  # type: ignore
+                    self._engine = RapidOcrModel(
+                        enabled=self.enabled,
+                        artifacts_path=artifacts_path,
+                        options=RapidOcrOptions(
+                            backend="onnxruntime",
+                            bitmap_area_threshold=self.options.bitmap_area_threshold,
+                            force_full_page_ocr=self.options.force_full_page_ocr,
+                        ),
+                        accelerator_options=accelerator_options,
+                    )
+                    _log.info("Auto OCR model selected rapidocr with onnxruntime.")
+                except ImportError:
+                    _log.info(
+                        "rapidocr cannot be used because onnxruntime is not installed."
+                    )
+            if self._engine is None:
+                try:
+                    import easyocr
+                    self._engine = EasyOcrModel(
+                        enabled=self.enabled,
+                        artifacts_path=artifacts_path,
+                        options=EasyOcrOptions(
+                            bitmap_area_threshold=self.options.bitmap_area_threshold,
+                            force_full_page_ocr=self.options.force_full_page_ocr,
+                        ),
+                        accelerator_options=accelerator_options,
+                    )
+                    _log.info("Auto OCR model selected easyocr.")
+                except ImportError:
+                    _log.info("easyocr cannot be used because it is not installed.")
+            if self._engine is None:
+                try:
+                    import torch
+                    from rapidocr import EngineType, RapidOCR  # type: ignore
+                    self._engine = RapidOcrModel(
+                        enabled=self.enabled,
+                        artifacts_path=artifacts_path,
+                        options=RapidOcrOptions(
+                            backend="torch",
+                            bitmap_area_threshold=self.options.bitmap_area_threshold,
+                            force_full_page_ocr=self.options.force_full_page_ocr,
+                        ),
+                        accelerator_options=accelerator_options,
+                    )
+                    _log.info("Auto OCR model selected rapidocr with torch.")
+                except ImportError:
+                    _log.info(
+                        "rapidocr cannot be used because rapidocr or torch is not installed."
+                    )
+            if self._engine is None:
+                _log.warning("No OCR engine found. Please review the install details.")
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        if not self.enabled or self._engine is None:
+            yield from page_batch
+            return
+        yield from self._engine(conv_res, page_batch)
+    @classmethod
+    def get_options_type(cls) -> Type[OcrOptions]:
+        return OcrAutoOptions

{docling-2.55.1 → docling-2.56.1}/docling/models/base_model.py RENAMED Viewed

@@ -173,11 +173,11 @@ class BaseItemAndImageEnrichmentModel(
         assert isinstance(element, DocItem)
         # Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
-        if len(element.prov) == 0 and isinstance(element, PictureItem):
+        if isinstance(element, PictureItem):
             embedded_im = element.get_image(conv_res.document)
             if embedded_im is not None:
                 return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
-            else:
+            elif len(element.prov) == 0:
                 return None
         # Crop the image form the page

{docling-2.55.1 → docling-2.56.1}/docling/models/plugins/defaults.py RENAMED Viewed

@@ -1,4 +1,5 @@
 def ocr_engines():
+    from docling.models.auto_ocr_model import OcrAutoModel
     from docling.models.easyocr_model import EasyOcrModel
     from docling.models.ocr_mac_model import OcrMacModel
     from docling.models.rapid_ocr_model import RapidOcrModel
@@ -7,6 +8,7 @@ def ocr_engines():
     return {
         "ocr_engines": [
+            OcrAutoModel,
             EasyOcrModel,
             OcrMacModel,
             RapidOcrModel,

docling 2.55.1__tar.gz → 2.56.1__tar.gz

Potentially problematic release.

docling 2.55.1__tar.gz → 2.56.1__tar.gz