PyPI - docling - Versions diffs - 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl - Mend

docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

docling/backend/asciidoc_backend.py +39 -18
docling/backend/docling_parse_backend.py +61 -59
docling/backend/docling_parse_v2_backend.py +72 -62
docling/backend/docling_parse_v4_backend.py +21 -19
docling/backend/md_backend.py +101 -81
docling/backend/mspowerpoint_backend.py +72 -113
docling/backend/msword_backend.py +99 -80
docling/backend/noop_backend.py +51 -0
docling/backend/pypdfium2_backend.py +127 -53
docling/cli/main.py +82 -14
docling/datamodel/asr_model_specs.py +92 -0
docling/datamodel/base_models.py +21 -4
docling/datamodel/document.py +3 -1
docling/datamodel/pipeline_options.py +15 -2
docling/datamodel/pipeline_options_asr_model.py +57 -0
docling/datamodel/pipeline_options_vlm_model.py +4 -4
docling/document_converter.py +8 -0
docling/models/api_vlm_model.py +3 -1
docling/models/base_model.py +1 -1
docling/models/base_ocr_model.py +33 -11
docling/models/easyocr_model.py +1 -1
docling/models/layout_model.py +2 -3
docling/models/ocr_mac_model.py +1 -1
docling/models/page_preprocessing_model.py +3 -6
docling/models/rapid_ocr_model.py +1 -1
docling/models/readingorder_model.py +3 -3
docling/models/tesseract_ocr_cli_model.py +4 -3
docling/models/tesseract_ocr_model.py +1 -1
docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
docling/models/vlm_models_inline/mlx_model.py +3 -1
docling/pipeline/asr_pipeline.py +253 -0
docling/pipeline/base_pipeline.py +11 -0
docling/pipeline/standard_pdf_pipeline.py +0 -1
docling/utils/layout_postprocessor.py +11 -6
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
{docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0

docling/cli/main.py CHANGED Viewed

@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_BASE,
+    WHISPER_LARGE,
+    WHISPER_MEDIUM,
+    WHISPER_SMALL,
+    WHISPER_TINY,
+    WHISPER_TURBO,
+    AsrModelType,
+)
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
     PdfBackend,
-    PdfPipeline,
     PdfPipelineOptions,
+    PipelineOptions,
+    ProcessingPipeline,
     TableFormerMode,
     VlmPipelineOptions,
 )
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
     SMOLDOCLING_TRANSFORMERS,
     VlmModelType,
 )
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.document_converter import (
+    AudioFormatOption,
+    DocumentConverter,
+    FormatOption,
+    PdfFormatOption,
+)
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.vlm_pipeline import VlmPipeline
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +313,17 @@ def convert(  # noqa: C901
         ),
     ] = ImageRefMode.EMBEDDED,
     pipeline: Annotated[
-        PdfPipeline,
+        ProcessingPipeline,
         typer.Option(..., help="Choose the pipeline to process PDF or image files."),
-    ] = PdfPipeline.STANDARD,
+    ] = ProcessingPipeline.STANDARD,
     vlm_model: Annotated[
         VlmModelType,
         typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
     ] = VlmModelType.SMOLDOCLING,
+    asr_model: Annotated[
+        AsrModelType,
+        typer.Option(..., help="Choose the ASR model to use with audio/video files."),
+    ] = AsrModelType.WHISPER_TINY,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -450,12 +471,14 @@ def convert(  # noqa: C901
         ),
     ] = None,
 ):
+    log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
     if verbose == 0:
-        logging.basicConfig(level=logging.WARNING)
+        logging.basicConfig(level=logging.WARNING, format=log_format)
     elif verbose == 1:
-        logging.basicConfig(level=logging.INFO)
+        logging.basicConfig(level=logging.INFO, format=log_format)
     else:
-        logging.basicConfig(level=logging.DEBUG)
+        logging.basicConfig(level=logging.DEBUG, format=log_format)
     settings.debug.visualize_cells = debug_visualize_cells
     settings.debug.visualize_layout = debug_visualize_layout
@@ -530,9 +553,12 @@ def convert(  # noqa: C901
             ocr_options.lang = ocr_lang_list
         accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-        pipeline_options: PaginatedPipelineOptions
+        # pipeline_options: PaginatedPipelineOptions
+        pipeline_options: PipelineOptions
+        format_options: Dict[InputFormat, FormatOption] = {}
-        if pipeline == PdfPipeline.STANDARD:
+        if pipeline == ProcessingPipeline.STANDARD:
             pipeline_options = PdfPipelineOptions(
                 allow_external_plugins=allow_external_plugins,
                 enable_remote_services=enable_remote_services,
@@ -574,7 +600,13 @@ def convert(  # noqa: C901
                 pipeline_options=pipeline_options,
                 backend=backend,  # pdf_backend
             )
-        elif pipeline == PdfPipeline.VLM:
+            format_options = {
+                InputFormat.PDF: pdf_format_option,
+                InputFormat.IMAGE: pdf_format_option,
+            }
+        elif pipeline == ProcessingPipeline.VLM:
             pipeline_options = VlmPipelineOptions(
                 enable_remote_services=enable_remote_services,
             )
@@ -600,13 +632,48 @@ def convert(  # noqa: C901
                 pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
             )
+            format_options = {
+                InputFormat.PDF: pdf_format_option,
+                InputFormat.IMAGE: pdf_format_option,
+            }
+        elif pipeline == ProcessingPipeline.ASR:
+            pipeline_options = AsrPipelineOptions(
+                # enable_remote_services=enable_remote_services,
+                # artifacts_path = artifacts_path
+            )
+            if asr_model == AsrModelType.WHISPER_TINY:
+                pipeline_options.asr_options = WHISPER_TINY
+            elif asr_model == AsrModelType.WHISPER_SMALL:
+                pipeline_options.asr_options = WHISPER_SMALL
+            elif asr_model == AsrModelType.WHISPER_MEDIUM:
+                pipeline_options.asr_options = WHISPER_MEDIUM
+            elif asr_model == AsrModelType.WHISPER_BASE:
+                pipeline_options.asr_options = WHISPER_BASE
+            elif asr_model == AsrModelType.WHISPER_LARGE:
+                pipeline_options.asr_options = WHISPER_LARGE
+            elif asr_model == AsrModelType.WHISPER_TURBO:
+                pipeline_options.asr_options = WHISPER_TURBO
+            else:
+                _log.error(f"{asr_model} is not known")
+                raise ValueError(f"{asr_model} is not known")
+            _log.info(f"pipeline_options: {pipeline_options}")
+            audio_format_option = AudioFormatOption(
+                pipeline_cls=AsrPipeline,
+                pipeline_options=pipeline_options,
+            )
+            format_options = {
+                InputFormat.AUDIO: audio_format_option,
+            }
         if artifacts_path is not None:
             pipeline_options.artifacts_path = artifacts_path
+            # audio_pipeline_options.artifacts_path = artifacts_path
-        format_options: Dict[InputFormat, FormatOption] = {
-            InputFormat.PDF: pdf_format_option,
-            InputFormat.IMAGE: pdf_format_option,
-        }
         doc_converter = DocumentConverter(
             allowed_formats=from_formats,
             format_options=format_options,
@@ -614,6 +681,7 @@ def convert(  # noqa: C901
         start_time = time.time()
+        _log.info(f"paths: {input_doc_paths}")
         conv_results = doc_converter.convert_all(
             input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
         )

docling/datamodel/asr_model_specs.py ADDED Viewed

@@ -0,0 +1,92 @@
+import logging
+from enum import Enum
+from pydantic import (
+    AnyUrl,
+)
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_asr_model import (
+    # AsrResponseFormat,
+    # ApiAsrOptions,
+    InferenceAsrFramework,
+    InlineAsrNativeWhisperOptions,
+    TransformersModelType,
+)
+_log = logging.getLogger(__name__)
+WHISPER_TINY = InlineAsrNativeWhisperOptions(
+    repo_id="tiny",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+WHISPER_SMALL = InlineAsrNativeWhisperOptions(
+    repo_id="small",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
+    repo_id="medium",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+WHISPER_BASE = InlineAsrNativeWhisperOptions(
+    repo_id="base",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+WHISPER_LARGE = InlineAsrNativeWhisperOptions(
+    repo_id="large",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+WHISPER_TURBO = InlineAsrNativeWhisperOptions(
+    repo_id="turbo",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+class AsrModelType(str, Enum):
+    WHISPER_TINY = "whisper_tiny"
+    WHISPER_SMALL = "whisper_small"
+    WHISPER_MEDIUM = "whisper_medium"
+    WHISPER_BASE = "whisper_base"
+    WHISPER_LARGE = "whisper_large"
+    WHISPER_TURBO = "whisper_turbo"

docling/datamodel/base_models.py CHANGED Viewed

@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
     JSON_DOCLING = "json_docling"
+    AUDIO = "audio"
 class OutputFormat(str, Enum):
@@ -67,12 +68,13 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.MD: ["md"],
     InputFormat.HTML: ["html", "htm", "xhtml"],
     InputFormat.XML_JATS: ["xml", "nxml"],
-    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
+    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.CSV: ["csv"],
-    InputFormat.XLSX: ["xlsx"],
+    InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.JSON_DOCLING: ["json"],
+    InputFormat.AUDIO: ["wav", "mp3"],
 }
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.JSON_DOCLING: ["application/json"],
+    InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }
 MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -232,7 +235,6 @@ class Page(BaseModel):
     page_no: int
     # page_hash: Optional[str] = None
     size: Optional[Size] = None
-    cells: List[TextCell] = []
     parsed_page: Optional[SegmentedPdfPage] = None
     predictions: PagePredictions = PagePredictions()
     assembled: Optional[AssembledUnit] = None
@@ -245,12 +247,27 @@ class Page(BaseModel):
         float, Image
     ] = {}  # Cache of images in different scales. By default it is cleared during assembling.
+    @property
+    def cells(self) -> List[TextCell]:
+        """Return text cells as a read-only view of parsed_page.textline_cells."""
+        if self.parsed_page is not None:
+            return self.parsed_page.textline_cells
+        else:
+            return []
     def get_image(
-        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
+        self,
+        scale: float = 1.0,
+        max_size: Optional[int] = None,
+        cropbox: Optional[BoundingBox] = None,
     ) -> Optional[Image]:
         if self._backend is None:
             return self._image_cache.get(scale, None)
+        if max_size:
+            assert self.size is not None
+            scale = min(scale, max_size / max(self.size.as_tuple()))
         if scale not in self._image_cache:
             if cropbox is None:
                 self._image_cache[scale] = self._backend.get_page_image(scale=scale)

docling/datamodel/document.py CHANGED Viewed

@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
             backend: Type[AbstractDocumentBackend]
             if format not in format_options.keys():
                 _log.error(
-                    f"Input document {obj.name} does not match any allowed format."
+                    f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
                 )
                 backend = _DummyBackend
             else:
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
+        _log.info(f"detected formats: {formats}")
         if formats:
             if len(formats) == 1 and mime not in ("text/plain"):
                 return formats[0]

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -11,8 +11,13 @@ from pydantic import (
 )
 from typing_extensions import deprecated
+from docling.datamodel import asr_model_specs
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrOptions,
+)
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )
+class AsrPipelineOptions(PipelineOptions):
+    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
+    artifacts_path: Optional[Union[Path, str]] = None
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""
@@ -292,9 +302,12 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         ),
     )
-    generate_parsed_pages: bool = False
+    generate_parsed_pages: Literal[True] = (
+        True  # Always True since parsed_page is now mandatory
+    )
-class PdfPipeline(str, Enum):
+class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
+    ASR = "asr"

docling/datamodel/pipeline_options_asr_model.py ADDED Viewed

@@ -0,0 +1,57 @@
+from enum import Enum
+from typing import Any, Dict, List, Literal, Optional, Union
+from pydantic import AnyUrl, BaseModel
+from typing_extensions import deprecated
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_vlm_model import (
+    # InferenceFramework,
+    TransformersModelType,
+)
+class BaseAsrOptions(BaseModel):
+    kind: str
+    # prompt: str
+class InferenceAsrFramework(str, Enum):
+    # MLX = "mlx" # disabled for now
+    # TRANSFORMERS = "transformers" # disabled for now
+    WHISPER = "whisper"
+class InlineAsrOptions(BaseAsrOptions):
+    kind: Literal["inline_model_options"] = "inline_model_options"
+    repo_id: str
+    verbose: bool = False
+    timestamps: bool = True
+    temperature: float = 0.0
+    max_new_tokens: int = 256
+    max_time_chunk: float = 30.0
+    torch_dtype: Optional[str] = None
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+class InlineAsrNativeWhisperOptions(InlineAsrOptions):
+    inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER
+    language: str = "en"
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+    ]
+    word_timestamps: bool = True

docling/datamodel/pipeline_options_vlm_model.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal
+from typing import Any, Dict, List, Literal, Optional, Union
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
@@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 class BaseVlmOptions(BaseModel):
     kind: str
     prompt: str
+    scale: float = 2.0
+    max_size: Optional[int] = None
 class ResponseFormat(str, Enum):
@@ -42,14 +44,13 @@ class InlineVlmOptions(BaseVlmOptions):
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
     response_format: ResponseFormat
+    torch_dtype: Optional[str] = None
     supported_devices: List[AcceleratorDevice] = [
         AcceleratorDevice.CPU,
         AcceleratorDevice.CUDA,
         AcceleratorDevice.MPS,
     ]
-    scale: float = 2.0
     temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
@@ -75,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
     )  # Default to ollama
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
-    scale: float = 2.0
     timeout: float = 60
     concurrency: int = 1
     response_format: ResponseFormat

docling/document_converter.py CHANGED Viewed

@@ -19,6 +19,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.noop_backend import NoOpBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
@@ -41,6 +42,7 @@ from docling.datamodel.settings import (
     settings,
 )
 from docling.exceptions import ConversionError
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -118,6 +120,11 @@ class PdfFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
+class AudioFormatOption(FormatOption):
+    pipeline_cls: Type = AsrPipeline
+    backend: Type[AbstractDocumentBackend] = NoOpBackend
 def _get_default_option(format: InputFormat) -> FormatOption:
     format_to_default_options = {
         InputFormat.CSV: FormatOption(
@@ -156,6 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.JSON_DOCLING: FormatOption(
             pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
         ),
+        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
     }
     if (options := format_to_default_options.get(format)) is not None:
         return options

docling/models/api_vlm_model.py CHANGED Viewed

@@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
                 with TimeRecorder(conv_res, "vlm"):
                     assert page.size is not None
-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
                     assert hi_res_image is not None
                     if hi_res_image:
                         if hi_res_image.mode != "RGB":

docling/models/base_model.py CHANGED Viewed

@@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
             coord_origin=bbox.coord_origin,
         )
-        page_ix = element_prov.page_no - 1
+        page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
         cropped_image = conv_res.pages[page_ix].get_image(
             scale=self.images_scale, cropbox=expanded_bbox
         )

docling/models/base_ocr_model.py CHANGED Viewed

@@ -7,6 +7,7 @@ from typing import List, Optional, Type
 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label
@@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
             return []
     # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
-    def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
+    def _filter_ocr_cells(
+        self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
+    ) -> List[TextCell]:
         # Create R-tree index for programmatic cells
         p = index.Property()
         p.dimension = 2
@@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
         ]
         return filtered_ocr_cells
-    def post_process_cells(self, ocr_cells, programmatic_cells):
+    def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
         r"""
-        Post-process the ocr and programmatic cells and return the final list of of cells
+        Post-process the OCR cells and update the page object.
+        Updates parsed_page.textline_cells directly since page.cells is now read-only.
         """
+        # Get existing cells from the read-only property
+        existing_cells = page.cells
+        # Combine existing and OCR cells with overlap filtering
+        final_cells = self._combine_cells(existing_cells, ocr_cells)
+        assert page.parsed_page is not None
+        # Update parsed_page.textline_cells directly
+        page.parsed_page.textline_cells = final_cells
+        page.parsed_page.has_lines = len(final_cells) > 0
+    def _combine_cells(
+        self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
+    ) -> List[TextCell]:
+        """Combine existing and OCR cells with filtering and re-indexing."""
         if self.options.force_full_page_ocr:
-            # If a full page OCR is forced, use only the OCR cells
-            cells = ocr_cells
-            return cells
-        ## Remove OCR cells which overlap with programmatic cells.
-        filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
-        programmatic_cells.extend(filtered_ocr_cells)
-        return programmatic_cells
+            combined = ocr_cells
+        else:
+            filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
+            combined = list(existing_cells) + filtered_ocr_cells
+        # Re-index in-place
+        for i, cell in enumerate(combined):
+            cell.index = i
+        return combined
     def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
         image = copy.deepcopy(page.image)

docling/models/easyocr_model.py CHANGED Viewed

@@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
                         all_ocr_cells.extend(cells)
                     # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page)
                 # DEBUG code:
                 if settings.debug.visualize_ocr:

docling/models/layout_model.py CHANGED Viewed

@@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
                     # Apply postprocessing
                     processed_clusters, processed_cells = LayoutPostprocessor(
-                        page.cells, clusters, page.size
+                        page, clusters
                     ).postprocess()
-                    # processed_clusters, processed_cells = clusters, page.cells
+                    # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
                     with warnings.catch_warnings():
                         warnings.filterwarnings(
@@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
                             )
                         )
-                    page.cells = processed_cells
                     page.predictions.layout = LayoutPrediction(
                         clusters=processed_clusters
                     )

docling/models/ocr_mac_model.py CHANGED Viewed

@@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
                         all_ocr_cells.extend(cells)
                     # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page)
                 # DEBUG code:
                 if settings.debug.visualize_ocr:

docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl

docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl