docling 1.19.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- docling/backend/abstract_backend.py +32 -37
- docling/backend/docling_parse_backend.py +16 -12
- docling/backend/docling_parse_v2_backend.py +240 -0
- docling/backend/html_backend.py +425 -0
- docling/backend/mspowerpoint_backend.py +375 -0
- docling/backend/msword_backend.py +509 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +15 -10
- docling/cli/main.py +61 -60
- docling/datamodel/base_models.py +73 -193
- docling/datamodel/document.py +379 -324
- docling/datamodel/pipeline_options.py +16 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +215 -252
- docling/models/base_model.py +25 -0
- docling/models/base_ocr_model.py +19 -6
- docling/models/ds_glm_model.py +220 -22
- docling/models/easyocr_model.py +45 -40
- docling/models/layout_model.py +130 -114
- docling/models/page_assemble_model.py +119 -95
- docling/models/page_preprocessing_model.py +61 -0
- docling/models/table_structure_model.py +122 -111
- docling/models/tesseract_ocr_cli_model.py +63 -56
- docling/models/tesseract_ocr_model.py +58 -50
- docling/pipeline/base_pipeline.py +190 -0
- docling/pipeline/simple_pipeline.py +59 -0
- docling/pipeline/standard_pdf_pipeline.py +198 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling-2.1.0.dist-info/METADATA +149 -0
- docling-2.1.0.dist-info/RECORD +42 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
docling/datamodel/pipeline_options.py CHANGED
@@ -1,4 +1,5 @@
 from enum import Enum, auto
+from pathlib import Path
 from typing import List, Literal, Optional, Union

 from pydantic import BaseModel, ConfigDict, Field
@@ -21,6 +22,9 @@ class TableStructureOptions(BaseModel):

 class OcrOptions(BaseModel):
     kind: str
+    bitmap_area_threshold: float = (
+        0.05  # percentage of the area for a bitmap to processed with OCR
+    )


 class EasyOcrOptions(OcrOptions):
@@ -58,6 +62,13 @@ class TesseractOcrOptions(OcrOptions):


 class PipelineOptions(BaseModel):
+    create_legacy_output: bool = (
+        True  # This defautl will be set to False on a future version of docling
+    )
+
+
+class PdfPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

@@ -65,3 +76,8 @@ class PipelineOptions(BaseModel):
     ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
         Field(EasyOcrOptions(), discriminator="kind")
     )
+
+    images_scale: float = 1.0
+    generate_page_images: bool = False
+    generate_picture_images: bool = False
+    generate_table_images: bool = False
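
The new `PdfPipelineOptions` fields above control page and element image generation. A minimal sketch of how they might be set; the option names come straight from this diff, while the 2x scale value is only an illustration:

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions

# Configure the image-generation options added in 2.1.0.
opts = PdfPipelineOptions()
opts.images_scale = 2.0             # render images at 2x the default scale
opts.generate_page_images = True    # keep a rendered image of each page
opts.generate_table_images = True   # keep cropped images of detected tables
```

These options are wired into a converter through the per-format `FormatOption` objects introduced in `docling/document_converter.py` below.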
docling/datamodel/settings.py CHANGED
docling/document_converter.py CHANGED
@@ -1,297 +1,260 @@
-import functools
 import logging
-import
+import sys
 import time
-import
+from functools import partial
 from pathlib import Path
-from typing import Iterable, Optional, Type
-
-import
-
-from
-
-from docling.backend.
-from docling.
-
-
-    ConversionStatus,
-    DoclingComponentType,
-    ErrorItem,
-    Page,
-)
+from typing import Dict, Iterable, Iterator, List, Optional, Type
+
+from pydantic import BaseModel, ConfigDict, model_validator, validate_call
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
 from docling.datamodel.document import (
     ConversionResult,
-    DocumentConversionInput,
     InputDocument,
+    _DocumentConversionInput,
 )
 from docling.datamodel.pipeline_options import PipelineOptions
-from docling.datamodel.settings import settings
-from docling.
-from docling.
-from docling.pipeline.
-from docling.
-from docling.utils.utils import chunkify, create_hash
+from docling.datamodel.settings import DocumentLimits, settings
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.pipeline.simple_pipeline import SimplePipeline
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docling.utils.utils import chunkify

 _log = logging.getLogger(__name__)


-class
-
+class FormatOption(BaseModel):
+    pipeline_cls: Type[BasePipeline]
+    pipeline_options: Optional[PipelineOptions] = None
+    backend: Type[AbstractDocumentBackend]

-
-        self,
-        artifacts_path: Optional[Union[Path, str]] = None,
-        pipeline_options: PipelineOptions = PipelineOptions(),
-        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
-        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
-        assemble_options: AssembleOptions = AssembleOptions(),
-    ):
-        if not artifacts_path:
-            artifacts_path = self.download_models_hf()
+    model_config = ConfigDict(arbitrary_types_allowed=True)

-
+    @model_validator(mode="after")
+    def set_optional_field_default(self) -> "FormatOption":
+        if self.pipeline_options is None:
+            self.pipeline_options = self.pipeline_cls.get_default_options()
+        return self

-        self.model_pipeline = pipeline_cls(
-            artifacts_path=artifacts_path, pipeline_options=pipeline_options
-        )

-
-
-
-        self.assemble_options = assemble_options
-
-    @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-
-        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
-            revision="v2.0.0",
-        )
+class WordFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend

-        return Path(download_path)

-
+class PowerpointFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend

-        for input_batch in chunkify(
-            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
-        ):
-            _log.info(f"Going to convert document batch...")
-            # parallel processing only within input_batch
-            # with ThreadPoolExecutor(
-            #    max_workers=settings.perf.doc_batch_concurrency
-            # ) as pool:
-            #   yield from pool.map(self.process_document, input_batch)

-
-
-
-    def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
-        """Convert a single document.
-
-        Args:
-            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
-
-        Raises:
-            ValueError: If source is of unexpected type.
-            RuntimeError: If conversion fails.
-
-        Returns:
-            ConversionResult: The conversion result object.
-        """
-        with tempfile.TemporaryDirectory() as temp_dir:
-            try:
-                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
-                res = requests.get(http_url, stream=True)
-                res.raise_for_status()
-                fname = None
-                # try to get filename from response header
-                if cont_disp := res.headers.get("Content-Disposition"):
-                    for par in cont_disp.strip().split(";"):
-                        # currently only handling directive "filename" (not "*filename")
-                        if (split := par.split("=")) and split[0].strip() == "filename":
-                            fname = "=".join(split[1:]).strip().strip("'\"") or None
-                            break
-                # otherwise, use name from URL:
-                if fname is None:
-                    fname = Path(http_url.path).name or self._default_download_filename
-                local_path = Path(temp_dir) / fname
-                with open(local_path, "wb") as f:
-                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
-                        f.write(chunk)
-            except ValidationError:
-                try:
-                    local_path = TypeAdapter(Path).validate_python(source)
-                except ValidationError:
-                    raise ValueError(
-                        f"Unexpected file path type encountered: {type(source)}"
-                    )
-            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-            conv_res_iter = self.convert(conv_inp)
-            conv_res: ConversionResult = next(conv_res_iter)
-            if conv_res.status not in {
-                ConversionStatus.SUCCESS,
-                ConversionStatus.PARTIAL_SUCCESS,
-            }:
-                raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
-            return conv_res
+class HTMLFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend

-    def _process_document(self, in_doc: InputDocument) -> ConversionResult:
-        start_doc_time = time.time()
-        conv_res = ConversionResult(input=in_doc)

-
+class PdfFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend

-        if not in_doc.valid:
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res

-
-
+class ImageFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend

-        all_assembled_pages = []

-
-
-
-
-
+_format_to_default_options = {
+    InputFormat.DOCX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
+    ),
+    InputFormat.PPTX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
+    ),
+    InputFormat.HTML: FormatOption(
+        pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+    ),
+    InputFormat.IMAGE: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+    InputFormat.PDF: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+}

-                # 1. Initialise the page resources
-                init_pages = map(
-                    functools.partial(self._initialize_page, in_doc), page_batch
-                )

-
-
-                    functools.partial(self._populate_page_images, in_doc), init_pages
-                )
+class DocumentConverter:
+    _default_download_filename = "file"

-
-
-
-
+    def __init__(
+        self,
+        allowed_formats: Optional[List[InputFormat]] = None,
+        format_options: Optional[Dict[InputFormat, FormatOption]] = None,
+    ):
+        self.allowed_formats = allowed_formats
+        self.format_to_options = format_options
+
+        if self.allowed_formats is None:
+            # if self.format_to_options is not None:
+            #    self.allowed_formats = self.format_to_options.keys()
+            # else:
+            self.allowed_formats = [e for e in InputFormat]  # all formats
+
+        if self.format_to_options is None:
+            self.format_to_options = _format_to_default_options
+        else:
+            for f in self.allowed_formats:
+                if f not in self.format_to_options.keys():
+                    _log.debug(f"Requested format {f} will use default options.")
+                    self.format_to_options[f] = _format_to_default_options[f]
+
+            remove_keys = []
+            for f in self.format_to_options.keys():
+                if f not in self.allowed_formats:
+                    remove_keys.append(f)
+
+            for f in remove_keys:
+                self.format_to_options.pop(f)
+
+        self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
+
+    @validate_call(config=ConfigDict(strict=True))
+    def convert(
+        self,
+        source: Path | str | DocumentStream,  # TODO review naming
+        raises_on_error: bool = True,
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> ConversionResult:
+
+        all_res = self.convert_all(
+            source=[source],
+            raises_on_error=raises_on_error,
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        return next(all_res)
+
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_all(
+        self,
+        source: Iterable[Path | str | DocumentStream],  # TODO review naming
+        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> Iterator[ConversionResult]:
+        limits = DocumentLimits(
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        conv_input = _DocumentConversionInput(
+            path_or_stream_iterator=source,
+            limit=limits,
+        )
+        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
+        for conv_res in conv_res_iter:
+            if raises_on_error and conv_res.status not in {
+                ConversionStatus.SUCCESS,
+                ConversionStatus.PARTIAL_SUCCESS,
+            }:
+                raise RuntimeError(
+                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
                 )
+            else:
+                yield conv_res

-
-
+    def _convert(
+        self, conv_input: _DocumentConversionInput, raises_on_error: bool
+    ) -> Iterator[ConversionResult]:
+        assert self.format_to_options is not None

-
-
+        for input_batch in chunkify(
+            conv_input.docs(self.format_to_options),
+            settings.perf.doc_batch_size,  # pass format_options
+        ):
+            _log.info(f"Going to convert document batch...")
+            # parallel processing only within input_batch
+            # with ThreadPoolExecutor(
+            #    max_workers=settings.perf.doc_batch_concurrency
+            # ) as pool:
+            #   yield from pool.map(self.process_document, input_batch)

-
-
-
+            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
+            for item in map(
+                partial(self._process_document, raises_on_error=raises_on_error),
+                input_batch,
+            ):
+                if item is not None:
+                    yield item
+
+    def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
+        assert self.format_to_options is not None
+
+        fopt = self.format_to_options.get(doc.format)
+
+        if fopt is None:
+            raise RuntimeError(f"Could not get pipeline for document {doc.file}")
+        else:
+            pipeline_class = fopt.pipeline_cls
+            pipeline_options = fopt.pipeline_options
+
+        assert pipeline_options is not None
+        # TODO this will ignore if different options have been defined for the same pipeline class.
+        if (
+            pipeline_class not in self.initialized_pipelines
+            or self.initialized_pipelines[pipeline_class].pipeline_options
+            != pipeline_options
+        ):
+            self.initialized_pipelines[pipeline_class] = pipeline_class(
+                pipeline_options=pipeline_options
+            )
+        return self.initialized_pipelines[pipeline_class]

-
-
-
+    def _process_document(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> Optional[ConversionResult]:
+        assert self.allowed_formats is not None
+        assert in_doc.format in self.allowed_formats

-
-            assembled_page._backend.unload()
+        start_doc_time = time.time()

-
+        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

-
-
+        end_doc_time = time.time() - start_doc_time
+        _log.info(
+            f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
+        )

-
-        self._assemble_doc(conv_res)
+        return conv_res

-
-
-
-
-
-
-
-
-
+    def _execute_pipeline(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> ConversionResult:
+        if in_doc.valid:
+            pipeline = self._get_pipeline(in_doc)
+            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+                if raises_on_error:
+                    raise RuntimeError(
+                        f"No pipeline could be initialized for {in_doc.file}."
                     )
-
+                else:
+                    conv_res = ConversionResult(input=in_doc)
+                    conv_res.status = ConversionStatus.FAILURE
+                    return conv_res

-            conv_res.
+            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)

-
-
-
-                _log.info(
-                    f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                    f"{trace}"
-                )
+        else:
+            if raises_on_error:
+                raise RuntimeError(f"Input document {in_doc.file} is not valid.")

-
-
-
-
-
-        end_doc_time = time.time() - start_doc_time
-        _log.info(
-            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
-        )
+            else:
+                # invalid doc or not of desired format
+                conv_res = ConversionResult(input=in_doc)
+                conv_res.status = ConversionStatus.FAILURE
+                # TODO add error log why it failed.

         return conv_res
-
-    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-    def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
-        page._backend = doc._backend.load_page(page.page_no)
-        page.size = page._backend.get_size()
-        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
-
-        return page
-
-    # Generate the page image and store it in the page object
-    def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        # default scale
-        page.get_image(
-            scale=1.0
-        )  # puts the page image on the image cache at default scale
-
-        # user requested scales
-        if self.assemble_options.images_scale is not None:
-            page._default_image_scale = self.assemble_options.images_scale
-            page.get_image(
-                scale=self.assemble_options.images_scale
-            )  # this will trigger storing the image in the internal cache
-
-        return page
-
-    # Extract and populate the page cells and store it in the page object
-    def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
-        page.cells = page._backend.get_text_cells()
-
-        # DEBUG code:
-        def draw_text_boxes(image, cells):
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-            image.show()
-
-        # draw_text_boxes(page.get_image(scale=1.0), cells)
-
-        return page
-
-    def _assemble_doc(self, conv_res: ConversionResult):
-        all_elements = []
-        all_headers = []
-        all_body = []
-
-        for p in conv_res.pages:
-
-            for el in p.assembled.body:
-                all_body.append(el)
-            for el in p.assembled.headers:
-                all_headers.append(el)
-            for el in p.assembled.elements:
-                all_elements.append(el)
-
-        conv_res.assembled = AssembledUnit(
-            elements=all_elements, headers=all_headers, body=all_body
-        )
-
-        conv_res.output = self.glm_model(conv_res)
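
The rewrite above replaces the 1.x `convert_single()` / `DocumentConversionInput` flow with per-format options and a `convert()` that returns a single `ConversionResult`. A minimal migration sketch using only the classes shown in this diff (input file names are hypothetical):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF, InputFormat.DOCX],
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=PdfPipelineOptions())
    },
)

# 1.x: conv_res = next(converter.convert(DocumentConversionInput.from_paths([...])))
# 2.x: convert() takes one source and returns the ConversionResult directly.
result = converter.convert("report.pdf")

# convert_all() takes an iterable of sources and yields results lazily.
for res in converter.convert_all(["a.pdf", "b.docx"], raises_on_error=False):
    print(res.status)
```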
docling/models/base_model.py ADDED
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import Any, Iterable
+
+from docling_core.types.doc import DoclingDocument, NodeItem
+
+from docling.datamodel.base_models import Page
+
+
+class BasePageModel(ABC):
+    @abstractmethod
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        pass
+
+
+class BaseEnrichmentModel(ABC):
+
+    @abstractmethod
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        pass
+
+    @abstractmethod
+    def __call__(
+        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
+    ) -> Iterable[Any]:
+        pass
docling/models/base_ocr_model.py CHANGED
@@ -1,14 +1,15 @@
 import copy
 import logging
 from abc import abstractmethod
-from typing import Iterable, List
+from typing import Iterable, List

 import numpy as np
+from docling_core.types.doc import BoundingBox, CoordOrigin
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import find_objects, label

-from docling.datamodel.base_models import
+from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.pipeline_options import OcrOptions

 _log = logging.getLogger(__name__)
@@ -20,8 +21,9 @@ class BaseOcrModel:
         self.options = options

     # Computes the optimum amount and coordinates of rectangles to OCR on a given page
-    def get_ocr_rects(self, page: Page) ->
+    def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
         BITMAP_COVERAGE_TRESHOLD = 0.75
+        assert page.size is not None

         def find_ocr_rects(size, bitmap_rects):
             image = Image.new(
@@ -60,11 +62,14 @@ class BaseOcrModel:

             return (area_frac, bounding_boxes)  # fraction covered  # boxes

-
+        if page._backend is not None:
+            bitmap_rects = page._backend.get_bitmap_rects()
+        else:
+            bitmap_rects = []
         coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

         # return full-page rectangle if sufficiently covered with bitmaps
-        if coverage > BITMAP_COVERAGE_TRESHOLD:
+        if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
             return [
                 BoundingBox(
                     l=0,
@@ -75,7 +80,15 @@ class BaseOcrModel:
                 )
             ]
         # return individual rectangles if the bitmap coverage is smaller
-
+        else:  # coverage <= BITMAP_COVERAGE_TRESHOLD:
+
+            # skip OCR if the bitmap area on the page is smaller than the options threshold
+            ocr_rects = [
+                rect
+                for rect in ocr_rects
+                if rect.area() / (page.size.width * page.size.height)
+                > self.options.bitmap_area_threshold
+            ]
         return ocr_rects

     # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
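
For reference, the arithmetic behind the new `bitmap_area_threshold` filter in `get_ocr_rects`: a bitmap rectangle is kept for OCR only when its area exceeds the threshold fraction (0.05 by default, from `OcrOptions`) of the page area. A worked example with hypothetical numbers:

```python
# Hypothetical US Letter page in points and one candidate bitmap rectangle.
page_w, page_h = 612.0, 792.0
rect_area = 20_000.0
bitmap_area_threshold = 0.05  # default from OcrOptions in this release

frac = rect_area / (page_w * page_h)  # ~0.041
if frac > bitmap_area_threshold:
    print("rectangle is OCRed")
else:
    print("rectangle skipped: below bitmap_area_threshold")  # this branch runs
```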