PyPI - docling - Versions diffs - 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl - Mend

docling 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

docling/backend/asciidoc_backend.py +0 -4
docling/backend/html_backend.py +25 -25
docling/datamodel/base_models.py +1 -1
docling/datamodel/document.py +3 -1
docling/datamodel/settings.py +15 -1
docling/document_converter.py +12 -8
docling/models/base_model.py +4 -1
docling/models/base_ocr_model.py +21 -4
docling/models/ds_glm_model.py +27 -11
docling/models/easyocr_model.py +49 -39
docling/models/layout_model.py +87 -61
docling/models/page_assemble_model.py +102 -100
docling/models/page_preprocessing_model.py +25 -7
docling/models/table_structure_model.py +125 -90
docling/models/tesseract_ocr_cli_model.py +62 -52
docling/models/tesseract_ocr_model.py +57 -45
docling/pipeline/base_pipeline.py +68 -69
docling/pipeline/simple_pipeline.py +8 -11
docling/pipeline/standard_pdf_pipeline.py +59 -56
docling/utils/profiling.py +62 -0
{docling-2.2.1.dist-info → docling-2.3.0.dist-info}/METADATA +5 -4
docling-2.3.0.dist-info/RECORD +45 -0
docling-2.2.1.dist-info/RECORD +0 -44
{docling-2.2.1.dist-info → docling-2.3.0.dist-info}/LICENSE +0 -0
{docling-2.2.1.dist-info → docling-2.3.0.dist-info}/WHEEL +0 -0
{docling-2.2.1.dist-info → docling-2.3.0.dist-info}/entry_points.txt +0 -0

docling/models/tesseract_ocr_cli_model.py CHANGED Viewed

@@ -8,8 +8,11 @@ import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -102,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
         return df_filtered
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
@@ -113,62 +118,67 @@ class TesseractOcrCliModel(BaseOcrModel):
             if not page._backend.is_valid():
                 yield page
             else:
-                ocr_rects = self.get_ocr_rects(page)
-                all_ocr_cells = []
-                for ocr_rect in ocr_rects:
-                    # Skip zero area boxes
-                    if ocr_rect.area() == 0:
-                        continue
-                    high_res_image = page._backend.get_page_image(
-                        scale=self.scale, cropbox=ocr_rect
-                    )
+                with TimeRecorder(conv_res, "ocr"):
-                    with tempfile.NamedTemporaryFile(
-                        suffix=".png", mode="w"
-                    ) as image_file:
-                        fname = image_file.name
-                        high_res_image.save(fname)
-                        df = self._run_tesseract(fname)
-                    # _log.info(df)
-                    # Print relevant columns (bounding box and text)
-                    for ix, row in df.iterrows():
-                        text = row["text"]
-                        conf = row["conf"]
-                        l = float(row["left"])
-                        b = float(row["top"])
-                        w = float(row["width"])
-                        h = float(row["height"])
-                        t = b + h
-                        r = l + w
-                        cell = OcrCell(
-                            id=ix,
-                            text=text,
-                            confidence=conf / 100.0,
-                            bbox=BoundingBox.from_tuple(
-                                coord=(
-                                    (l / self.scale) + ocr_rect.l,
-                                    (b / self.scale) + ocr_rect.t,
-                                    (r / self.scale) + ocr_rect.l,
-                                    (t / self.scale) + ocr_rect.t,
-                                ),
-                                origin=CoordOrigin.TOPLEFT,
-                            ),
+                    ocr_rects = self.get_ocr_rects(page)
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
                         )
-                        all_ocr_cells.append(cell)
-                ## Remove OCR cells which overlap with programmatic cells.
-                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                        with tempfile.NamedTemporaryFile(
+                            suffix=".png", mode="w"
+                        ) as image_file:
+                            fname = image_file.name
+                            high_res_image.save(fname)
+                            df = self._run_tesseract(fname)
+                        # _log.info(df)
+                        # Print relevant columns (bounding box and text)
+                        for ix, row in df.iterrows():
+                            text = row["text"]
+                            conf = row["conf"]
+                            l = float(row["left"])
+                            b = float(row["top"])
+                            w = float(row["width"])
+                            h = float(row["height"])
+                            t = b + h
+                            r = l + w
+                            cell = OcrCell(
+                                id=ix,
+                                text=text,
+                                confidence=conf / 100.0,
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(
+                                        (l / self.scale) + ocr_rect.l,
+                                        (b / self.scale) + ocr_rect.t,
+                                        (r / self.scale) + ocr_rect.l,
+                                        (t / self.scale) + ocr_rect.t,
+                                    ),
+                                    origin=CoordOrigin.TOPLEFT,
+                                ),
+                            )
+                            all_ocr_cells.append(cell)
+                    ## Remove OCR cells which overlap with programmatic cells.
+                    filtered_ocr_cells = self.filter_ocr_cells(
+                        all_ocr_cells, page.cells
+                    )
-                page.cells.extend(filtered_ocr_cells)
+                    page.cells.extend(filtered_ocr_cells)
                 # DEBUG code:
-                # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
                 yield page

docling/models/tesseract_ocr_model.py CHANGED Viewed

@@ -4,8 +4,11 @@ from typing import Iterable
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -61,7 +64,9 @@ class TesseractOcrModel(BaseOcrModel):
             # Finalize the tesseractAPI
             self.reader.End()
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
@@ -72,59 +77,66 @@ class TesseractOcrModel(BaseOcrModel):
             if not page._backend.is_valid():
                 yield page
             else:
-                assert self.reader is not None
+                with TimeRecorder(conv_res, "ocr"):
-                ocr_rects = self.get_ocr_rects(page)
+                    assert self.reader is not None
-                all_ocr_cells = []
-                for ocr_rect in ocr_rects:
-                    # Skip zero area boxes
-                    if ocr_rect.area() == 0:
-                        continue
-                    high_res_image = page._backend.get_page_image(
-                        scale=self.scale, cropbox=ocr_rect
-                    )
+                    ocr_rects = self.get_ocr_rects(page)
-                    # Retrieve text snippets with their bounding boxes
-                    self.reader.SetImage(high_res_image)
-                    boxes = self.reader.GetComponentImages(
-                        self.reader_RIL.TEXTLINE, True
-                    )
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
-                    cells = []
-                    for ix, (im, box, _, _) in enumerate(boxes):
-                        # Set the area of interest. Tesseract uses Bottom-Left for the origin
-                        self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
-                        # Extract text within the bounding box
-                        text = self.reader.GetUTF8Text().strip()
-                        confidence = self.reader.MeanTextConf()
-                        left = box["x"] / self.scale
-                        bottom = box["y"] / self.scale
-                        right = (box["x"] + box["w"]) / self.scale
-                        top = (box["y"] + box["h"]) / self.scale
-                        cells.append(
-                            OcrCell(
-                                id=ix,
-                                text=text,
-                                confidence=confidence,
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(left, top, right, bottom),
-                                    origin=CoordOrigin.TOPLEFT,
-                                ),
-                            )
+                        # Retrieve text snippets with their bounding boxes
+                        self.reader.SetImage(high_res_image)
+                        boxes = self.reader.GetComponentImages(
+                            self.reader_RIL.TEXTLINE, True
                         )
-                    # del high_res_image
-                    all_ocr_cells.extend(cells)
+                        cells = []
+                        for ix, (im, box, _, _) in enumerate(boxes):
+                            # Set the area of interest. Tesseract uses Bottom-Left for the origin
+                            self.reader.SetRectangle(
+                                box["x"], box["y"], box["w"], box["h"]
+                            )
-                ## Remove OCR cells which overlap with programmatic cells.
-                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                            # Extract text within the bounding box
+                            text = self.reader.GetUTF8Text().strip()
+                            confidence = self.reader.MeanTextConf()
+                            left = box["x"] / self.scale
+                            bottom = box["y"] / self.scale
+                            right = (box["x"] + box["w"]) / self.scale
+                            top = (box["y"] + box["h"]) / self.scale
+                            cells.append(
+                                OcrCell(
+                                    id=ix,
+                                    text=text,
+                                    confidence=confidence,
+                                    bbox=BoundingBox.from_tuple(
+                                        coord=(left, top, right, bottom),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    ),
+                                )
+                            )
+                        # del high_res_image
+                        all_ocr_cells.extend(cells)
+                    ## Remove OCR cells which overlap with programmatic cells.
+                    filtered_ocr_cells = self.filter_ocr_cells(
+                        all_ocr_cells, page.cells
+                    )
-                page.cells.extend(filtered_ocr_cells)
+                    page.cells.extend(filtered_ocr_cells)
                 # DEBUG code:
-                # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
                 yield page

docling/pipeline/base_pipeline.py CHANGED Viewed

@@ -19,6 +19,7 @@ from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BaseEnrichmentModel
+from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import chunkify
 _log = logging.getLogger(__name__)
@@ -35,13 +36,16 @@ class BasePipeline(ABC):
         _log.info(f"Processing document {in_doc.file.name}")
         try:
-            # These steps are building and assembling the structure of the
-            # output DoclingDocument
-            conv_res = self._build_document(in_doc, conv_res)
-            conv_res = self._assemble_document(in_doc, conv_res)
-            # From this stage, all operations should rely only on conv_res.output
-            conv_res = self._enrich_document(in_doc, conv_res)
-            conv_res.status = self._determine_status(in_doc, conv_res)
+            with TimeRecorder(
+                conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
+            ):
+                # These steps are building and assembling the structure of the
+                # output DoclingDocument
+                conv_res = self._build_document(conv_res)
+                conv_res = self._assemble_document(conv_res)
+                # From this stage, all operations should rely only on conv_res.output
+                conv_res = self._enrich_document(conv_res)
+                conv_res.status = self._determine_status(conv_res)
         except Exception as e:
             conv_res.status = ConversionStatus.FAILURE
             if raises_on_error:
@@ -50,19 +54,13 @@ class BasePipeline(ABC):
         return conv_res
     @abstractmethod
-    def _build_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
         pass
-    def _assemble_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
         return conv_res
-    def _enrich_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
         def _filter_elements(
             doc: DoclingDocument, model: BaseEnrichmentModel
@@ -71,24 +69,23 @@ class BasePipeline(ABC):
                 if model.is_processable(doc=doc, element=element):
                     yield element
-        for model in self.enrichment_pipe:
-            for element_batch in chunkify(
-                _filter_elements(conv_res.document, model),
-                settings.perf.elements_batch_size,
-            ):
-                # TODO: currently we assume the element itself is modified, because
-                # we don't have an interface to save the element back to the document
-                for element in model(
-                    doc=conv_res.document, element_batch=element_batch
-                ):  # Must exhaust!
-                    pass
+        with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
+            for model in self.enrichment_pipe:
+                for element_batch in chunkify(
+                    _filter_elements(conv_res.document, model),
+                    settings.perf.elements_batch_size,
+                ):
+                    # TODO: currently we assume the element itself is modified, because
+                    # we don't have an interface to save the element back to the document
+                    for element in model(
+                        doc=conv_res.document, element_batch=element_batch
+                    ):  # Must exhaust!
+                        pass
         return conv_res
     @abstractmethod
-    def _determine_status(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionStatus:
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
         pass
     @classmethod
@@ -110,66 +107,68 @@ class BasePipeline(ABC):
 class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
-    def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def _apply_on_pages(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         for model in self.build_pipe:
-            page_batch = model(page_batch)
+            page_batch = model(conv_res, page_batch)
         yield from page_batch
-    def _build_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
-        if not isinstance(in_doc._backend, PdfDocumentBackend):
+        if not isinstance(conv_res.input._backend, PdfDocumentBackend):
             raise RuntimeError(
-                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
                 f"Can not convert this with a PDF pipeline. "
                 f"Please check your format configuration on DocumentConverter."
             )
             # conv_res.status = ConversionStatus.FAILURE
             # return conv_res
-        for i in range(0, in_doc.page_count):
-            conv_res.pages.append(Page(page_no=i))
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
-        try:
-            # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
-                start_pb_time = time.time()
+            for i in range(0, conv_res.input.page_count):
+                conv_res.pages.append(Page(page_no=i))
-                # 1. Initialise the page resources
-                init_pages = map(
-                    functools.partial(self.initialize_page, in_doc), page_batch
-                )
+            try:
+                # Iterate batches of pages (page_batch_size) in the doc
+                for page_batch in chunkify(
+                    conv_res.pages, settings.perf.page_batch_size
+                ):
+                    start_pb_time = time.time()
-                # 2. Run pipeline stages
-                pipeline_pages = self._apply_on_pages(init_pages)
+                    # 1. Initialise the page resources
+                    init_pages = map(
+                        functools.partial(self.initialize_page, conv_res), page_batch
+                    )
-                for p in pipeline_pages:  # Must exhaust!
-                    pass
+                    # 2. Run pipeline stages
+                    pipeline_pages = self._apply_on_pages(conv_res, init_pages)
-                end_pb_time = time.time() - start_pb_time
-                _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
+                    for p in pipeline_pages:  # Must exhaust!
+                        pass
-        except Exception as e:
-            conv_res.status = ConversionStatus.FAILURE
-            trace = "\n".join(traceback.format_exception(e))
-            _log.warning(
-                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                f"{trace}"
-            )
-            raise e
+                    end_pb_time = time.time() - start_pb_time
+                    _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
+            except Exception as e:
+                conv_res.status = ConversionStatus.FAILURE
+                trace = "\n".join(traceback.format_exception(e))
+                _log.warning(
+                    f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
+                    f"{trace}"
+                )
+                raise e
-        finally:
-            # Always unload the PDF backend, even in case of failure
-            if in_doc._backend:
-                in_doc._backend.unload()
+            finally:
+                # Always unload the PDF backend, even in case of failure
+                if conv_res.input._backend:
+                    conv_res.input._backend.unload()
         return conv_res
-    def _determine_status(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionStatus:
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
         status = ConversionStatus.SUCCESS
         for page in conv_res.pages:
             if page._backend is None or not page._backend.is_valid():
@@ -186,5 +185,5 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
     # Initialise and load resources for a page
     @abstractmethod
-    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
         pass

docling/pipeline/simple_pipeline.py CHANGED Viewed

@@ -5,9 +5,10 @@ from docling.backend.abstract_backend import (
     DeclarativeDocumentBackend,
 )
 from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.pipeline.base_pipeline import BasePipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder
 _log = logging.getLogger(__name__)
@@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline):
     def __init__(self, pipeline_options: PipelineOptions):
         super().__init__(pipeline_options)
-    def _build_document(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionResult:
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
-        if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
+        if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
             raise RuntimeError(
-                f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
                 f"Can not convert this with simple pipeline. "
                 f"Please check your format configuration on DocumentConverter."
             )
@@ -38,13 +37,11 @@ class SimplePipeline(BasePipeline):
         # Instead of running a page-level pipeline to build up the document structure,
         # the backend is expected to be of type DeclarativeDocumentBackend, which can output
         # a DoclingDocument straight.
-        conv_res.document = in_doc._backend.convert()
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+            conv_res.document = conv_res.input._backend.convert()
         return conv_res
-    def _determine_status(
-        self, in_doc: InputDocument, conv_res: ConversionResult
-    ) -> ConversionStatus:
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
         # This is called only if the previous steps didn't raise.
         # Since we don't have anything else to evaluate, we can
         # safely return SUCCESS.

docling 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl

docling 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl