PyPI - docling - Versions diffs - 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl - Mend

docling 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

docling/backend/asciidoc_backend.py +0 -4
docling/backend/html_backend.py +25 -25
docling/datamodel/base_models.py +1 -1
docling/datamodel/document.py +3 -1
docling/datamodel/settings.py +15 -1
docling/document_converter.py +12 -8
docling/models/base_model.py +4 -1
docling/models/base_ocr_model.py +21 -4
docling/models/ds_glm_model.py +27 -11
docling/models/easyocr_model.py +49 -39
docling/models/layout_model.py +87 -61
docling/models/page_assemble_model.py +102 -100
docling/models/page_preprocessing_model.py +25 -7
docling/models/table_structure_model.py +125 -90
docling/models/tesseract_ocr_cli_model.py +62 -52
docling/models/tesseract_ocr_model.py +57 -45
docling/pipeline/base_pipeline.py +68 -69
docling/pipeline/simple_pipeline.py +8 -11
docling/pipeline/standard_pdf_pipeline.py +59 -56
docling/utils/profiling.py +62 -0
{docling-2.2.1.dist-info → docling-2.3.0.dist-info}/METADATA +5 -4
docling-2.3.0.dist-info/RECORD +45 -0
docling-2.2.1.dist-info/RECORD +0 -44
{docling-2.2.1.dist-info → docling-2.3.0.dist-info}/LICENSE +0 -0
{docling-2.2.1.dist-info → docling-2.3.0.dist-info}/WHEEL +0 -0
{docling-2.2.1.dist-info → docling-2.3.0.dist-info}/entry_points.txt +0 -0

docling/backend/asciidoc_backend.py CHANGED Viewed

@@ -1,24 +1,20 @@
 import logging
-import os
 import re
 from io import BytesIO
 from pathlib import Path
 from typing import Set, Union
 from docling_core.types.doc import (
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
     GroupItem,
     GroupLabel,
     ImageRef,
-    NodeItem,
     Size,
     TableCell,
     TableData,
 )
-from pydantic import AnyUrl
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat

docling/backend/html_backend.py CHANGED Viewed

@@ -179,31 +179,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.parents[self.level] = doc.add_text(
                 parent=self.parents[0], label=DocItemLabel.TITLE, text=text
             )
-        elif hlevel > self.level:
-            # add invisible group
-            for i in range(self.level + 1, hlevel):
-                self.parents[i] = doc.add_group(
-                    name=f"header-{i}",
-                    label=GroupLabel.SECTION,
-                    parent=self.parents[i - 1],
-                )
-            self.level = hlevel
-        elif hlevel < self.level:
-            # remove the tail
-            for key, val in self.parents.items():
-                if key > hlevel:
-                    self.parents[key] = None
-            self.level = hlevel
-        self.parents[hlevel] = doc.add_heading(
-            parent=self.parents[hlevel - 1],
-            text=text,
-            level=hlevel,
-        )
+        else:
+            if hlevel > self.level:
+                # add invisible group
+                for i in range(self.level + 1, hlevel):
+                    self.parents[i] = doc.add_group(
+                        name=f"header-{i}",
+                        label=GroupLabel.SECTION,
+                        parent=self.parents[i - 1],
+                    )
+                self.level = hlevel
+            elif hlevel < self.level:
+                # remove the tail
+                for key, val in self.parents.items():
+                    if key > hlevel:
+                        self.parents[key] = None
+                self.level = hlevel
+            self.parents[hlevel] = doc.add_heading(
+                parent=self.parents[hlevel - 1],
+                text=text,
+                level=hlevel,
+            )
     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""

docling/datamodel/base_models.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from enum import Enum, auto
 from io import BytesIO
-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
 from docling_core.types.doc import (
     BoundingBox,

docling/datamodel/document.py CHANGED Viewed

@@ -3,7 +3,7 @@ import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
 import filetype
 from docling_core.types.doc import (
@@ -52,6 +52,7 @@ from docling.datamodel.base_models import (
     Page,
 )
 from docling.datamodel.settings import DocumentLimits
+from docling.utils.profiling import ProfilingItem
 from docling.utils.utils import create_file_hash, create_hash
 if TYPE_CHECKING:
@@ -187,6 +188,7 @@ class ConversionResult(BaseModel):
     pages: List[Page] = []
     assembled: AssembledUnit = AssembledUnit()
+    timings: Dict[str, ProfilingItem] = {}
     document: DoclingDocument = _EMPTY_DOCLING_DOC

docling/datamodel/settings.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import sys
+from pathlib import Path
 from pydantic import BaseModel
 from pydantic_settings import BaseSettings
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
     # To force models into single core: export OMP_NUM_THREADS=1
+class DebugSettings(BaseModel):
+    visualize_cells: bool = False
+    visualize_ocr: bool = False
+    visualize_layout: bool = False
+    visualize_tables: bool = False
+    profile_pipeline_timings: bool = False
+    # Path used to output debug information.
+    debug_output_path: str = str(Path.cwd() / "debug")
 class AppSettings(BaseSettings):
     perf: BatchConcurrencySettings
+    debug: DebugSettings
-settings = AppSettings(perf=BatchConcurrencySettings())
+settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())

docling/document_converter.py CHANGED Viewed

@@ -189,24 +189,35 @@ class DocumentConverter:
     ) -> Iterator[ConversionResult]:
         assert self.format_to_options is not None
+        start_time = time.monotonic()
         for input_batch in chunkify(
             conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
             _log.info(f"Going to convert document batch...")
             # parallel processing only within input_batch
             # with ThreadPoolExecutor(
             #    max_workers=settings.perf.doc_batch_concurrency
             # ) as pool:
             #   yield from pool.map(self.process_document, input_batch)
             # Note: PDF backends are not thread-safe, thread pool usage was disabled.
             for item in map(
                 partial(self._process_document, raises_on_error=raises_on_error),
                 input_batch,
             ):
+                elapsed = time.monotonic() - start_time
+                start_time = time.monotonic()
                 if item is not None:
+                    _log.info(
+                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                    )
                     yield item
+                else:
+                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
     def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
         assert self.format_to_options is not None
@@ -237,15 +248,8 @@ class DocumentConverter:
         assert self.allowed_formats is not None
         assert in_doc.format in self.allowed_formats
-        start_doc_time = time.time()
         conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
-        end_doc_time = time.time() - start_doc_time
-        _log.info(
-            f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
-        )
         return conv_res
     def _execute_pipeline(

docling/models/base_model.py CHANGED Viewed

@@ -4,11 +4,14 @@ from typing import Any, Iterable
 from docling_core.types.doc import DoclingDocument, NodeItem
 from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
 class BasePageModel(ABC):
     @abstractmethod
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         pass

docling/models/base_ocr_model.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import copy
 import logging
 from abc import abstractmethod
+from pathlib import Path
 from typing import Iterable, List
 import numpy as np
@@ -10,12 +11,15 @@ from rtree import index
 from scipy.ndimage import find_objects, label
 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrOptions
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel
 _log = logging.getLogger(__name__)
-class BaseOcrModel:
+class BaseOcrModel(BasePageModel):
     def __init__(self, enabled: bool, options: OcrOptions):
         self.enabled = enabled
         self.options = options
@@ -113,7 +117,7 @@ class BaseOcrModel:
         ]
         return filtered_ocr_cells
-    def draw_ocr_rects_and_cells(self, page, ocr_rects):
+    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
         image = copy.deepcopy(page.image)
         draw = ImageDraw.Draw(image, "RGBA")
@@ -130,8 +134,21 @@ class BaseOcrModel:
             if isinstance(tc, OcrCell):
                 color = "magenta"
             draw.rectangle([(x0, y0), (x1, y1)], outline=color)
-        image.show()
+        if show:
+            image.show()
+        else:
+            out_path: Path = (
+                Path(settings.debug.debug_output_path)
+                / f"debug_{conv_res.input.file.stem}"
+            )
+            out_path.mkdir(parents=True, exist_ok=True)
+            out_file = out_path / f"ocr_page_{page.page_no:05}.png"
+            image.save(str(out_file), format="png")
     @abstractmethod
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         pass

docling/models/ds_glm_model.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import copy
 import random
+from pathlib import Path
 from typing import List, Union
 from deepsearch_glm.nlp_utils import init_nlp_model
@@ -27,6 +28,8 @@ from pydantic import BaseModel, ConfigDict
 from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
 from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
+from docling.datamodel.settings import settings
+from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import create_hash
@@ -226,23 +229,24 @@ class GlmModel:
         return ds_doc
     def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
-        ds_doc = self._to_legacy_document(conv_res)
-        ds_doc_dict = ds_doc.model_dump(by_alias=True)
+        with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
+            ds_doc = self._to_legacy_document(conv_res)
+            ds_doc_dict = ds_doc.model_dump(by_alias=True)
-        glm_doc = self.model.apply_on_doc(ds_doc_dict)
+            glm_doc = self.model.apply_on_doc(ds_doc_dict)
-        docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
+            docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
         # DEBUG code:
-        def draw_clusters_and_cells(ds_document, page_no):
+        def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
             clusters_to_draw = []
             image = copy.deepcopy(conv_res.pages[page_no].image)
             for ix, elem in enumerate(ds_document.main_text):
                 if isinstance(elem, BaseText):
-                    prov = elem.prov[0]
+                    prov = elem.prov[0]  # type: ignore
                 elif isinstance(elem, Ref):
                     _, arr, index = elem.ref.split("/")
-                    index = int(index)
+                    index = int(index)  # type: ignore
                     if arr == "tables":
                         prov = ds_document.tables[index].prov[0]
                     elif arr == "figures":
@@ -256,7 +260,7 @@ class GlmModel:
                             id=ix,
                             label=elem.name,
                             bbox=BoundingBox.from_tuple(
-                                coord=prov.bbox,
+                                coord=prov.bbox,  # type: ignore
                                 origin=CoordOrigin.BOTTOMLEFT,
                             ).to_top_left_origin(conv_res.pages[page_no].size.height),
                         )
@@ -276,9 +280,21 @@ class GlmModel:
                 for tc in c.cells:  # [:1]:
                     x0, y0, x1, y1 = tc.bbox.as_tuple()
                     draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-        # draw_clusters_and_cells(ds_doc, 0)
-        # draw_clusters_and_cells(exported_doc, 0)
+            if show:
+                image.show()
+            else:
+                out_path: Path = (
+                    Path(settings.debug.debug_output_path)
+                    / f"debug_{conv_res.input.file.stem}"
+                )
+                out_path.mkdir(parents=True, exist_ok=True)
+                out_file = out_path / f"doc_page_{page_no:05}.png"
+                image.save(str(out_file), format="png")
+        # for item in ds_doc.page_dimensions:
+        #    page_no = item.page
+        #    draw_clusters_and_cells(ds_doc, page_no)
         return docling_doc

docling/models/easyocr_model.py CHANGED Viewed

@@ -5,8 +5,11 @@ import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import EasyOcrOptions
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -33,58 +36,65 @@ class EasyOcrModel(BaseOcrModel):
                 download_enabled=self.options.download_enabled,
             )
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
             return
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
             else:
-                ocr_rects = self.get_ocr_rects(page)
-                all_ocr_cells = []
-                for ocr_rect in ocr_rects:
-                    # Skip zero area boxes
-                    if ocr_rect.area() == 0:
-                        continue
-                    high_res_image = page._backend.get_page_image(
-                        scale=self.scale, cropbox=ocr_rect
-                    )
-                    im = numpy.array(high_res_image)
-                    result = self.reader.readtext(im)
-                    del high_res_image
-                    del im
-                    cells = [
-                        OcrCell(
-                            id=ix,
-                            text=line[1],
-                            confidence=line[2],
-                            bbox=BoundingBox.from_tuple(
-                                coord=(
-                                    (line[0][0][0] / self.scale) + ocr_rect.l,
-                                    (line[0][0][1] / self.scale) + ocr_rect.t,
-                                    (line[0][2][0] / self.scale) + ocr_rect.l,
-                                    (line[0][2][1] / self.scale) + ocr_rect.t,
-                                ),
-                                origin=CoordOrigin.TOPLEFT,
-                            ),
+                with TimeRecorder(conv_res, "ocr"):
+                    ocr_rects = self.get_ocr_rects(page)
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
                         )
-                        for ix, line in enumerate(result)
-                    ]
-                    all_ocr_cells.extend(cells)
-                ## Remove OCR cells which overlap with programmatic cells.
-                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                        im = numpy.array(high_res_image)
+                        result = self.reader.readtext(im)
+                        del high_res_image
+                        del im
+                        cells = [
+                            OcrCell(
+                                id=ix,
+                                text=line[1],
+                                confidence=line[2],
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(
+                                        (line[0][0][0] / self.scale) + ocr_rect.l,
+                                        (line[0][0][1] / self.scale) + ocr_rect.t,
+                                        (line[0][2][0] / self.scale) + ocr_rect.l,
+                                        (line[0][2][1] / self.scale) + ocr_rect.t,
+                                    ),
+                                    origin=CoordOrigin.TOPLEFT,
+                                ),
+                            )
+                            for ix, line in enumerate(result)
+                        ]
+                        all_ocr_cells.extend(cells)
+                    ## Remove OCR cells which overlap with programmatic cells.
+                    filtered_ocr_cells = self.filter_ocr_cells(
+                        all_ocr_cells, page.cells
+                    )
-                page.cells.extend(filtered_ocr_cells)
+                    page.cells.extend(filtered_ocr_cells)
                 # DEBUG code:
-                # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
                 yield page

docling/models/layout_model.py CHANGED Viewed

@@ -16,8 +16,11 @@ from docling.datamodel.base_models import (
     LayoutPrediction,
     Page,
 )
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils import layout_utils as lu
+from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -271,74 +274,97 @@ class LayoutModel(BasePageModel):
         return clusters_out_new, cells_out_new
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
             else:
-                assert page.size is not None
-                clusters = []
-                for ix, pred_item in enumerate(
-                    self.layout_predictor.predict(page.get_image(scale=1.0))
-                ):
-                    label = DocItemLabel(
-                        pred_item["label"].lower().replace(" ", "_").replace("-", "_")
-                    )  # Temporary, until docling-ibm-model uses docling-core types
-                    cluster = Cluster(
-                        id=ix,
-                        label=label,
-                        confidence=pred_item["confidence"],
-                        bbox=BoundingBox.model_validate(pred_item),
-                        cells=[],
-                    )
-                    clusters.append(cluster)
-                # Map cells to clusters
-                # TODO: Remove, postprocess should take care of it anyway.
-                for cell in page.cells:
-                    for cluster in clusters:
-                        if not cell.bbox.area() > 0:
-                            overlap_frac = 0.0
-                        else:
-                            overlap_frac = (
-                                cell.bbox.intersection_area_with(cluster.bbox)
-                                / cell.bbox.area()
-                            )
-                        if overlap_frac > 0.5:
-                            cluster.cells.append(cell)
-                # Pre-sort clusters
-                # clusters = self.sort_clusters_by_cell_order(clusters)
-                # DEBUG code:
-                def draw_clusters_and_cells():
-                    image = copy.deepcopy(page.image)
-                    draw = ImageDraw.Draw(image)
-                    for c in clusters:
-                        x0, y0, x1, y1 = c.bbox.as_tuple()
-                        draw.rectangle([(x0, y0), (x1, y1)], outline="green")
-                        cell_color = (
-                            random.randint(30, 140),
-                            random.randint(30, 140),
-                            random.randint(30, 140),
+                with TimeRecorder(conv_res, "layout"):
+                    assert page.size is not None
+                    clusters = []
+                    for ix, pred_item in enumerate(
+                        self.layout_predictor.predict(page.get_image(scale=1.0))
+                    ):
+                        label = DocItemLabel(
+                            pred_item["label"]
+                            .lower()
+                            .replace(" ", "_")
+                            .replace("-", "_")
+                        )  # Temporary, until docling-ibm-model uses docling-core types
+                        cluster = Cluster(
+                            id=ix,
+                            label=label,
+                            confidence=pred_item["confidence"],
+                            bbox=BoundingBox.model_validate(pred_item),
+                            cells=[],
                         )
-                        for tc in c.cells:  # [:1]:
-                            x0, y0, x1, y1 = tc.bbox.as_tuple()
-                            draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-                    image.show()
-                # draw_clusters_and_cells()
-                clusters, page.cells = self.postprocess(
-                    clusters, page.cells, page.size.height
-                )
+                        clusters.append(cluster)
+                    # Map cells to clusters
+                    # TODO: Remove, postprocess should take care of it anyway.
+                    for cell in page.cells:
+                        for cluster in clusters:
+                            if not cell.bbox.area() > 0:
+                                overlap_frac = 0.0
+                            else:
+                                overlap_frac = (
+                                    cell.bbox.intersection_area_with(cluster.bbox)
+                                    / cell.bbox.area()
+                                )
+                            if overlap_frac > 0.5:
+                                cluster.cells.append(cell)
+                    # Pre-sort clusters
+                    # clusters = self.sort_clusters_by_cell_order(clusters)
+                    # DEBUG code:
+                    def draw_clusters_and_cells(show: bool = False):
+                        image = copy.deepcopy(page.image)
+                        if image is not None:
+                            draw = ImageDraw.Draw(image)
+                            for c in clusters:
+                                x0, y0, x1, y1 = c.bbox.as_tuple()
+                                draw.rectangle([(x0, y0), (x1, y1)], outline="green")
+                                cell_color = (
+                                    random.randint(30, 140),
+                                    random.randint(30, 140),
+                                    random.randint(30, 140),
+                                )
+                                for tc in c.cells:  # [:1]:
+                                    x0, y0, x1, y1 = tc.bbox.as_tuple()
+                                    draw.rectangle(
+                                        [(x0, y0), (x1, y1)], outline=cell_color
+                                    )
+                            if show:
+                                image.show()
+                            else:
+                                out_path: Path = (
+                                    Path(settings.debug.debug_output_path)
+                                    / f"debug_{conv_res.input.file.stem}"
+                                )
+                                out_path.mkdir(parents=True, exist_ok=True)
+                                out_file = (
+                                    out_path / f"layout_page_{page.page_no:05}.png"
+                                )
+                                image.save(str(out_file), format="png")
+                    # draw_clusters_and_cells()
+                    clusters, page.cells = self.postprocess(
+                        clusters, page.cells, page.size.height
+                    )
-                # draw_clusters_and_cells()
+                    page.predictions.layout = LayoutPrediction(clusters=clusters)
-                page.predictions.layout = LayoutPrediction(clusters=clusters)
+                if settings.debug.visualize_layout:
+                    draw_clusters_and_cells()
                 yield page

docling 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl

docling 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl