PyPI - docling - Versions diffs - 2.2.1__py3-none-any.whl → 2.3.1__py3-none-any.whl - Mend

docling 2.2.1__py3-none-any.whl → 2.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

docling/backend/asciidoc_backend.py +0 -4
docling/backend/html_backend.py +25 -25
docling/datamodel/base_models.py +1 -1
docling/datamodel/document.py +3 -1
docling/datamodel/settings.py +15 -1
docling/document_converter.py +20 -12
docling/models/base_model.py +4 -1
docling/models/base_ocr_model.py +21 -4
docling/models/ds_glm_model.py +27 -11
docling/models/easyocr_model.py +49 -39
docling/models/layout_model.py +87 -61
docling/models/page_assemble_model.py +102 -100
docling/models/page_preprocessing_model.py +25 -7
docling/models/table_structure_model.py +125 -90
docling/models/tesseract_ocr_cli_model.py +62 -52
docling/models/tesseract_ocr_model.py +57 -45
docling/pipeline/base_pipeline.py +68 -69
docling/pipeline/simple_pipeline.py +8 -11
docling/pipeline/standard_pdf_pipeline.py +59 -56
docling/utils/profiling.py +62 -0
{docling-2.2.1.dist-info → docling-2.3.1.dist-info}/METADATA +7 -10
docling-2.3.1.dist-info/RECORD +45 -0
docling-2.2.1.dist-info/RECORD +0 -44
{docling-2.2.1.dist-info → docling-2.3.1.dist-info}/LICENSE +0 -0
{docling-2.2.1.dist-info → docling-2.3.1.dist-info}/WHEEL +0 -0
{docling-2.2.1.dist-info → docling-2.3.1.dist-info}/entry_points.txt +0 -0

docling/models/page_assemble_model.py CHANGED Viewed

@@ -12,8 +12,10 @@ from docling.datamodel.base_models import (
     Table,
     TextElement,
 )
+from docling.datamodel.document import ConversionResult
 from docling.models.base_model import BasePageModel
 from docling.models.layout_model import LayoutModel
+from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -51,122 +53,122 @@ class PageAssembleModel(BasePageModel):
         return sanitized_text.strip()  # Strip any leading or trailing whitespace
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
             else:
-                assert page.predictions.layout is not None
-                # assembles some JSON output page by page.
-                elements: List[PageElement] = []
-                headers: List[PageElement] = []
-                body: List[PageElement] = []
-                for cluster in page.predictions.layout.clusters:
-                    # _log.info("Cluster label seen:", cluster.label)
-                    if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
-                        textlines = [
-                            cell.text.replace("\x02", "-").strip()
-                            for cell in cluster.cells
-                            if len(cell.text.strip()) > 0
-                        ]
-                        text = self.sanitize_text(textlines)
-                        text_el = TextElement(
-                            label=cluster.label,
-                            id=cluster.id,
-                            text=text,
-                            page_no=page.page_no,
-                            cluster=cluster,
-                        )
-                        elements.append(text_el)
-                        if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
-                            headers.append(text_el)
-                        else:
-                            body.append(text_el)
-                    elif cluster.label == LayoutModel.TABLE_LABEL:
-                        tbl = None
-                        if page.predictions.tablestructure:
-                            tbl = page.predictions.tablestructure.table_map.get(
-                                cluster.id, None
-                            )
-                        if (
-                            not tbl
-                        ):  # fallback: add table without structure, if it isn't present
-                            tbl = Table(
+                with TimeRecorder(conv_res, "page_assemble"):
+                    assert page.predictions.layout is not None
+                    # assembles some JSON output page by page.
+                    elements: List[PageElement] = []
+                    headers: List[PageElement] = []
+                    body: List[PageElement] = []
+                    for cluster in page.predictions.layout.clusters:
+                        # _log.info("Cluster label seen:", cluster.label)
+                        if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
+                            textlines = [
+                                cell.text.replace("\x02", "-").strip()
+                                for cell in cluster.cells
+                                if len(cell.text.strip()) > 0
+                            ]
+                            text = self.sanitize_text(textlines)
+                            text_el = TextElement(
                                 label=cluster.label,
                                 id=cluster.id,
-                                text="",
-                                otsl_seq=[],
-                                table_cells=[],
-                                cluster=cluster,
+                                text=text,
                                 page_no=page.page_no,
+                                cluster=cluster,
                             )
+                            elements.append(text_el)
+                            if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
+                                headers.append(text_el)
+                            else:
+                                body.append(text_el)
+                        elif cluster.label == LayoutModel.TABLE_LABEL:
+                            tbl = None
+                            if page.predictions.tablestructure:
+                                tbl = page.predictions.tablestructure.table_map.get(
+                                    cluster.id, None
+                                )
+                            if (
+                                not tbl
+                            ):  # fallback: add table without structure, if it isn't present
+                                tbl = Table(
+                                    label=cluster.label,
+                                    id=cluster.id,
+                                    text="",
+                                    otsl_seq=[],
+                                    table_cells=[],
+                                    cluster=cluster,
+                                    page_no=page.page_no,
+                                )
-                        elements.append(tbl)
-                        body.append(tbl)
-                    elif cluster.label == LayoutModel.FIGURE_LABEL:
-                        fig = None
-                        if page.predictions.figures_classification:
-                            fig = (
-                                page.predictions.figures_classification.figure_map.get(
+                            elements.append(tbl)
+                            body.append(tbl)
+                        elif cluster.label == LayoutModel.FIGURE_LABEL:
+                            fig = None
+                            if page.predictions.figures_classification:
+                                fig = page.predictions.figures_classification.figure_map.get(
                                     cluster.id, None
                                 )
-                            )
-                        if (
-                            not fig
-                        ):  # fallback: add figure without classification, if it isn't present
-                            fig = FigureElement(
-                                label=cluster.label,
-                                id=cluster.id,
-                                text="",
-                                data=None,
-                                cluster=cluster,
-                                page_no=page.page_no,
-                            )
-                        elements.append(fig)
-                        body.append(fig)
-                    elif cluster.label == LayoutModel.FORMULA_LABEL:
-                        equation = None
-                        if page.predictions.equations_prediction:
-                            equation = (
-                                page.predictions.equations_prediction.equation_map.get(
+                            if (
+                                not fig
+                            ):  # fallback: add figure without classification, if it isn't present
+                                fig = FigureElement(
+                                    label=cluster.label,
+                                    id=cluster.id,
+                                    text="",
+                                    data=None,
+                                    cluster=cluster,
+                                    page_no=page.page_no,
+                                )
+                            elements.append(fig)
+                            body.append(fig)
+                        elif cluster.label == LayoutModel.FORMULA_LABEL:
+                            equation = None
+                            if page.predictions.equations_prediction:
+                                equation = page.predictions.equations_prediction.equation_map.get(
                                     cluster.id, None
                                 )
-                            )
-                        if (
-                            not equation
-                        ):  # fallback: add empty formula, if it isn't present
-                            text = self.sanitize_text(
-                                [
-                                    cell.text.replace("\x02", "-").strip()
-                                    for cell in cluster.cells
-                                    if len(cell.text.strip()) > 0
-                                ]
-                            )
-                            equation = TextElement(
-                                label=cluster.label,
-                                id=cluster.id,
-                                cluster=cluster,
-                                page_no=page.page_no,
-                                text=text,
-                            )
-                        elements.append(equation)
-                        body.append(equation)
+                            if (
+                                not equation
+                            ):  # fallback: add empty formula, if it isn't present
+                                text = self.sanitize_text(
+                                    [
+                                        cell.text.replace("\x02", "-").strip()
+                                        for cell in cluster.cells
+                                        if len(cell.text.strip()) > 0
+                                    ]
+                                )
+                                equation = TextElement(
+                                    label=cluster.label,
+                                    id=cluster.id,
+                                    cluster=cluster,
+                                    page_no=page.page_no,
+                                    text=text,
+                                )
+                            elements.append(equation)
+                            body.append(equation)
-                page.assembled = AssembledUnit(
-                    elements=elements, headers=headers, body=body
-                )
+                    page.assembled = AssembledUnit(
+                        elements=elements, headers=headers, body=body
+                    )
-                # Remove page images (can be disabled)
-                if not self.options.keep_images:
-                    page._image_cache = {}
+                    # Remove page images (can be disabled)
+                    if not self.options.keep_images:
+                        page._image_cache = {}
-                # Unload backend
-                page._backend.unload()
+                    # Unload backend
+                    page._backend.unload()
                 yield page

docling/models/page_preprocessing_model.py CHANGED Viewed

@@ -1,10 +1,14 @@
+from pathlib import Path
 from typing import Iterable, Optional
 from PIL import ImageDraw
 from pydantic import BaseModel
 from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.utils.profiling import TimeRecorder
 class PagePreprocessingOptions(BaseModel):
@@ -15,14 +19,17 @@ class PagePreprocessingModel(BasePageModel):
     def __init__(self, options: PagePreprocessingOptions):
         self.options = options
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
             else:
-                page = self._populate_page_images(page)
-                page = self._parse_page_cells(page)
+                with TimeRecorder(conv_res, "page_parse"):
+                    page = self._populate_page_images(page)
+                    page = self._parse_page_cells(conv_res, page)
                 yield page
     # Generate the page image and store it in the page object
@@ -43,19 +50,30 @@ class PagePreprocessingModel(BasePageModel):
         return page
     # Extract and populate the page cells and store it in the page object
-    def _parse_page_cells(self, page: Page) -> Page:
+    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
         assert page._backend is not None
         page.cells = list(page._backend.get_text_cells())
         # DEBUG code:
-        def draw_text_boxes(image, cells):
+        def draw_text_boxes(image, cells, show: bool = False):
             draw = ImageDraw.Draw(image)
             for c in cells:
                 x0, y0, x1, y1 = c.bbox.as_tuple()
                 draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-            image.show()
+            if show:
+                image.show()
+            else:
+                out_path: Path = (
+                    Path(settings.debug.debug_output_path)
+                    / f"debug_{conv_res.input.file.stem}"
+                )
+                out_path.mkdir(parents=True, exist_ok=True)
+                out_file = out_path / f"cells_page_{page.page_no:05}.png"
+                image.save(str(out_file), format="png")
-        # draw_text_boxes(page.get_image(scale=1.0), cells)
+        if settings.debug.visualize_cells:
+            draw_text_boxes(page.get_image(scale=1.0), page.cells)
         return page

docling/models/table_structure_model.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import copy
 from pathlib import Path
-from typing import Iterable, List
+from typing import Iterable
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -8,8 +8,11 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
 from PIL import ImageDraw
 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
+from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.utils.profiling import TimeRecorder
 class TableStructureModel(BasePageModel):
@@ -35,7 +38,13 @@ class TableStructureModel(BasePageModel):
             self.tf_predictor = TFPredictor(self.tm_config)
             self.scale = 2.0  # Scale up table input images to 144 dpi
-    def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
+    def draw_table_and_cells(
+        self,
+        conv_res: ConversionResult,
+        page: Page,
+        tbl_list: Iterable[Table],
+        show: bool = False,
+    ):
         assert page._backend is not None
         image = (
@@ -61,9 +70,21 @@ class TableStructureModel(BasePageModel):
                         fill="black",
                     )
-        image.show()
+        if show:
+            image.show()
+        else:
+            out_path: Path = (
+                Path(settings.debug.debug_output_path)
+                / f"debug_{conv_res.input.file.stem}"
+            )
+            out_path.mkdir(parents=True, exist_ok=True)
+            out_file = out_path / f"table_struct_page_{page.page_no:05}.png"
+            image.save(str(out_file), format="png")
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
@@ -74,98 +95,112 @@ class TableStructureModel(BasePageModel):
             if not page._backend.is_valid():
                 yield page
             else:
-                assert page.predictions.layout is not None
-                assert page.size is not None
-                page.predictions.tablestructure = TableStructurePrediction()  # dummy
-                in_tables = [
-                    (
-                        cluster,
-                        [
-                            round(cluster.bbox.l) * self.scale,
-                            round(cluster.bbox.t) * self.scale,
-                            round(cluster.bbox.r) * self.scale,
-                            round(cluster.bbox.b) * self.scale,
-                        ],
+                with TimeRecorder(conv_res, "table_structure"):
+                    assert page.predictions.layout is not None
+                    assert page.size is not None
+                    page.predictions.tablestructure = (
+                        TableStructurePrediction()
+                    )  # dummy
+                    in_tables = [
+                        (
+                            cluster,
+                            [
+                                round(cluster.bbox.l) * self.scale,
+                                round(cluster.bbox.t) * self.scale,
+                                round(cluster.bbox.r) * self.scale,
+                                round(cluster.bbox.b) * self.scale,
+                            ],
+                        )
+                        for cluster in page.predictions.layout.clusters
+                        if cluster.label == DocItemLabel.TABLE
+                    ]
+                    if not len(in_tables):
+                        yield page
+                        continue
+                    tokens = []
+                    for c in page.cells:
+                        for cluster, _ in in_tables:
+                            if c.bbox.area() > 0:
+                                if (
+                                    c.bbox.intersection_area_with(cluster.bbox)
+                                    / c.bbox.area()
+                                    > 0.2
+                                ):
+                                    # Only allow non empty stings (spaces) into the cells of a table
+                                    if len(c.text.strip()) > 0:
+                                        new_cell = copy.deepcopy(c)
+                                        new_cell.bbox = new_cell.bbox.scaled(
+                                            scale=self.scale
+                                        )
+                                        tokens.append(new_cell.model_dump())
+                    page_input = {
+                        "tokens": tokens,
+                        "width": page.size.width * self.scale,
+                        "height": page.size.height * self.scale,
+                    }
+                    page_input["image"] = numpy.asarray(
+                        page.get_image(scale=self.scale)
                     )
-                    for cluster in page.predictions.layout.clusters
-                    if cluster.label == DocItemLabel.TABLE
-                ]
-                if not len(in_tables):
-                    yield page
-                    continue
-                tokens = []
-                for c in page.cells:
-                    for cluster, _ in in_tables:
-                        if c.bbox.area() > 0:
-                            if (
-                                c.bbox.intersection_area_with(cluster.bbox)
-                                / c.bbox.area()
-                                > 0.2
-                            ):
-                                # Only allow non empty stings (spaces) into the cells of a table
-                                if len(c.text.strip()) > 0:
-                                    new_cell = copy.deepcopy(c)
-                                    new_cell.bbox = new_cell.bbox.scaled(
-                                        scale=self.scale
-                                    )
-                                    tokens.append(new_cell.model_dump())
-                page_input = {
-                    "tokens": tokens,
-                    "width": page.size.width * self.scale,
-                    "height": page.size.height * self.scale,
-                }
-                page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
+                    table_clusters, table_bboxes = zip(*in_tables)
-                table_clusters, table_bboxes = zip(*in_tables)
-                if len(table_bboxes):
-                    tf_output = self.tf_predictor.multi_table_predict(
-                        page_input, table_bboxes, do_matching=self.do_cell_matching
-                    )
-                    for table_cluster, table_out in zip(table_clusters, tf_output):
-                        table_cells = []
-                        for element in table_out["tf_responses"]:
-                            if not self.do_cell_matching:
-                                the_bbox = BoundingBox.model_validate(
-                                    element["bbox"]
-                                ).scaled(1 / self.scale)
-                                text_piece = page._backend.get_text_in_rect(the_bbox)
-                                element["bbox"]["token"] = text_piece
-                            tc = TableCell.model_validate(element)
-                            if self.do_cell_matching and tc.bbox is not None:
-                                tc.bbox = tc.bbox.scaled(1 / self.scale)
-                            table_cells.append(tc)
-                        # Retrieving cols/rows, after post processing:
-                        num_rows = table_out["predict_details"]["num_rows"]
-                        num_cols = table_out["predict_details"]["num_cols"]
-                        otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
-                        tbl = Table(
-                            otsl_seq=otsl_seq,
-                            table_cells=table_cells,
-                            num_rows=num_rows,
-                            num_cols=num_cols,
-                            id=table_cluster.id,
-                            page_no=page.page_no,
-                            cluster=table_cluster,
-                            label=DocItemLabel.TABLE,
+                    if len(table_bboxes):
+                        tf_output = self.tf_predictor.multi_table_predict(
+                            page_input, table_bboxes, do_matching=self.do_cell_matching
                         )
-                        page.predictions.tablestructure.table_map[table_cluster.id] = (
-                            tbl
-                        )
+                        for table_cluster, table_out in zip(table_clusters, tf_output):
+                            table_cells = []
+                            for element in table_out["tf_responses"]:
+                                if not self.do_cell_matching:
+                                    the_bbox = BoundingBox.model_validate(
+                                        element["bbox"]
+                                    ).scaled(1 / self.scale)
+                                    text_piece = page._backend.get_text_in_rect(
+                                        the_bbox
+                                    )
+                                    element["bbox"]["token"] = text_piece
+                                tc = TableCell.model_validate(element)
+                                if self.do_cell_matching and tc.bbox is not None:
+                                    tc.bbox = tc.bbox.scaled(1 / self.scale)
+                                table_cells.append(tc)
+                            # Retrieving cols/rows, after post processing:
+                            num_rows = table_out["predict_details"]["num_rows"]
+                            num_cols = table_out["predict_details"]["num_cols"]
+                            otsl_seq = table_out["predict_details"]["prediction"][
+                                "rs_seq"
+                            ]
+                            tbl = Table(
+                                otsl_seq=otsl_seq,
+                                table_cells=table_cells,
+                                num_rows=num_rows,
+                                num_cols=num_cols,
+                                id=table_cluster.id,
+                                page_no=page.page_no,
+                                cluster=table_cluster,
+                                label=DocItemLabel.TABLE,
+                            )
+                            page.predictions.tablestructure.table_map[
+                                table_cluster.id
+                            ] = tbl
                     # For debugging purposes:
-                    # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
+                    if settings.debug.visualize_tables:
+                        self.draw_table_and_cells(
+                            conv_res,
+                            page,
+                            page.predictions.tablestructure.table_map.values(),
+                        )
                 yield page

docling 2.2.1__py3-none-any.whl → 2.3.1__py3-none-any.whl

docling 2.2.1__py3-none-any.whl → 2.3.1__py3-none-any.whl