PyPI - docling - Versions diffs - 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl - Mend

docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

docling/backend/abstract_backend.py +33 -37
docling/backend/asciidoc_backend.py +431 -0
docling/backend/docling_parse_backend.py +20 -16
docling/backend/docling_parse_v2_backend.py +248 -0
docling/backend/html_backend.py +429 -0
docling/backend/md_backend.py +346 -0
docling/backend/mspowerpoint_backend.py +398 -0
docling/backend/msword_backend.py +496 -0
docling/backend/pdf_backend.py +78 -0
docling/backend/pypdfium2_backend.py +16 -11
docling/cli/main.py +96 -65
docling/datamodel/base_models.py +79 -193
docling/datamodel/document.py +405 -320
docling/datamodel/pipeline_options.py +19 -3
docling/datamodel/settings.py +16 -1
docling/document_converter.py +240 -251
docling/models/base_model.py +28 -0
docling/models/base_ocr_model.py +40 -10
docling/models/ds_glm_model.py +244 -30
docling/models/easyocr_model.py +57 -42
docling/models/layout_model.py +158 -116
docling/models/page_assemble_model.py +127 -101
docling/models/page_preprocessing_model.py +79 -0
docling/models/table_structure_model.py +162 -116
docling/models/tesseract_ocr_cli_model.py +76 -59
docling/models/tesseract_ocr_model.py +90 -58
docling/pipeline/base_pipeline.py +189 -0
docling/pipeline/simple_pipeline.py +56 -0
docling/pipeline/standard_pdf_pipeline.py +201 -0
docling/utils/export.py +4 -3
docling/utils/layout_utils.py +17 -11
docling/utils/profiling.py +62 -0
docling-2.4.1.dist-info/METADATA +154 -0
docling-2.4.1.dist-info/RECORD +45 -0
docling/pipeline/base_model_pipeline.py +0 -18
docling/pipeline/standard_model_pipeline.py +0 -66
docling-1.19.1.dist-info/METADATA +0 -380
docling-1.19.1.dist-info/RECORD +0 -34
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0

docling/models/page_preprocessing_model.py ADDED Viewed

@@ -0,0 +1,79 @@
+from pathlib import Path
+from typing import Iterable, Optional
+from PIL import ImageDraw
+from pydantic import BaseModel
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel
+from docling.utils.profiling import TimeRecorder
+class PagePreprocessingOptions(BaseModel):
+    images_scale: Optional[float]
+class PagePreprocessingModel(BasePageModel):
+    def __init__(self, options: PagePreprocessingOptions):
+        self.options = options
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "page_parse"):
+                    page = self._populate_page_images(page)
+                    page = self._parse_page_cells(conv_res, page)
+                yield page
+    # Generate the page image and store it in the page object
+    def _populate_page_images(self, page: Page) -> Page:
+        # default scale
+        page.get_image(
+            scale=1.0
+        )  # puts the page image on the image cache at default scale
+        images_scale = self.options.images_scale
+        # user requested scales
+        if images_scale is not None:
+            page._default_image_scale = images_scale
+            page.get_image(
+                scale=images_scale
+            )  # this will trigger storing the image in the internal cache
+        return page
+    # Extract and populate the page cells and store it in the page object
+    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
+        assert page._backend is not None
+        page.cells = list(page._backend.get_text_cells())
+        # DEBUG code:
+        def draw_text_boxes(image, cells, show: bool = False):
+            draw = ImageDraw.Draw(image)
+            for c in cells:
+                x0, y0, x1, y1 = c.bbox.as_tuple()
+                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
+            if show:
+                image.show()
+            else:
+                out_path: Path = (
+                    Path(settings.debug.debug_output_path)
+                    / f"debug_{conv_res.input.file.stem}"
+                )
+                out_path.mkdir(parents=True, exist_ok=True)
+                out_file = out_path / f"cells_page_{page.page_no:05}.png"
+                image.save(str(out_file), format="png")
+        if settings.debug.visualize_cells:
+            draw_text_boxes(page.get_image(scale=1.0), page.cells)
+        return page

docling/models/table_structure_model.py CHANGED Viewed

@@ -1,31 +1,30 @@
 import copy
 from pathlib import Path
-from typing import Iterable, List
+from typing import Iterable
 import numpy
+from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
-from docling.datamodel.base_models import (
-    BoundingBox,
-    Page,
-    TableCell,
-    TableElement,
-    TableStructurePrediction,
-)
-from docling.datamodel.pipeline_options import TableFormerMode
+from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel
+from docling.utils.profiling import TimeRecorder
-class TableStructureModel:
-    def __init__(self, config):
-        self.config = config
-        self.do_cell_matching = config["do_cell_matching"]
-        self.mode = config["mode"]
+class TableStructureModel(BasePageModel):
+    def __init__(
+        self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
+    ):
+        self.options = options
+        self.do_cell_matching = self.options.do_cell_matching
+        self.mode = self.options.mode
-        self.enabled = config["enabled"]
+        self.enabled = enabled
         if self.enabled:
-            artifacts_path: Path = config["artifacts_path"]
             if self.mode == TableFormerMode.ACCURATE:
                 artifacts_path = artifacts_path / "fat"
@@ -39,7 +38,15 @@ class TableStructureModel:
             self.tf_predictor = TFPredictor(self.tm_config)
             self.scale = 2.0  # Scale up table input images to 144 dpi
-    def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
+    def draw_table_and_cells(
+        self,
+        conv_res: ConversionResult,
+        page: Page,
+        tbl_list: Iterable[Table],
+        show: bool = False,
+    ):
+        assert page._backend is not None
         image = (
             page._backend.get_page_image()
         )  # make new image to avoid drawing on the saved ones
@@ -50,111 +57,150 @@ class TableStructureModel:
             draw.rectangle([(x0, y0), (x1, y1)], outline="red")
             for tc in table_element.table_cells:
-                x0, y0, x1, y1 = tc.bbox.as_tuple()
-                if tc.column_header:
-                    width = 3
-                else:
-                    width = 1
-                draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
-                draw.text(
-                    (x0 + 3, y0 + 3),
-                    text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
-                    fill="black",
-                )
-        image.show()
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+                if tc.bbox is not None:
+                    x0, y0, x1, y1 = tc.bbox.as_tuple()
+                    if tc.column_header:
+                        width = 3
+                    else:
+                        width = 1
+                    draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
+                    draw.text(
+                        (x0 + 3, y0 + 3),
+                        text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
+                        fill="black",
+                    )
+        if show:
+            image.show()
+        else:
+            out_path: Path = (
+                Path(settings.debug.debug_output_path)
+                / f"debug_{conv_res.input.file.stem}"
+            )
+            out_path.mkdir(parents=True, exist_ok=True)
+            out_file = out_path / f"table_struct_page_{page.page_no:05}.png"
+            image.save(str(out_file), format="png")
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
             return
         for page in page_batch:
-            page.predictions.tablestructure = TableStructurePrediction()  # dummy
-            in_tables = [
-                (
-                    cluster,
-                    [
-                        round(cluster.bbox.l) * self.scale,
-                        round(cluster.bbox.t) * self.scale,
-                        round(cluster.bbox.r) * self.scale,
-                        round(cluster.bbox.b) * self.scale,
-                    ],
-                )
-                for cluster in page.predictions.layout.clusters
-                if cluster.label == "Table"
-            ]
-            if not len(in_tables):
+            assert page._backend is not None
+            if not page._backend.is_valid():
                 yield page
-                continue
-            tokens = []
-            for c in page.cells:
-                for cluster, _ in in_tables:
-                    if c.bbox.area() > 0:
-                        if (
-                            c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
-                            > 0.2
-                        ):
-                            # Only allow non empty stings (spaces) into the cells of a table
-                            if len(c.text.strip()) > 0:
-                                new_cell = copy.deepcopy(c)
-                                new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)
-                                tokens.append(new_cell.model_dump())
-            page_input = {
-                "tokens": tokens,
-                "width": page.size.width * self.scale,
-                "height": page.size.height * self.scale,
-            }
-            page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
-            table_clusters, table_bboxes = zip(*in_tables)
-            if len(table_bboxes):
-                tf_output = self.tf_predictor.multi_table_predict(
-                    page_input, table_bboxes, do_matching=self.do_cell_matching
-                )
-                for table_cluster, table_out in zip(table_clusters, tf_output):
-                    table_cells = []
-                    for element in table_out["tf_responses"]:
-                        if not self.do_cell_matching:
-                            the_bbox = BoundingBox.model_validate(
-                                element["bbox"]
-                            ).scaled(1 / self.scale)
-                            text_piece = page._backend.get_text_in_rect(the_bbox)
-                            element["bbox"]["token"] = text_piece
-                        tc = TableCell.model_validate(element)
-                        if self.do_cell_matching:
-                            tc.bbox = tc.bbox.scaled(1 / self.scale)
-                        table_cells.append(tc)
-                    # Retrieving cols/rows, after post processing:
-                    num_rows = table_out["predict_details"]["num_rows"]
-                    num_cols = table_out["predict_details"]["num_cols"]
-                    otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
-                    tbl = TableElement(
-                        otsl_seq=otsl_seq,
-                        table_cells=table_cells,
-                        num_rows=num_rows,
-                        num_cols=num_cols,
-                        id=table_cluster.id,
-                        page_no=page.page_no,
-                        cluster=table_cluster,
-                        label="Table",
+            else:
+                with TimeRecorder(conv_res, "table_structure"):
+                    assert page.predictions.layout is not None
+                    assert page.size is not None
+                    page.predictions.tablestructure = (
+                        TableStructurePrediction()
+                    )  # dummy
+                    in_tables = [
+                        (
+                            cluster,
+                            [
+                                round(cluster.bbox.l) * self.scale,
+                                round(cluster.bbox.t) * self.scale,
+                                round(cluster.bbox.r) * self.scale,
+                                round(cluster.bbox.b) * self.scale,
+                            ],
+                        )
+                        for cluster in page.predictions.layout.clusters
+                        if cluster.label == DocItemLabel.TABLE
+                    ]
+                    if not len(in_tables):
+                        yield page
+                        continue
+                    tokens = []
+                    for c in page.cells:
+                        for cluster, _ in in_tables:
+                            if c.bbox.area() > 0:
+                                if (
+                                    c.bbox.intersection_area_with(cluster.bbox)
+                                    / c.bbox.area()
+                                    > 0.2
+                                ):
+                                    # Only allow non empty stings (spaces) into the cells of a table
+                                    if len(c.text.strip()) > 0:
+                                        new_cell = copy.deepcopy(c)
+                                        new_cell.bbox = new_cell.bbox.scaled(
+                                            scale=self.scale
+                                        )
+                                        tokens.append(new_cell.model_dump())
+                    page_input = {
+                        "tokens": tokens,
+                        "width": page.size.width * self.scale,
+                        "height": page.size.height * self.scale,
+                    }
+                    page_input["image"] = numpy.asarray(
+                        page.get_image(scale=self.scale)
                     )
-                    page.predictions.tablestructure.table_map[table_cluster.id] = tbl
+                    table_clusters, table_bboxes = zip(*in_tables)
+                    if len(table_bboxes):
+                        tf_output = self.tf_predictor.multi_table_predict(
+                            page_input, table_bboxes, do_matching=self.do_cell_matching
+                        )
+                        for table_cluster, table_out in zip(table_clusters, tf_output):
+                            table_cells = []
+                            for element in table_out["tf_responses"]:
+                                if not self.do_cell_matching:
+                                    the_bbox = BoundingBox.model_validate(
+                                        element["bbox"]
+                                    ).scaled(1 / self.scale)
+                                    text_piece = page._backend.get_text_in_rect(
+                                        the_bbox
+                                    )
+                                    element["bbox"]["token"] = text_piece
+                                tc = TableCell.model_validate(element)
+                                if self.do_cell_matching and tc.bbox is not None:
+                                    tc.bbox = tc.bbox.scaled(1 / self.scale)
+                                table_cells.append(tc)
+                            # Retrieving cols/rows, after post processing:
+                            num_rows = table_out["predict_details"]["num_rows"]
+                            num_cols = table_out["predict_details"]["num_cols"]
+                            otsl_seq = table_out["predict_details"]["prediction"][
+                                "rs_seq"
+                            ]
+                            tbl = Table(
+                                otsl_seq=otsl_seq,
+                                table_cells=table_cells,
+                                num_rows=num_rows,
+                                num_cols=num_cols,
+                                id=table_cluster.id,
+                                page_no=page.page_no,
+                                cluster=table_cluster,
+                                label=DocItemLabel.TABLE,
+                            )
+                            page.predictions.tablestructure.table_map[
+                                table_cluster.id
+                            ] = tbl
+                    # For debugging purposes:
+                    if settings.debug.visualize_tables:
+                        self.draw_table_and_cells(
+                            conv_res,
+                            page,
+                            page.predictions.tablestructure.table_map.values(),
+                        )
-                # For debugging purposes:
-                # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
-            yield page
+                yield page

docling/models/tesseract_ocr_cli_model.py CHANGED Viewed

@@ -2,13 +2,17 @@ import io
 import logging
 import tempfile
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, Tuple
+from typing import Iterable, Optional, Tuple
 import pandas as pd
+from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -21,8 +25,8 @@ class TesseractOcrCliModel(BaseOcrModel):
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
-        self._name = None
-        self._version = None
+        self._name: Optional[str] = None
+        self._version: Optional[str] = None
         if self.enabled:
             try:
@@ -39,7 +43,7 @@ class TesseractOcrCliModel(BaseOcrModel):
     def _get_name_and_version(self) -> Tuple[str, str]:
         if self._name != None and self._version != None:
-            return self._name, self._version
+            return self._name, self._version  # type: ignore
         cmd = [self.options.tesseract_cmd, "--version"]
@@ -101,67 +105,80 @@ class TesseractOcrCliModel(BaseOcrModel):
         return df_filtered
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
             return
         for page in page_batch:
-            ocr_rects = self.get_ocr_rects(page)
-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
-                with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
-                    fname = image_file.name
-                    high_res_image.save(fname)
-                    df = self._run_tesseract(fname)
-                # _log.info(df)
-                # Print relevant columns (bounding box and text)
-                for ix, row in df.iterrows():
-                    text = row["text"]
-                    conf = row["conf"]
-                    l = float(row["left"])
-                    b = float(row["top"])
-                    w = float(row["width"])
-                    h = float(row["height"])
-                    t = b + h
-                    r = l + w
-                    cell = OcrCell(
-                        id=ix,
-                        text=text,
-                        confidence=conf / 100.0,
-                        bbox=BoundingBox.from_tuple(
-                            coord=(
-                                (l / self.scale) + ocr_rect.l,
-                                (b / self.scale) + ocr_rect.t,
-                                (r / self.scale) + ocr_rect.l,
-                                (t / self.scale) + ocr_rect.t,
-                            ),
-                            origin=CoordOrigin.TOPLEFT,
-                        ),
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "ocr"):
+                    ocr_rects = self.get_ocr_rects(page)
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
+                        with tempfile.NamedTemporaryFile(
+                            suffix=".png", mode="w"
+                        ) as image_file:
+                            fname = image_file.name
+                            high_res_image.save(fname)
+                            df = self._run_tesseract(fname)
+                        # _log.info(df)
+                        # Print relevant columns (bounding box and text)
+                        for ix, row in df.iterrows():
+                            text = row["text"]
+                            conf = row["conf"]
+                            l = float(row["left"])
+                            b = float(row["top"])
+                            w = float(row["width"])
+                            h = float(row["height"])
+                            t = b + h
+                            r = l + w
+                            cell = OcrCell(
+                                id=ix,
+                                text=text,
+                                confidence=conf / 100.0,
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(
+                                        (l / self.scale) + ocr_rect.l,
+                                        (b / self.scale) + ocr_rect.t,
+                                        (r / self.scale) + ocr_rect.l,
+                                        (t / self.scale) + ocr_rect.t,
+                                    ),
+                                    origin=CoordOrigin.TOPLEFT,
+                                ),
+                            )
+                            all_ocr_cells.append(cell)
+                    ## Remove OCR cells which overlap with programmatic cells.
+                    filtered_ocr_cells = self.filter_ocr_cells(
+                        all_ocr_cells, page.cells
                     )
-                    all_ocr_cells.append(cell)
-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
-            page.cells.extend(filtered_ocr_cells)
+                    page.cells.extend(filtered_ocr_cells)
-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                # DEBUG code:
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
-            yield page
+                yield page

docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl