docling 2.1.0__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +1 -0
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +4 -4
- docling/backend/docling_parse_v2_backend.py +12 -4
- docling/backend/html_backend.py +61 -57
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +62 -39
- docling/backend/msword_backend.py +12 -25
- docling/backend/pypdfium2_backend.py +1 -1
- docling/cli/main.py +38 -8
- docling/datamodel/base_models.py +16 -10
- docling/datamodel/document.py +36 -6
- docling/datamodel/pipeline_options.py +3 -3
- docling/datamodel/settings.py +15 -1
- docling/document_converter.py +38 -12
- docling/models/base_model.py +4 -1
- docling/models/base_ocr_model.py +21 -4
- docling/models/ds_glm_model.py +27 -11
- docling/models/easyocr_model.py +49 -39
- docling/models/layout_model.py +87 -61
- docling/models/page_assemble_model.py +102 -100
- docling/models/page_preprocessing_model.py +25 -7
- docling/models/table_structure_model.py +125 -90
- docling/models/tesseract_ocr_cli_model.py +62 -52
- docling/models/tesseract_ocr_model.py +76 -52
- docling/pipeline/base_pipeline.py +68 -69
- docling/pipeline/simple_pipeline.py +8 -11
- docling/pipeline/standard_pdf_pipeline.py +59 -56
- docling/utils/profiling.py +62 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/METADATA +27 -22
- docling-2.4.1.dist-info/RECORD +45 -0
- docling-2.1.0.dist-info/RECORD +0 -42
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
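The hunks below touch three model files: `docling/models/table_structure_model.py`, `docling/models/tesseract_ocr_cli_model.py`, and `docling/models/tesseract_ocr_model.py`. The common pattern is that each model's `__call__` now receives the `ConversionResult` alongside the page batch, wraps its work in a `TimeRecorder(conv_res, ...)` scope from the new `docling/utils/profiling.py`, and gates debug output behind `settings.debug.*` flags. A minimal usage sketch, assuming the `settings.debug` fields seen in the hunks (`visualize_tables`, `visualize_ocr`, `debug_output_path`) are plain assignable attributes and that `DocumentConverter.convert()` accepts a file path:

```python
# Hedged sketch: turn on the debug visualizations referenced in the hunks below.
# The flag names come from the diff; treating them as assignable attributes and
# the input file name are assumptions, not verified against docling 2.4.1.
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter

settings.debug.visualize_tables = True          # saves table_struct_page_*.png
settings.debug.visualize_ocr = True             # saves OCR overlay images
settings.debug.debug_output_path = "debug_out"  # root for debug_<doc-stem>/ dirs

conv_res = DocumentConverter().convert("sample.pdf")  # hypothetical input file
```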
docling/models/table_structure_model.py

```diff
@@ -1,6 +1,6 @@
 import copy
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable
 
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -8,8 +8,11 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
 
 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
+from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.utils.profiling import TimeRecorder
 
 
 class TableStructureModel(BasePageModel):
@@ -35,7 +38,13 @@ class TableStructureModel(BasePageModel):
         self.tf_predictor = TFPredictor(self.tm_config)
         self.scale = 2.0  # Scale up table input images to 144 dpi
 
-    def draw_table_and_cells(
+    def draw_table_and_cells(
+        self,
+        conv_res: ConversionResult,
+        page: Page,
+        tbl_list: Iterable[Table],
+        show: bool = False,
+    ):
         assert page._backend is not None
 
         image = (
@@ -61,9 +70,21 @@ class TableStructureModel(BasePageModel):
                         fill="black",
                     )
 
-
+        if show:
+            image.show()
+        else:
+            out_path: Path = (
+                Path(settings.debug.debug_output_path)
+                / f"debug_{conv_res.input.file.stem}"
+            )
+            out_path.mkdir(parents=True, exist_ok=True)
+
+            out_file = out_path / f"table_struct_page_{page.page_no:05}.png"
+            image.save(str(out_file), format="png")
 
-    def __call__(
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
 
         if not self.enabled:
             yield from page_batch
@@ -74,98 +95,112 @@ class TableStructureModel(BasePageModel):
             if not page._backend.is_valid():
                 yield page
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                with TimeRecorder(conv_res, "table_structure"):
+
+                    assert page.predictions.layout is not None
+                    assert page.size is not None
+
+                    page.predictions.tablestructure = (
+                        TableStructurePrediction()
+                    )  # dummy
+
+                    in_tables = [
+                        (
+                            cluster,
+                            [
+                                round(cluster.bbox.l) * self.scale,
+                                round(cluster.bbox.t) * self.scale,
+                                round(cluster.bbox.r) * self.scale,
+                                round(cluster.bbox.b) * self.scale,
+                            ],
+                        )
+                        for cluster in page.predictions.layout.clusters
+                        if cluster.label == DocItemLabel.TABLE
+                    ]
+                    if not len(in_tables):
+                        yield page
+                        continue
+
+                    tokens = []
+                    for c in page.cells:
+                        for cluster, _ in in_tables:
+                            if c.bbox.area() > 0:
+                                if (
+                                    c.bbox.intersection_area_with(cluster.bbox)
+                                    / c.bbox.area()
+                                    > 0.2
+                                ):
+                                    # Only allow non empty stings (spaces) into the cells of a table
+                                    if len(c.text.strip()) > 0:
+                                        new_cell = copy.deepcopy(c)
+                                        new_cell.bbox = new_cell.bbox.scaled(
+                                            scale=self.scale
+                                        )
+
+                                        tokens.append(new_cell.model_dump())
+
+                    page_input = {
+                        "tokens": tokens,
+                        "width": page.size.width * self.scale,
+                        "height": page.size.height * self.scale,
+                    }
+                    page_input["image"] = numpy.asarray(
+                        page.get_image(scale=self.scale)
                     )
-                    for cluster in page.predictions.layout.clusters
-                    if cluster.label == DocItemLabel.TABLE
-                ]
-                if not len(in_tables):
-                    yield page
-                    continue
-
-                tokens = []
-                for c in page.cells:
-                    for cluster, _ in in_tables:
-                        if c.bbox.area() > 0:
-                            if (
-                                c.bbox.intersection_area_with(cluster.bbox)
-                                / c.bbox.area()
-                                > 0.2
-                            ):
-                                # Only allow non empty stings (spaces) into the cells of a table
-                                if len(c.text.strip()) > 0:
-                                    new_cell = copy.deepcopy(c)
-                                    new_cell.bbox = new_cell.bbox.scaled(
-                                        scale=self.scale
-                                    )
-
-                                    tokens.append(new_cell.model_dump())
 
-
-                    "tokens": tokens,
-                    "width": page.size.width * self.scale,
-                    "height": page.size.height * self.scale,
-                }
-                page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
+                    table_clusters, table_bboxes = zip(*in_tables)
 
-
-
-
-                tf_output = self.tf_predictor.multi_table_predict(
-                    page_input, table_bboxes, do_matching=self.do_cell_matching
-                )
-
-                for table_cluster, table_out in zip(table_clusters, tf_output):
-                    table_cells = []
-                    for element in table_out["tf_responses"]:
-
-                        if not self.do_cell_matching:
-                            the_bbox = BoundingBox.model_validate(
-                                element["bbox"]
-                            ).scaled(1 / self.scale)
-                            text_piece = page._backend.get_text_in_rect(the_bbox)
-                            element["bbox"]["token"] = text_piece
-
-                        tc = TableCell.model_validate(element)
-                        if self.do_cell_matching and tc.bbox is not None:
-                            tc.bbox = tc.bbox.scaled(1 / self.scale)
-                        table_cells.append(tc)
-
-                    # Retrieving cols/rows, after post processing:
-                    num_rows = table_out["predict_details"]["num_rows"]
-                    num_cols = table_out["predict_details"]["num_cols"]
-                    otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
-
-                    tbl = Table(
-                        otsl_seq=otsl_seq,
-                        table_cells=table_cells,
-                        num_rows=num_rows,
-                        num_cols=num_cols,
-                        id=table_cluster.id,
-                        page_no=page.page_no,
-                        cluster=table_cluster,
-                        label=DocItemLabel.TABLE,
+                    if len(table_bboxes):
+                        tf_output = self.tf_predictor.multi_table_predict(
+                            page_input, table_bboxes, do_matching=self.do_cell_matching
                         )
 
-
-
-
+                        for table_cluster, table_out in zip(table_clusters, tf_output):
+                            table_cells = []
+                            for element in table_out["tf_responses"]:
+
+                                if not self.do_cell_matching:
+                                    the_bbox = BoundingBox.model_validate(
+                                        element["bbox"]
+                                    ).scaled(1 / self.scale)
+                                    text_piece = page._backend.get_text_in_rect(
+                                        the_bbox
+                                    )
+                                    element["bbox"]["token"] = text_piece
+
+                                tc = TableCell.model_validate(element)
+                                if self.do_cell_matching and tc.bbox is not None:
+                                    tc.bbox = tc.bbox.scaled(1 / self.scale)
+                                table_cells.append(tc)
+
+                            # Retrieving cols/rows, after post processing:
+                            num_rows = table_out["predict_details"]["num_rows"]
+                            num_cols = table_out["predict_details"]["num_cols"]
+                            otsl_seq = table_out["predict_details"]["prediction"][
+                                "rs_seq"
+                            ]
+
+                            tbl = Table(
+                                otsl_seq=otsl_seq,
+                                table_cells=table_cells,
+                                num_rows=num_rows,
+                                num_cols=num_cols,
+                                id=table_cluster.id,
+                                page_no=page.page_no,
+                                cluster=table_cluster,
+                                label=DocItemLabel.TABLE,
+                            )
+
+                            page.predictions.tablestructure.table_map[
+                                table_cluster.id
+                            ] = tbl
 
                 # For debugging purposes:
-
+                if settings.debug.visualize_tables:
+                    self.draw_table_and_cells(
+                        conv_res,
+                        page,
+                        page.predictions.tablestructure.table_map.values(),
+                    )
 
                 yield page
```
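`TimeRecorder(conv_res, "table_structure")` comes from the new `docling/utils/profiling.py` (listed above with +62 lines), whose body is not shown in this diff. As a rough sketch of the pattern only — the class and field names below are assumptions, not docling's actual implementation:

```python
# Hedged sketch of a TimeRecorder-style scope; "timings" and the class name are
# made up for illustration and are NOT docling's profiling.py API.
import time
from collections import defaultdict


class SketchTimeRecorder:
    """Accumulate wall-clock time for a named scope onto a result object."""

    def __init__(self, conv_res, scope_name: str):
        self.conv_res = conv_res
        self.scope_name = scope_name

    def __enter__(self):
        self._start = time.monotonic()
        return self

    def __exit__(self, exc_type, exc, tb):
        elapsed = time.monotonic() - self._start
        if not hasattr(self.conv_res, "timings"):
            self.conv_res.timings = defaultdict(list)
        self.conv_res.timings[self.scope_name].append(elapsed)
        return False  # never swallow exceptions from the timed block
```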
docling/models/tesseract_ocr_cli_model.py

```diff
@@ -8,8 +8,11 @@ import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
 
 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
@@ -102,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
 
         return df_filtered
 
-    def __call__(
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
 
         if not self.enabled:
             yield from page_batch
@@ -113,62 +118,67 @@ class TesseractOcrCliModel(BaseOcrModel):
             if not page._backend.is_valid():
                 yield page
             else:
-
-
-                all_ocr_cells = []
-                for ocr_rect in ocr_rects:
-                    # Skip zero area boxes
-                    if ocr_rect.area() == 0:
-                        continue
-                    high_res_image = page._backend.get_page_image(
-                        scale=self.scale, cropbox=ocr_rect
-                    )
+                with TimeRecorder(conv_res, "ocr"):
 
-
-
-
-
-
-
-
-
-
-
-
-                    # Print relevant columns (bounding box and text)
-                    for ix, row in df.iterrows():
-                        text = row["text"]
-                        conf = row["conf"]
-
-                        l = float(row["left"])
-                        b = float(row["top"])
-                        w = float(row["width"])
-                        h = float(row["height"])
-
-                        t = b + h
-                        r = l + w
-
-                        cell = OcrCell(
-                            id=ix,
-                            text=text,
-                            confidence=conf / 100.0,
-                            bbox=BoundingBox.from_tuple(
-                                coord=(
-                                    (l / self.scale) + ocr_rect.l,
-                                    (b / self.scale) + ocr_rect.t,
-                                    (r / self.scale) + ocr_rect.l,
-                                    (t / self.scale) + ocr_rect.t,
-                                ),
-                                origin=CoordOrigin.TOPLEFT,
+                    ocr_rects = self.get_ocr_rects(page)
+
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
                         )
-                        all_ocr_cells.append(cell)
 
-
-
+                        with tempfile.NamedTemporaryFile(
+                            suffix=".png", mode="w"
+                        ) as image_file:
+                            fname = image_file.name
+                            high_res_image.save(fname)
+
+                            df = self._run_tesseract(fname)
+
+                        # _log.info(df)
+
+                        # Print relevant columns (bounding box and text)
+                        for ix, row in df.iterrows():
+                            text = row["text"]
+                            conf = row["conf"]
+
+                            l = float(row["left"])
+                            b = float(row["top"])
+                            w = float(row["width"])
+                            h = float(row["height"])
+
+                            t = b + h
+                            r = l + w
+
+                            cell = OcrCell(
+                                id=ix,
+                                text=text,
+                                confidence=conf / 100.0,
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(
+                                        (l / self.scale) + ocr_rect.l,
+                                        (b / self.scale) + ocr_rect.t,
+                                        (r / self.scale) + ocr_rect.l,
+                                        (t / self.scale) + ocr_rect.t,
+                                    ),
+                                    origin=CoordOrigin.TOPLEFT,
+                                ),
+                            )
+                            all_ocr_cells.append(cell)
+
+                    ## Remove OCR cells which overlap with programmatic cells.
+                    filtered_ocr_cells = self.filter_ocr_cells(
+                        all_ocr_cells, page.cells
+                    )
 
-
+                    page.cells.extend(filtered_ocr_cells)
 
                 # DEBUG code:
-
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
 
                 yield page
```
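`self._run_tesseract(fname)` is not part of this hunk; judging by the columns the loop reads (`left`, `top`, `width`, `height`, `conf`, `text`), it returns Tesseract's TSV output as a pandas DataFrame. A hedged stand-alone sketch of that kind of call — the exact flags docling passes are not visible in this diff:

```python
# Hedged sketch: run the tesseract CLI on a saved page image and parse its TSV
# output. This is NOT docling's _run_tesseract; flags and filtering are assumptions.
import io
import subprocess

import pandas as pd


def run_tesseract_tsv(image_path: str, lang: str = "eng") -> pd.DataFrame:
    proc = subprocess.run(
        ["tesseract", image_path, "stdout", "-l", lang, "tsv"],
        capture_output=True,
        text=True,
        check=True,
    )
    df = pd.read_csv(io.StringIO(proc.stdout), sep="\t", quoting=3)  # QUOTE_NONE
    # Word-level rows carry a real confidence; structural rows have conf == -1.
    return df[df["conf"] != -1]
```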
docling/models/tesseract_ocr_model.py

```diff
@@ -4,8 +4,11 @@ from typing import Iterable
 from docling_core.types.doc import BoundingBox, CoordOrigin
 
 from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
+from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
@@ -19,25 +22,37 @@ class TesseractOcrModel(BaseOcrModel):
         self.reader = None
 
         if self.enabled:
-
+            install_errmsg = (
                 "tesserocr is not correctly installed. "
                 "Please install it via `pip install tesserocr` to use this OCR engine. "
-                "Note that tesserocr might have to be manually compiled for working with"
+                "Note that tesserocr might have to be manually compiled for working with "
                 "your Tesseract installation. The Docling documentation provides examples for it. "
-                "Alternatively, Docling has support for other OCR engines. See the documentation
+                "Alternatively, Docling has support for other OCR engines. See the documentation: "
+                "https://ds4sd.github.io/docling/installation/"
             )
+            missing_langs_errmsg = (
+                "tesserocr is not correctly configured. No language models have been detected. "
+                "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
+                "You can find more information how to setup other OCR engines in Docling "
+                "documentation: "
+                "https://ds4sd.github.io/docling/installation/"
+            )
+
             try:
                 import tesserocr
             except ImportError:
-                raise ImportError(
-
+                raise ImportError(install_errmsg)
             try:
                 tesseract_version = tesserocr.tesseract_version()
-                _log.debug("Initializing TesserOCR: %s", tesseract_version)
             except:
-                raise ImportError(
+                raise ImportError(install_errmsg)
+
+            _, tesserocr_languages = tesserocr.get_languages()
+            if not tesserocr_languages:
+                raise ImportError(missing_langs_errmsg)
 
             # Initialize the tesseractAPI
+            _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)
             if self.options.path is not None:
                 self.reader = tesserocr.PyTessBaseAPI(
@@ -61,7 +76,9 @@ class TesseractOcrModel(BaseOcrModel):
         # Finalize the tesseractAPI
         self.reader.End()
 
-    def __call__(
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
 
         if not self.enabled:
             yield from page_batch
@@ -72,59 +89,66 @@ class TesseractOcrModel(BaseOcrModel):
             if not page._backend.is_valid():
                 yield page
             else:
-
+                with TimeRecorder(conv_res, "ocr"):
 
-
+                    assert self.reader is not None
 
-
-                for ocr_rect in ocr_rects:
-                    # Skip zero area boxes
-                    if ocr_rect.area() == 0:
-                        continue
-                    high_res_image = page._backend.get_page_image(
-                        scale=self.scale, cropbox=ocr_rect
-                    )
+                    ocr_rects = self.get_ocr_rects(page)
 
-
-
-
-
-
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
 
-
-
-
-
-
-                        # Extract text within the bounding box
-                        text = self.reader.GetUTF8Text().strip()
-                        confidence = self.reader.MeanTextConf()
-                        left = box["x"] / self.scale
-                        bottom = box["y"] / self.scale
-                        right = (box["x"] + box["w"]) / self.scale
-                        top = (box["y"] + box["h"]) / self.scale
-
-                        cells.append(
-                            OcrCell(
-                                id=ix,
-                                text=text,
-                                confidence=confidence,
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(left, top, right, bottom),
-                                    origin=CoordOrigin.TOPLEFT,
-                                ),
-                            )
+                        # Retrieve text snippets with their bounding boxes
+                        self.reader.SetImage(high_res_image)
+                        boxes = self.reader.GetComponentImages(
+                            self.reader_RIL.TEXTLINE, True
                         )
 
-
-
+                        cells = []
+                        for ix, (im, box, _, _) in enumerate(boxes):
+                            # Set the area of interest. Tesseract uses Bottom-Left for the origin
+                            self.reader.SetRectangle(
+                                box["x"], box["y"], box["w"], box["h"]
+                            )
+
+                            # Extract text within the bounding box
+                            text = self.reader.GetUTF8Text().strip()
+                            confidence = self.reader.MeanTextConf()
+                            left = box["x"] / self.scale
+                            bottom = box["y"] / self.scale
+                            right = (box["x"] + box["w"]) / self.scale
+                            top = (box["y"] + box["h"]) / self.scale
+
+                            cells.append(
+                                OcrCell(
+                                    id=ix,
+                                    text=text,
+                                    confidence=confidence,
+                                    bbox=BoundingBox.from_tuple(
+                                        coord=(left, top, right, bottom),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    ),
+                                )
+                            )
+
+                        # del high_res_image
+                        all_ocr_cells.extend(cells)
 
-
-
+                    ## Remove OCR cells which overlap with programmatic cells.
+                    filtered_ocr_cells = self.filter_ocr_cells(
+                        all_ocr_cells, page.cells
+                    )
 
-
+                    page.cells.extend(filtered_ocr_cells)
 
                 # DEBUG code:
-
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
 
                 yield page
```
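The new `tesserocr.get_languages()` guard fails fast with an explicit `ImportError` that points at `TESSDATA_PREFIX` when no language models are found. The same check in isolation (error wording paraphrased, not docling's exact message):

```python
# Hedged sketch of the language-availability check added above; the tesserocr
# calls are the ones used in the diff, the error text is paraphrased.
import tesserocr

tessdata_path, languages = tesserocr.get_languages()
if not languages:
    raise ImportError(
        "tesserocr found no language models; point TESSDATA_PREFIX at your "
        "tessdata directory (see https://ds4sd.github.io/docling/installation/)."
    )
print(f"tessdata: {tessdata_path}, available languages: {languages}")
```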