docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (38)
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +240 -0
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +379 -324
  12. docling/datamodel/pipeline_options.py +16 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +19 -6
  17. docling/models/ds_glm_model.py +220 -22
  18. docling/models/easyocr_model.py +45 -40
  19. docling/models/layout_model.py +130 -114
  20. docling/models/page_assemble_model.py +119 -95
  21. docling/models/page_preprocessing_model.py +61 -0
  22. docling/models/table_structure_model.py +122 -111
  23. docling/models/tesseract_ocr_cli_model.py +65 -58
  24. docling/models/tesseract_ocr_model.py +58 -50
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.1.0.dist-info/METADATA +149 -0
  31. docling-2.1.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.19.0.dist-info/METADATA +0 -380
  35. docling-1.19.0.dist-info/RECORD +0 -34
  36. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
  37. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
  38. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
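The hunks below cover three of the files listed above (docling/models/table_structure_model.py, docling/models/tesseract_ocr_cli_model.py and docling/models/tesseract_ocr_model.py) and show the two changes that recur throughout this release: shared geometry and label types such as BoundingBox, TableCell, CoordOrigin and DocItemLabel now come from docling_core.types.doc instead of docling.datamodel.base_models, and the model classes take explicit typed constructor arguments instead of a plain config dict. A minimal sketch of how constructing the table model changes, using only names visible in the hunks below (the artifacts path value is a placeholder):

    from pathlib import Path

    from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
    from docling.models.table_structure_model import TableStructureModel

    # docling 1.19.0 took a single untyped dict:
    #   TableStructureModel(config={"enabled": ..., "artifacts_path": ..., "do_cell_matching": ..., "mode": ...})
    # docling 2.1.0 takes explicit arguments instead:
    table_model = TableStructureModel(
        enabled=True,
        artifacts_path=Path("./model_artifacts"),  # placeholder; must point at the TableFormer weights
        options=TableStructureOptions(do_cell_matching=True, mode=TableFormerMode.ACCURATE),
    )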
docling/models/table_structure_model.py

@@ -3,29 +3,25 @@ from pathlib import Path
 from typing import Iterable, List
 
 import numpy
+from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
 
-from docling.datamodel.base_models import (
-    BoundingBox,
-    Page,
-    TableCell,
-    TableElement,
-    TableStructurePrediction,
-)
-from docling.datamodel.pipeline_options import TableFormerMode
+from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
+from docling.models.base_model import BasePageModel
 
 
-class TableStructureModel:
-    def __init__(self, config):
-        self.config = config
-        self.do_cell_matching = config["do_cell_matching"]
-        self.mode = config["mode"]
+class TableStructureModel(BasePageModel):
+    def __init__(
+        self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
+    ):
+        self.options = options
+        self.do_cell_matching = self.options.do_cell_matching
+        self.mode = self.options.mode
 
-        self.enabled = config["enabled"]
+        self.enabled = enabled
         if self.enabled:
-            artifacts_path: Path = config["artifacts_path"]
-
             if self.mode == TableFormerMode.ACCURATE:
                 artifacts_path = artifacts_path / "fat"
 
@@ -39,7 +35,9 @@ class TableStructureModel:
         self.tf_predictor = TFPredictor(self.tm_config)
         self.scale = 2.0  # Scale up table input images to 144 dpi
 
-    def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
+    def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
+        assert page._backend is not None
+
         image = (
             page._backend.get_page_image()
         )  # make new image to avoid drawing on the saved ones
@@ -50,17 +48,18 @@ class TableStructureModel:
             draw.rectangle([(x0, y0), (x1, y1)], outline="red")
 
             for tc in table_element.table_cells:
-                x0, y0, x1, y1 = tc.bbox.as_tuple()
-                if tc.column_header:
-                    width = 3
-                else:
-                    width = 1
-                draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
-                draw.text(
-                    (x0 + 3, y0 + 3),
-                    text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
-                    fill="black",
-                )
+                if tc.bbox is not None:
+                    x0, y0, x1, y1 = tc.bbox.as_tuple()
+                    if tc.column_header:
+                        width = 3
+                    else:
+                        width = 1
+                    draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
+                    draw.text(
+                        (x0 + 3, y0 + 3),
+                        text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
+                        fill="black",
+                    )
 
         image.show()
 
@@ -71,90 +70,102 @@ class TableStructureModel:
             return
 
         for page in page_batch:
-
-            page.predictions.tablestructure = TableStructurePrediction()  # dummy
-
-            in_tables = [
-                (
-                    cluster,
-                    [
-                        round(cluster.bbox.l) * self.scale,
-                        round(cluster.bbox.t) * self.scale,
-                        round(cluster.bbox.r) * self.scale,
-                        round(cluster.bbox.b) * self.scale,
-                    ],
-                )
-                for cluster in page.predictions.layout.clusters
-                if cluster.label == "Table"
-            ]
-            if not len(in_tables):
+            assert page._backend is not None
+            if not page._backend.is_valid():
                 yield page
-                continue
-
-            tokens = []
-            for c in page.cells:
-                for cluster, _ in in_tables:
-                    if c.bbox.area() > 0:
-                        if (
-                            c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
-                            > 0.2
-                        ):
-                            # Only allow non empty stings (spaces) into the cells of a table
-                            if len(c.text.strip()) > 0:
-                                new_cell = copy.deepcopy(c)
-                                new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)
-
-                                tokens.append(new_cell.model_dump())
-
-            page_input = {
-                "tokens": tokens,
-                "width": page.size.width * self.scale,
-                "height": page.size.height * self.scale,
-            }
-            page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
-
-            table_clusters, table_bboxes = zip(*in_tables)
-
-            if len(table_bboxes):
-                tf_output = self.tf_predictor.multi_table_predict(
-                    page_input, table_bboxes, do_matching=self.do_cell_matching
-                )
-
-                for table_cluster, table_out in zip(table_clusters, tf_output):
-                    table_cells = []
-                    for element in table_out["tf_responses"]:
-
-                        if not self.do_cell_matching:
-                            the_bbox = BoundingBox.model_validate(
-                                element["bbox"]
-                            ).scaled(1 / self.scale)
-                            text_piece = page._backend.get_text_in_rect(the_bbox)
-                            element["bbox"]["token"] = text_piece
-
-                        tc = TableCell.model_validate(element)
-                        if self.do_cell_matching:
-                            tc.bbox = tc.bbox.scaled(1 / self.scale)
-                        table_cells.append(tc)
-
-                    # Retrieving cols/rows, after post processing:
-                    num_rows = table_out["predict_details"]["num_rows"]
-                    num_cols = table_out["predict_details"]["num_cols"]
-                    otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
-
-                    tbl = TableElement(
-                        otsl_seq=otsl_seq,
-                        table_cells=table_cells,
-                        num_rows=num_rows,
-                        num_cols=num_cols,
-                        id=table_cluster.id,
-                        page_no=page.page_no,
-                        cluster=table_cluster,
-                        label="Table",
+            else:
+
+                assert page.predictions.layout is not None
+                assert page.size is not None
+
+                page.predictions.tablestructure = TableStructurePrediction()  # dummy
+
+                in_tables = [
+                    (
+                        cluster,
+                        [
+                            round(cluster.bbox.l) * self.scale,
+                            round(cluster.bbox.t) * self.scale,
+                            round(cluster.bbox.r) * self.scale,
+                            round(cluster.bbox.b) * self.scale,
+                        ],
+                    )
+                    for cluster in page.predictions.layout.clusters
+                    if cluster.label == DocItemLabel.TABLE
+                ]
+                if not len(in_tables):
+                    yield page
+                    continue
+
+                tokens = []
+                for c in page.cells:
+                    for cluster, _ in in_tables:
+                        if c.bbox.area() > 0:
+                            if (
+                                c.bbox.intersection_area_with(cluster.bbox)
+                                / c.bbox.area()
+                                > 0.2
+                            ):
+                                # Only allow non empty stings (spaces) into the cells of a table
+                                if len(c.text.strip()) > 0:
+                                    new_cell = copy.deepcopy(c)
+                                    new_cell.bbox = new_cell.bbox.scaled(
+                                        scale=self.scale
+                                    )
+
+                                    tokens.append(new_cell.model_dump())
+
+                page_input = {
+                    "tokens": tokens,
+                    "width": page.size.width * self.scale,
+                    "height": page.size.height * self.scale,
+                }
+                page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
+
+                table_clusters, table_bboxes = zip(*in_tables)
+
+                if len(table_bboxes):
+                    tf_output = self.tf_predictor.multi_table_predict(
+                        page_input, table_bboxes, do_matching=self.do_cell_matching
                     )
 
-                    page.predictions.tablestructure.table_map[table_cluster.id] = tbl
-
-            # For debugging purposes:
-            # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
+                    for table_cluster, table_out in zip(table_clusters, tf_output):
+                        table_cells = []
+                        for element in table_out["tf_responses"]:
+
+                            if not self.do_cell_matching:
+                                the_bbox = BoundingBox.model_validate(
+                                    element["bbox"]
+                                ).scaled(1 / self.scale)
+                                text_piece = page._backend.get_text_in_rect(the_bbox)
+                                element["bbox"]["token"] = text_piece
+
+                            tc = TableCell.model_validate(element)
+                            if self.do_cell_matching and tc.bbox is not None:
+                                tc.bbox = tc.bbox.scaled(1 / self.scale)
+                            table_cells.append(tc)
+
+                        # Retrieving cols/rows, after post processing:
+                        num_rows = table_out["predict_details"]["num_rows"]
+                        num_cols = table_out["predict_details"]["num_cols"]
+                        otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
+
+                        tbl = Table(
+                            otsl_seq=otsl_seq,
+                            table_cells=table_cells,
+                            num_rows=num_rows,
+                            num_cols=num_cols,
+                            id=table_cluster.id,
+                            page_no=page.page_no,
+                            cluster=table_cluster,
+                            label=DocItemLabel.TABLE,
+                        )
+
+                        page.predictions.tablestructure.table_map[table_cluster.id] = (
+                            tbl
+                        )
+
+                # For debugging purposes:
+                # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
 
-            yield page
+                yield page
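All three models in this diff now open their per-page loop with the same guard: assert that the page has a backend, pass invalid pages through untouched, and only do real work in the else branch. A schematic of the pattern in isolation (run_model and the placeholder comment are illustrative, not docling API):

    from typing import Iterable

    from docling.datamodel.base_models import Page


    def run_model(page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page  # invalid pages are forwarded unchanged
            else:
                # ... model-specific work on the valid page would go here ...
                yield page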
docling/models/tesseract_ocr_cli_model.py

@@ -1,12 +1,13 @@
 import io
 import logging
 import tempfile
-from subprocess import PIPE, Popen
-from typing import Iterable, Tuple
+from subprocess import DEVNULL, PIPE, Popen
+from typing import Iterable, Optional, Tuple
 
 import pandas as pd
+from docling_core.types.doc import BoundingBox, CoordOrigin
 
-from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel
 
@@ -21,8 +22,8 @@ class TesseractOcrCliModel(BaseOcrModel):
 
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
 
-        self._name = None
-        self._version = None
+        self._name: Optional[str] = None
+        self._version: Optional[str] = None
 
         if self.enabled:
             try:
@@ -39,7 +40,7 @@ class TesseractOcrCliModel(BaseOcrModel):
     def _get_name_and_version(self) -> Tuple[str, str]:
 
         if self._name != None and self._version != None:
-            return self._name, self._version
+            return self._name, self._version  # type: ignore
 
         cmd = [self.options.tesseract_cmd, "--version"]
 
@@ -81,7 +82,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         cmd += [ifilename, "stdout", "tsv"]
         _log.info("command: {}".format(" ".join(cmd)))
 
-        proc = Popen(cmd, stdout=PIPE)
+        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
 
         # _log.info(output)
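The only change in this hunk is stderr=DEVNULL: whatever the tesseract binary writes to stderr is discarded instead of leaking into docling's own output, while stdout is still captured through the pipe. A standalone illustration of the same subprocess pattern (the child command here is a toy stand-in, not the real tesseract call):

    import sys
    from subprocess import DEVNULL, PIPE, Popen

    # Toy child process that writes to both streams, standing in for the tesseract CLI.
    cmd = [sys.executable, "-c", "import sys; print('kept'); print('dropped', file=sys.stderr)"]

    proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)  # same pattern as the hunk above
    output, _ = proc.communicate()
    print(output.decode())  # prints "kept"; the stderr line is discarded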
@@ -108,60 +109,66 @@ class TesseractOcrCliModel(BaseOcrModel):
             return
 
         for page in page_batch:
-            ocr_rects = self.get_ocr_rects(page)
-
-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                ocr_rects = self.get_ocr_rects(page)
+
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
+                    )
 
-                with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
-                    fname = image_file.name
-                    high_res_image.save(fname)
-
-                    df = self._run_tesseract(fname)
-
-                    # _log.info(df)
-
-                    # Print relevant columns (bounding box and text)
-                    for ix, row in df.iterrows():
-                        text = row["text"]
-                        conf = row["conf"]
-
-                        l = float(row["left"])
-                        b = float(row["top"])
-                        w = float(row["width"])
-                        h = float(row["height"])
-
-                        t = b + h
-                        r = l + w
-
-                        cell = OcrCell(
-                            id=ix,
-                            text=text,
-                            confidence=conf / 100.0,
-                            bbox=BoundingBox.from_tuple(
-                                coord=(
-                                    (l / self.scale) + ocr_rect.l,
-                                    (b / self.scale) + ocr_rect.t,
-                                    (r / self.scale) + ocr_rect.l,
-                                    (t / self.scale) + ocr_rect.t,
+                    with tempfile.NamedTemporaryFile(
+                        suffix=".png", mode="w"
+                    ) as image_file:
+                        fname = image_file.name
+                        high_res_image.save(fname)
+
+                        df = self._run_tesseract(fname)
+
+                        # _log.info(df)
+
+                        # Print relevant columns (bounding box and text)
+                        for ix, row in df.iterrows():
+                            text = row["text"]
+                            conf = row["conf"]
+
+                            l = float(row["left"])
+                            b = float(row["top"])
+                            w = float(row["width"])
+                            h = float(row["height"])
+
+                            t = b + h
+                            r = l + w
+
+                            cell = OcrCell(
+                                id=ix,
+                                text=text,
+                                confidence=conf / 100.0,
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(
+                                        (l / self.scale) + ocr_rect.l,
+                                        (b / self.scale) + ocr_rect.t,
+                                        (r / self.scale) + ocr_rect.l,
+                                        (t / self.scale) + ocr_rect.t,
+                                    ),
+                                    origin=CoordOrigin.TOPLEFT,
                                 ),
-                                origin=CoordOrigin.TOPLEFT,
-                            ),
-                        )
-                        all_ocr_cells.append(cell)
+                            )
+                            all_ocr_cells.append(cell)
 
-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                ## Remove OCR cells which overlap with programmatic cells.
+                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
 
-            page.cells.extend(filtered_ocr_cells)
+                page.cells.extend(filtered_ocr_cells)
 
-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                # DEBUG code:
+                # self.draw_ocr_rects_and_cells(page, ocr_rects)
 
-            yield page
+                yield page
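The bounding-box arithmetic in the hunk above is unchanged by this release: tesseract reports pixel boxes on a crop rendered at self.scale (3x, roughly 216 dpi), so each value is divided by the scale and offset by the crop origin to land back in page coordinates. A worked toy example with made-up numbers (none of these values come from the diff):

    scale = 3.0                           # as in TesseractOcrCliModel
    ocr_rect_l, ocr_rect_t = 100.0, 50.0  # hypothetical crop origin in page coordinates

    # hypothetical tesseract TSV row, measured on the 3x crop
    left, top, width, height = 30.0, 12.0, 60.0, 24.0
    right, bottom = left + width, top + height

    page_bbox = (
        left / scale + ocr_rect_l,    # l = 110.0
        top / scale + ocr_rect_t,     # t = 54.0
        right / scale + ocr_rect_l,   # r = 130.0
        bottom / scale + ocr_rect_t,  # b = 62.0
    )
    print(page_bbox)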
docling/models/tesseract_ocr_model.py

@@ -1,19 +1,19 @@
 import logging
 from typing import Iterable
 
-import numpy
+from docling_core.types.doc import BoundingBox, CoordOrigin
 
-from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
-from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel
 
 _log = logging.getLogger(__name__)
 
 
 class TesseractOcrModel(BaseOcrModel):
-    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
+    def __init__(self, enabled: bool, options: TesseractOcrOptions):
         super().__init__(enabled=enabled, options=options)
-        self.options: TesseractCliOcrOptions
+        self.options: TesseractOcrOptions
 
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
         self.reader = None
@@ -68,55 +68,63 @@ class TesseractOcrModel(BaseOcrModel):
             return
 
         for page in page_batch:
-            ocr_rects = self.get_ocr_rects(page)
-
-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                assert self.reader is not None
 
-                # Retrieve text snippets with their bounding boxes
-                self.reader.SetImage(high_res_image)
-                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
-
-                cells = []
-                for ix, (im, box, _, _) in enumerate(boxes):
-                    # Set the area of interest. Tesseract uses Bottom-Left for the origin
-                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
-
-                    # Extract text within the bounding box
-                    text = self.reader.GetUTF8Text().strip()
-                    confidence = self.reader.MeanTextConf()
-                    left = box["x"] / self.scale
-                    bottom = box["y"] / self.scale
-                    right = (box["x"] + box["w"]) / self.scale
-                    top = (box["y"] + box["h"]) / self.scale
-
-                    cells.append(
-                        OcrCell(
-                            id=ix,
-                            text=text,
-                            confidence=confidence,
-                            bbox=BoundingBox.from_tuple(
-                                coord=(left, top, right, bottom),
-                                origin=CoordOrigin.TOPLEFT,
-                            ),
-                        )
+                ocr_rects = self.get_ocr_rects(page)
+
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
                     )
 
-                # del high_res_image
-                all_ocr_cells.extend(cells)
+                    # Retrieve text snippets with their bounding boxes
+                    self.reader.SetImage(high_res_image)
+                    boxes = self.reader.GetComponentImages(
+                        self.reader_RIL.TEXTLINE, True
+                    )
+
+                    cells = []
+                    for ix, (im, box, _, _) in enumerate(boxes):
+                        # Set the area of interest. Tesseract uses Bottom-Left for the origin
+                        self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
+
+                        # Extract text within the bounding box
+                        text = self.reader.GetUTF8Text().strip()
+                        confidence = self.reader.MeanTextConf()
+                        left = box["x"] / self.scale
+                        bottom = box["y"] / self.scale
+                        right = (box["x"] + box["w"]) / self.scale
+                        top = (box["y"] + box["h"]) / self.scale
+
+                        cells.append(
+                            OcrCell(
+                                id=ix,
+                                text=text,
+                                confidence=confidence,
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(left, top, right, bottom),
+                                    origin=CoordOrigin.TOPLEFT,
+                                ),
+                            )
+                        )
+
+                    # del high_res_image
+                    all_ocr_cells.extend(cells)
 
-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                ## Remove OCR cells which overlap with programmatic cells.
+                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
 
-            page.cells.extend(filtered_ocr_cells)
+                page.cells.extend(filtered_ocr_cells)
 
-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                # DEBUG code:
+                # self.draw_ocr_rects_and_cells(page, ocr_rects)
 
-            yield page
+                yield page