docling 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +59 -0
- docling/backend/docling_parse_backend.py +207 -0
- docling/backend/pypdfium2_backend.py +233 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/base_models.py +312 -0
- docling/datamodel/document.py +363 -0
- docling/datamodel/settings.py +32 -0
- docling/document_converter.py +276 -0
- docling/models/__init__.py +0 -0
- docling/models/base_ocr_model.py +124 -0
- docling/models/ds_glm_model.py +82 -0
- docling/models/easyocr_model.py +70 -0
- docling/models/layout_model.py +328 -0
- docling/models/page_assemble_model.py +148 -0
- docling/models/table_structure_model.py +144 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/base_model_pipeline.py +17 -0
- docling/pipeline/standard_model_pipeline.py +38 -0
- docling/utils/__init__.py +0 -0
- docling/utils/layout_utils.py +806 -0
- docling/utils/utils.py +41 -0
- docling-1.6.2.dist-info/LICENSE +21 -0
- docling-1.6.2.dist-info/METADATA +192 -0
- docling-1.6.2.dist-info/RECORD +27 -0
- docling-1.6.2.dist-info/WHEEL +4 -0
docling/models/layout_model.py
@@ -0,0 +1,328 @@
```python
import copy
import logging
import random
import time
from typing import Iterable, List

from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw

from docling.datamodel.base_models import (
    BoundingBox,
    Cell,
    Cluster,
    CoordOrigin,
    LayoutPrediction,
    Page,
)
from docling.utils import layout_utils as lu

_log = logging.getLogger(__name__)


class LayoutModel:

    TEXT_ELEM_LABELS = [
        "Text",
        "Footnote",
        "Caption",
        "Checkbox-Unselected",
        "Checkbox-Selected",
        "Section-header",
        "Page-header",
        "Page-footer",
        "Code",
        "List-item",
        # "Formula",
    ]
    PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]

    TABLE_LABEL = "Table"
    FIGURE_LABEL = "Picture"
    FORMULA_LABEL = "Formula"

    def __init__(self, config):
        self.config = config
        self.layout_predictor = LayoutPredictor(
            config["artifacts_path"]
        )  # TODO temporary

    def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
        MIN_INTERSECTION = 0.2
        CLASS_THRESHOLDS = {
            "Caption": 0.35,
            "Footnote": 0.35,
            "Formula": 0.35,
            "List-item": 0.35,
            "Page-footer": 0.35,
            "Page-header": 0.35,
            "Picture": 0.2,  # low threshold adjust to capture chemical structures for examples.
            "Section-header": 0.45,
            "Table": 0.35,
            "Text": 0.45,
            "Title": 0.45,
            "Document Index": 0.45,
            "Code": 0.45,
            "Checkbox-Selected": 0.45,
            "Checkbox-Unselected": 0.45,
            "Form": 0.45,
            "Key-Value Region": 0.45,
        }

        CLASS_REMAPPINGS = {
            "Document Index": "Table",
        }

        _log.debug("================= Start postprocess function ====================")
        start_time = time.time()
        # Apply Confidence Threshold to cluster predictions
        # confidence = self.conf_threshold
        clusters_out = []

        for cluster in clusters:
            confidence = CLASS_THRESHOLDS[cluster.label]
            if cluster.confidence >= confidence:
                # annotation["created_by"] = "high_conf_pred"

                # Remap class labels where needed.
                if cluster.label in CLASS_REMAPPINGS.keys():
                    cluster.label = CLASS_REMAPPINGS[cluster.label]
                clusters_out.append(cluster)

        # map to dictionary clusters and cells, with bottom left origin
        clusters = [
            {
                "id": c.id,
                "bbox": list(
                    c.bbox.to_bottom_left_origin(page_height).as_tuple()
                ),  # TODO
                "confidence": c.confidence,
                "cell_ids": [],
                "type": c.label,
            }
            for c in clusters
        ]

        clusters_out = [
            {
                "id": c.id,
                "bbox": list(
                    c.bbox.to_bottom_left_origin(page_height).as_tuple()
                ),  # TODO
                "confidence": c.confidence,
                "created_by": "high_conf_pred",
                "cell_ids": [],
                "type": c.label,
            }
            for c in clusters_out
        ]

        raw_cells = [
            {
                "id": c.id,
                "bbox": list(
                    c.bbox.to_bottom_left_origin(page_height).as_tuple()
                ),  # TODO
                "text": c.text,
            }
            for c in cells
        ]
        cell_count = len(raw_cells)

        _log.debug("---- 0. Treat cluster overlaps ------")
        clusters_out = lu.remove_cluster_duplicates_by_conf(clusters_out, 0.8)

        _log.debug(
            "---- 1. Initially assign cells to clusters based on minimum intersection ------"
        )
        ## Check for cells included in or touched by clusters:
        clusters_out = lu.assigning_cell_ids_to_clusters(
            clusters_out, raw_cells, MIN_INTERSECTION
        )

        _log.debug("---- 2. Assign Orphans with Low Confidence Detections")
        # Creates a map of cell_id->cluster_id
        (
            clusters_around_cells,
            orphan_cell_indices,
            ambiguous_cell_indices,
        ) = lu.cell_id_state_map(clusters_out, cell_count)

        # Assign orphan cells with lower confidence predictions
        clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
            clusters_out, clusters, raw_cells, orphan_cell_indices
        )

        # Refresh the cell_ids assignment, after creating new clusters using low conf predictions
        clusters_out = lu.assigning_cell_ids_to_clusters(
            clusters_out, raw_cells, MIN_INTERSECTION
        )

        _log.debug("---- 3. Settle Ambigous Cells")
        # Creates an update map after assignment of cell_id->cluster_id
        (
            clusters_around_cells,
            orphan_cell_indices,
            ambiguous_cell_indices,
        ) = lu.cell_id_state_map(clusters_out, cell_count)

        # Settle pdf cells that belong to multiple clusters
        clusters_out, ambiguous_cell_indices = lu.remove_ambigous_pdf_cell_by_conf(
            clusters_out, raw_cells, ambiguous_cell_indices
        )

        _log.debug("---- 4. Set Orphans as Text")
        (
            clusters_around_cells,
            orphan_cell_indices,
            ambiguous_cell_indices,
        ) = lu.cell_id_state_map(clusters_out, cell_count)

        clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
            clusters_out, clusters, raw_cells, orphan_cell_indices
        )

        _log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
        # Merge cells orphan cells
        clusters_out = lu.merge_cells(clusters_out)

        # Clean up clusters that remain from merged and unreasonable clusters
        clusters_out = lu.clean_up_clusters(
            clusters_out,
            raw_cells,
            merge_cells=True,
            img_table=True,
            one_cell_table=True,
        )

        new_clusters = lu.adapt_bboxes(raw_cells, clusters_out, orphan_cell_indices)
        clusters_out = new_clusters

        ## We first rebuild where every cell is now:
        ## Now we write into a prediction cells list, not into the raw cells list.
        ## As we don't need previous labels, we best overwrite any old list, because that might
        ## have been sorted differently.
        (
            clusters_around_cells,
            orphan_cell_indices,
            ambiguous_cell_indices,
        ) = lu.cell_id_state_map(clusters_out, cell_count)

        target_cells = []
        for ix, cell in enumerate(raw_cells):
            new_cell = {
                "id": ix,
                "rawcell_id": ix,
                "label": "None",
                "bbox": cell["bbox"],
                "text": cell["text"],
            }
            for cluster_index in clusters_around_cells[
                ix
            ]:  # By previous analysis, this is always 1 cluster.
                new_cell["label"] = clusters_out[cluster_index]["type"]
            target_cells.append(new_cell)
            # _log.debug("New label of cell " + str(ix) + " is " + str(new_cell["label"]))
        cells_out = target_cells

        ## -------------------------------
        ## Sort clusters into reasonable reading order, and sort the cells inside each cluster
        _log.debug("---- 5. Sort clusters in reading order ------")
        sorted_clusters = lu.produce_reading_order(
            clusters_out, "raw_cell_ids", "raw_cell_ids", True
        )
        clusters_out = sorted_clusters

        # end_time = timer()
        _log.debug("---- End of postprocessing function ------")
        end_time = time.time() - start_time
        _log.debug(f"Finished post processing in seconds={end_time:.3f}")

        cells_out = [
            Cell(
                id=c["id"],
                bbox=BoundingBox.from_tuple(
                    coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ).to_top_left_origin(page_height),
                text=c["text"],
            )
            for c in cells_out
        ]
        clusters_out_new = []
        for c in clusters_out:
            cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
            c_new = Cluster(
                id=c["id"],
                bbox=BoundingBox.from_tuple(
                    coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ).to_top_left_origin(page_height),
                confidence=c["confidence"],
                label=c["type"],
                cells=cluster_cells,
            )
            clusters_out_new.append(c_new)

        return clusters_out_new, cells_out

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
            clusters = []
            for ix, pred_item in enumerate(
                self.layout_predictor.predict(page.get_image(scale=1.0))
            ):
                cluster = Cluster(
                    id=ix,
                    label=pred_item["label"],
                    confidence=pred_item["confidence"],
                    bbox=BoundingBox.model_validate(pred_item),
                    cells=[],
                )
                clusters.append(cluster)

            # Map cells to clusters
            # TODO: Remove, postprocess should take care of it anyway.
            for cell in page.cells:
                for cluster in clusters:
                    if not cell.bbox.area() > 0:
                        overlap_frac = 0.0
                    else:
                        overlap_frac = (
                            cell.bbox.intersection_area_with(cluster.bbox)
                            / cell.bbox.area()
                        )

                    if overlap_frac > 0.5:
                        cluster.cells.append(cell)

            # Pre-sort clusters
            # clusters = self.sort_clusters_by_cell_order(clusters)

            # DEBUG code:
            def draw_clusters_and_cells():
                image = copy.deepcopy(page.image)
                draw = ImageDraw.Draw(image)
                for c in clusters:
                    x0, y0, x1, y1 = c.bbox.as_tuple()
                    draw.rectangle([(x0, y0), (x1, y1)], outline="green")

                    cell_color = (
                        random.randint(30, 140),
                        random.randint(30, 140),
                        random.randint(30, 140),
                    )
                    for tc in c.cells:  # [:1]:
                        x0, y0, x1, y1 = tc.bbox.as_tuple()
                        draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
                image.show()

            # draw_clusters_and_cells()

            clusters, page.cells = self.postprocess(
                clusters, page.cells, page.size.height
            )

            # draw_clusters_and_cells()

            page.predictions.layout = LayoutPrediction(clusters=clusters)

            yield page
```
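Taken on its own, `LayoutModel` is a generator-style stage: it consumes an `Iterable[Page]` and yields the same pages with `page.predictions.layout` filled in. A minimal sketch of driving it directly, mirroring the wiring in `docling/pipeline/standard_model_pipeline.py` further below; the artifacts path is hypothetical, and `pages` is assumed to come from an upstream PDF-backend stage:

```python
from pathlib import Path

from docling.models.layout_model import LayoutModel

# Hypothetical local path; it must contain the layout predictor artifacts.
artifacts = Path("./model_artifacts/layout/beehive_v0.0.5")
layout_model = LayoutModel(config={"artifacts_path": artifacts})

def run_layout(pages):
    # `pages` is assumed to be an Iterable[Page] produced by an upstream stage.
    for page in layout_model(pages):
        for cluster in page.predictions.layout.clusters:
            print(cluster.id, cluster.label, round(cluster.confidence, 2))
```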
docling/models/page_assemble_model.py
@@ -0,0 +1,148 @@
```python
import logging
import re
from typing import Iterable, List

from docling.datamodel.base_models import (
    AssembledUnit,
    FigureElement,
    Page,
    PageElement,
    TableElement,
    TextElement,
)
from docling.models.layout_model import LayoutModel

_log = logging.getLogger(__name__)


class PageAssembleModel:
    def __init__(self, config):
        self.config = config

    def sanitize_text(self, lines):
        if len(lines) <= 1:
            return " ".join(lines)

        for ix, line in enumerate(lines[1:]):
            prev_line = lines[ix]

            if prev_line.endswith("-"):
                prev_words = re.findall(r"\b[\w]+\b", prev_line)
                line_words = re.findall(r"\b[\w]+\b", line)

                if (
                    len(prev_words)
                    and len(line_words)
                    and prev_words[-1].isalnum()
                    and line_words[0].isalnum()
                ):
                    lines[ix] = prev_line[:-1]
            else:
                lines[ix] += " "

        sanitized_text = "".join(lines)

        return sanitized_text.strip()  # Strip any leading or trailing whitespace

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
            # assembles some JSON output page by page.

            elements: List[PageElement] = []
            headers: List[PageElement] = []
            body: List[PageElement] = []

            for cluster in page.predictions.layout.clusters:
                # _log.info("Cluster label seen:", cluster.label)
                if cluster.label in LayoutModel.TEXT_ELEM_LABELS:

                    textlines = [
                        cell.text.replace("\x02", "-").strip()
                        for cell in cluster.cells
                        if len(cell.text.strip()) > 0
                    ]
                    text = self.sanitize_text(textlines)
                    text_el = TextElement(
                        label=cluster.label,
                        id=cluster.id,
                        text=text,
                        page_no=page.page_no,
                        cluster=cluster,
                    )
                    elements.append(text_el)

                    if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
                        headers.append(text_el)
                    else:
                        body.append(text_el)
                elif cluster.label == LayoutModel.TABLE_LABEL:
                    tbl = None
                    if page.predictions.tablestructure:
                        tbl = page.predictions.tablestructure.table_map.get(
                            cluster.id, None
                        )
                    if (
                        not tbl
                    ):  # fallback: add table without structure, if it isn't present
                        tbl = TableElement(
                            label=cluster.label,
                            id=cluster.id,
                            text="",
                            otsl_seq=[],
                            table_cells=[],
                            cluster=cluster,
                            page_no=page.page_no,
                        )

                    elements.append(tbl)
                    body.append(tbl)
                elif cluster.label == LayoutModel.FIGURE_LABEL:
                    fig = None
                    if page.predictions.figures_classification:
                        fig = page.predictions.figures_classification.figure_map.get(
                            cluster.id, None
                        )
                    if (
                        not fig
                    ):  # fallback: add figure without classification, if it isn't present
                        fig = FigureElement(
                            label=cluster.label,
                            id=cluster.id,
                            text="",
                            data=None,
                            cluster=cluster,
                            page_no=page.page_no,
                        )
                    elements.append(fig)
                    body.append(fig)
                elif cluster.label == LayoutModel.FORMULA_LABEL:
                    equation = None
                    if page.predictions.equations_prediction:
                        equation = (
                            page.predictions.equations_prediction.equation_map.get(
                                cluster.id, None
                            )
                        )
                    if not equation:  # fallback: add empty formula, if it isn't present
                        text = self.sanitize_text(
                            [
                                cell.text.replace("\x02", "-").strip()
                                for cell in cluster.cells
                                if len(cell.text.strip()) > 0
                            ]
                        )
                        equation = TextElement(
                            label=cluster.label,
                            id=cluster.id,
                            cluster=cluster,
                            page_no=page.page_no,
                            text=text,
                        )
                    elements.append(equation)
                    body.append(equation)

            page.assembled = AssembledUnit(
                elements=elements, headers=headers, body=body
            )

            yield page
```
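The dehyphenation logic in `sanitize_text()` is easy to exercise in isolation. A small illustration, assuming an empty config dict is acceptable here (the constructor only stores it):

```python
from docling.models.page_assemble_model import PageAssembleModel

assembler = PageAssembleModel(config={})

print(assembler.sanitize_text(["conver-", "sion of documents"]))
# -> "conversion of documents"   (trailing hyphen dropped, word halves merged)

print(assembler.sanitize_text(["a line", "and another line"]))
# -> "a line and another line"   (lines joined with a single space)
```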
docling/models/table_structure_model.py
@@ -0,0 +1,144 @@
```python
import copy
from typing import Iterable, List

import numpy
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw

from docling.datamodel.base_models import (
    BoundingBox,
    Page,
    TableCell,
    TableElement,
    TableStructurePrediction,
)


class TableStructureModel:
    def __init__(self, config):
        self.config = config
        self.do_cell_matching = config["do_cell_matching"]

        self.enabled = config["enabled"]
        if self.enabled:
            artifacts_path = config["artifacts_path"]
            # Third Party
            import docling_ibm_models.tableformer.common as c

            self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
            self.tm_config["model"]["save_dir"] = artifacts_path
            self.tm_model_type = self.tm_config["model"]["type"]

            self.tf_predictor = TFPredictor(self.tm_config)
            self.scale = 2.0  # Scale up table input images to 144 dpi

    def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
        image = (
            page._backend.get_page_image()
        )  # make new image to avoid drawing on the saved ones
        draw = ImageDraw.Draw(image)

        for table_element in tbl_list:
            x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
            draw.rectangle([(x0, y0), (x1, y1)], outline="red")

            for tc in table_element.table_cells:
                x0, y0, x1, y1 = tc.bbox.as_tuple()
                draw.rectangle([(x0, y0), (x1, y1)], outline="blue")

        image.show()

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:

        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:

            page.predictions.tablestructure = TableStructurePrediction()  # dummy

            in_tables = [
                (
                    cluster,
                    [
                        round(cluster.bbox.l) * self.scale,
                        round(cluster.bbox.t) * self.scale,
                        round(cluster.bbox.r) * self.scale,
                        round(cluster.bbox.b) * self.scale,
                    ],
                )
                for cluster in page.predictions.layout.clusters
                if cluster.label == "Table"
            ]
            if not len(in_tables):
                yield page
                continue

            tokens = []
            for c in page.cells:
                for cluster, _ in in_tables:
                    if c.bbox.area() > 0:
                        if (
                            c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
                            > 0.2
                        ):
                            # Only allow non empty stings (spaces) into the cells of a table
                            if len(c.text.strip()) > 0:
                                new_cell = copy.deepcopy(c)
                                new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)

                                tokens.append(new_cell.model_dump())

            page_input = {
                "tokens": tokens,
                "width": page.size.width * self.scale,
                "height": page.size.height * self.scale,
            }
            page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))

            table_clusters, table_bboxes = zip(*in_tables)

            if len(table_bboxes):
                tf_output = self.tf_predictor.multi_table_predict(
                    page_input, table_bboxes, do_matching=self.do_cell_matching
                )

                for table_cluster, table_out in zip(table_clusters, tf_output):
                    table_cells = []
                    for element in table_out["tf_responses"]:

                        if not self.do_cell_matching:
                            the_bbox = BoundingBox.model_validate(
                                element["bbox"]
                            ).scaled(1 / self.scale)
                            text_piece = page._backend.get_text_in_rect(the_bbox)
                            element["bbox"]["token"] = text_piece

                        tc = TableCell.model_validate(element)
                        if self.do_cell_matching:
                            tc.bbox = tc.bbox.scaled(1 / self.scale)
                        table_cells.append(tc)

                    # Retrieving cols/rows, after post processing:
                    num_rows = table_out["predict_details"]["num_rows"]
                    num_cols = table_out["predict_details"]["num_cols"]
                    otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]

                    tbl = TableElement(
                        otsl_seq=otsl_seq,
                        table_cells=table_cells,
                        num_rows=num_rows,
                        num_cols=num_cols,
                        id=table_cluster.id,
                        page_no=page.page_no,
                        cluster=table_cluster,
                        label="Table",
                    )

                    page.predictions.tablestructure.table_map[table_cluster.id] = tbl

            # For debugging purposes:
            # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())

            yield page
```
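The model runs TableFormer on a page image rendered at `scale = 2.0` (roughly 144 dpi), so table bounding boxes and PDF-cell tokens are scaled up before prediction and the predicted cell boxes are scaled back down by `1 / scale`. A standalone sketch of that coordinate round trip with plain tuples (the real code goes through `BoundingBox.scaled()`):

```python
# Illustrative only: the coordinate round trip used by TableStructureModel.
SCALE = 2.0  # page rendered at 2x, i.e. roughly 144 dpi

def scale_bbox(bbox, factor):
    return tuple(coord * factor for coord in bbox)

table_bbox_page = (50.0, 100.0, 300.0, 250.0)            # l, t, r, b in page coordinates
table_bbox_scaled = scale_bbox(table_bbox_page, SCALE)    # what the predictor sees

predicted_cell_scaled = (60.0, 110.0, 120.0, 130.0)       # hypothetical model output
predicted_cell_page = scale_bbox(predicted_cell_scaled, 1 / SCALE)  # stored back on the page
print(table_bbox_scaled, predicted_cell_page)
```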
docling/pipeline/__init__.py (file without changes)
docling/pipeline/base_model_pipeline.py
@@ -0,0 +1,17 @@
```python
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import Page, PipelineOptions


class BaseModelPipeline:
    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
        self.model_pipe = []
        self.artifacts_path = artifacts_path
        self.pipeline_options = pipeline_options

    def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for model in self.model_pipe:
            page_batch = model(page_batch)

        yield from page_batch
```
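`apply()` builds a lazy generator chain: each model wraps the iterator returned by the previous one, so pages stream through all stages one at a time instead of being materialized per stage. A toy illustration of the same pattern with stand-in stages (the stage names are made up):

```python
# Each "model" is a callable that takes an iterable of pages and yields pages.
def stage_a(pages):
    for page in pages:
        page["a"] = True
        yield page

def stage_b(pages):
    for page in pages:
        page["b"] = True
        yield page

model_pipe = [stage_a, stage_b]

def apply(page_batch):
    for model in model_pipe:
        page_batch = model(page_batch)  # wrap, do not consume
    yield from page_batch               # nothing runs until this is iterated

for page in apply([{"id": 1}, {"id": 2}]):
    print(page)  # {'id': 1, 'a': True, 'b': True}, then {'id': 2, ...}
```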
docling/pipeline/standard_model_pipeline.py
@@ -0,0 +1,38 @@
```python
from pathlib import Path

from docling.datamodel.base_models import PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline


class StandardModelPipeline(BaseModelPipeline):
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
        super().__init__(artifacts_path, pipeline_options)

        self.model_pipe = [
            EasyOcrModel(
                config={
                    "lang": ["fr", "de", "es", "en"],
                    "enabled": pipeline_options.do_ocr,
                }
            ),
            LayoutModel(
                config={
                    "artifacts_path": artifacts_path
                    / StandardModelPipeline._layout_model_path
                }
            ),
            TableStructureModel(
                config={
                    "artifacts_path": artifacts_path
                    / StandardModelPipeline._table_model_path,
                    "enabled": pipeline_options.do_table_structure,
                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
                }
            ),
        ]
```
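A sketch of wiring the pipeline up, assuming the model artifacts have been downloaded locally in the `model_artifacts/...` layout referenced above; `PipelineOptions` exposes at least the `do_ocr`, `do_table_structure` and `table_structure_options.do_cell_matching` fields read here, but their defaults are not shown in this diff:

```python
from pathlib import Path

from docling.datamodel.base_models import PipelineOptions
from docling.pipeline.standard_model_pipeline import StandardModelPipeline

options = PipelineOptions()
options.do_ocr = False              # skip the EasyOCR stage
options.do_table_structure = True   # run TableFormer on detected tables
options.table_structure_options.do_cell_matching = True

pipeline = StandardModelPipeline(
    artifacts_path=Path("./artifacts"),  # hypothetical local artifacts folder
    pipeline_options=options,
)
# pipeline.apply(page_batch) then streams pages through OCR -> layout -> tables.
```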
docling/utils/__init__.py (file without changes)