PyPI - docling - Versions diffs - 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl - Mend

docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

docling/backend/abstract_backend.py +33 -37
docling/backend/asciidoc_backend.py +431 -0
docling/backend/docling_parse_backend.py +20 -16
docling/backend/docling_parse_v2_backend.py +248 -0
docling/backend/html_backend.py +429 -0
docling/backend/md_backend.py +346 -0
docling/backend/mspowerpoint_backend.py +398 -0
docling/backend/msword_backend.py +496 -0
docling/backend/pdf_backend.py +78 -0
docling/backend/pypdfium2_backend.py +16 -11
docling/cli/main.py +96 -65
docling/datamodel/base_models.py +79 -193
docling/datamodel/document.py +405 -320
docling/datamodel/pipeline_options.py +19 -3
docling/datamodel/settings.py +16 -1
docling/document_converter.py +240 -251
docling/models/base_model.py +28 -0
docling/models/base_ocr_model.py +40 -10
docling/models/ds_glm_model.py +244 -30
docling/models/easyocr_model.py +57 -42
docling/models/layout_model.py +158 -116
docling/models/page_assemble_model.py +127 -101
docling/models/page_preprocessing_model.py +79 -0
docling/models/table_structure_model.py +162 -116
docling/models/tesseract_ocr_cli_model.py +76 -59
docling/models/tesseract_ocr_model.py +90 -58
docling/pipeline/base_pipeline.py +189 -0
docling/pipeline/simple_pipeline.py +56 -0
docling/pipeline/standard_pdf_pipeline.py +201 -0
docling/utils/export.py +4 -3
docling/utils/layout_utils.py +17 -11
docling/utils/profiling.py +62 -0
docling-2.4.1.dist-info/METADATA +154 -0
docling-2.4.1.dist-info/RECORD +45 -0
docling/pipeline/base_model_pipeline.py +0 -18
docling/pipeline/standard_model_pipeline.py +0 -66
docling-1.19.1.dist-info/METADATA +0 -380
docling-1.19.1.dist-info/RECORD +0 -34
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0

docling/models/layout_model.py CHANGED Viewed

@@ -2,8 +2,10 @@ import copy
 import logging
 import random
 import time
+from pathlib import Path
 from typing import Iterable, List
+from docling_core.types.doc import CoordOrigin, DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import ImageDraw
@@ -11,74 +13,76 @@ from docling.datamodel.base_models import (
     BoundingBox,
     Cell,
     Cluster,
-    CoordOrigin,
     LayoutPrediction,
     Page,
 )
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel
 from docling.utils import layout_utils as lu
+from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
-class LayoutModel:
+class LayoutModel(BasePageModel):
     TEXT_ELEM_LABELS = [
-        "Text",
-        "Footnote",
-        "Caption",
-        "Checkbox-Unselected",
-        "Checkbox-Selected",
-        "Section-header",
-        "Page-header",
-        "Page-footer",
-        "Code",
-        "List-item",
-        # "Title"
+        DocItemLabel.TEXT,
+        DocItemLabel.FOOTNOTE,
+        DocItemLabel.CAPTION,
+        DocItemLabel.CHECKBOX_UNSELECTED,
+        DocItemLabel.CHECKBOX_SELECTED,
+        DocItemLabel.SECTION_HEADER,
+        DocItemLabel.PAGE_HEADER,
+        DocItemLabel.PAGE_FOOTER,
+        DocItemLabel.CODE,
+        DocItemLabel.LIST_ITEM,
         # "Formula",
     ]
-    PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
+    PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
-    TABLE_LABEL = "Table"
-    FIGURE_LABEL = "Picture"
-    FORMULA_LABEL = "Formula"
+    TABLE_LABEL = DocItemLabel.TABLE
+    FIGURE_LABEL = DocItemLabel.PICTURE
+    FORMULA_LABEL = DocItemLabel.FORMULA
-    def __init__(self, config):
-        self.config = config
-        self.layout_predictor = LayoutPredictor(
-            config["artifacts_path"]
-        )  # TODO temporary
+    def __init__(self, artifacts_path: Path):
+        self.layout_predictor = LayoutPredictor(artifacts_path)  # TODO temporary
-    def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
+    def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
         MIN_INTERSECTION = 0.2
         CLASS_THRESHOLDS = {
-            "Caption": 0.35,
-            "Footnote": 0.35,
-            "Formula": 0.35,
-            "List-item": 0.35,
-            "Page-footer": 0.35,
-            "Page-header": 0.35,
-            "Picture": 0.2,  # low threshold adjust to capture chemical structures for examples.
-            "Section-header": 0.45,
-            "Table": 0.35,
-            "Text": 0.45,
-            "Title": 0.45,
-            "Document Index": 0.45,
-            "Code": 0.45,
-            "Checkbox-Selected": 0.45,
-            "Checkbox-Unselected": 0.45,
-            "Form": 0.45,
-            "Key-Value Region": 0.45,
+            DocItemLabel.CAPTION: 0.35,
+            DocItemLabel.FOOTNOTE: 0.35,
+            DocItemLabel.FORMULA: 0.35,
+            DocItemLabel.LIST_ITEM: 0.35,
+            DocItemLabel.PAGE_FOOTER: 0.35,
+            DocItemLabel.PAGE_HEADER: 0.35,
+            DocItemLabel.PICTURE: 0.2,  # low threshold adjust to capture chemical structures for examples.
+            DocItemLabel.SECTION_HEADER: 0.45,
+            DocItemLabel.TABLE: 0.35,
+            DocItemLabel.TEXT: 0.45,
+            DocItemLabel.TITLE: 0.45,
+            DocItemLabel.DOCUMENT_INDEX: 0.45,
+            DocItemLabel.CODE: 0.45,
+            DocItemLabel.CHECKBOX_SELECTED: 0.45,
+            DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
+            DocItemLabel.FORM: 0.45,
+            DocItemLabel.KEY_VALUE_REGION: 0.45,
         }
-        CLASS_REMAPPINGS = {"Document Index": "Table", "Title": "Section-header"}
+        CLASS_REMAPPINGS = {
+            DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
+            DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
+        }
         _log.debug("================= Start postprocess function ====================")
         start_time = time.time()
         # Apply Confidence Threshold to cluster predictions
         # confidence = self.conf_threshold
-        clusters_out = []
+        clusters_mod = []
-        for cluster in clusters:
+        for cluster in clusters_in:
             confidence = CLASS_THRESHOLDS[cluster.label]
             if cluster.confidence >= confidence:
                 # annotation["created_by"] = "high_conf_pred"
@@ -86,10 +90,10 @@ class LayoutModel:
                 # Remap class labels where needed.
                 if cluster.label in CLASS_REMAPPINGS.keys():
                     cluster.label = CLASS_REMAPPINGS[cluster.label]
-                clusters_out.append(cluster)
+                clusters_mod.append(cluster)
         # map to dictionary clusters and cells, with bottom left origin
-        clusters = [
+        clusters_orig = [
             {
                 "id": c.id,
                 "bbox": list(
@@ -99,7 +103,7 @@ class LayoutModel:
                 "cell_ids": [],
                 "type": c.label,
             }
-            for c in clusters
+            for c in clusters_in
         ]
         clusters_out = [
@@ -113,9 +117,11 @@ class LayoutModel:
                 "cell_ids": [],
                 "type": c.label,
             }
-            for c in clusters_out
+            for c in clusters_mod
         ]
+        del clusters_mod
         raw_cells = [
             {
                 "id": c.id,
@@ -149,7 +155,7 @@ class LayoutModel:
         # Assign orphan cells with lower confidence predictions
         clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
-            clusters_out, clusters, raw_cells, orphan_cell_indices
+            clusters_out, clusters_orig, raw_cells, orphan_cell_indices
         )
         # Refresh the cell_ids assignment, after creating new clusters using low conf predictions
@@ -178,7 +184,7 @@ class LayoutModel:
         ) = lu.cell_id_state_map(clusters_out, cell_count)
         clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
-            clusters_out, clusters, raw_cells, orphan_cell_indices
+            clusters_out, clusters_orig, raw_cells, orphan_cell_indices
         )
         _log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
@@ -237,92 +243,128 @@ class LayoutModel:
         end_time = time.time() - start_time
         _log.debug(f"Finished post processing in seconds={end_time:.3f}")
-        cells_out = [
+        cells_out_new = [
             Cell(
-                id=c["id"],
+                id=c["id"],  # type: ignore
                 bbox=BoundingBox.from_tuple(
-                    coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                    coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT  # type: ignore
                 ).to_top_left_origin(page_height),
-                text=c["text"],
+                text=c["text"],  # type: ignore
             )
             for c in cells_out
         ]
+        del cells_out
         clusters_out_new = []
         for c in clusters_out:
-            cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
+            cluster_cells = [
+                ccell for ccell in cells_out_new if ccell.id in c["cell_ids"]  # type: ignore
+            ]
             c_new = Cluster(
-                id=c["id"],
+                id=c["id"],  # type: ignore
                 bbox=BoundingBox.from_tuple(
-                    coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
+                    coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT  # type: ignore
                 ).to_top_left_origin(page_height),
-                confidence=c["confidence"],
-                label=c["type"],
+                confidence=c["confidence"],  # type: ignore
+                label=DocItemLabel(c["type"]),
                 cells=cluster_cells,
             )
             clusters_out_new.append(c_new)
-        return clusters_out_new, cells_out
+        return clusters_out_new, cells_out_new
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
         for page in page_batch:
-            clusters = []
-            for ix, pred_item in enumerate(
-                self.layout_predictor.predict(page.get_image(scale=1.0))
-            ):
-                cluster = Cluster(
-                    id=ix,
-                    label=pred_item["label"],
-                    confidence=pred_item["confidence"],
-                    bbox=BoundingBox.model_validate(pred_item),
-                    cells=[],
-                )
-                clusters.append(cluster)
-            # Map cells to clusters
-            # TODO: Remove, postprocess should take care of it anyway.
-            for cell in page.cells:
-                for cluster in clusters:
-                    if not cell.bbox.area() > 0:
-                        overlap_frac = 0.0
-                    else:
-                        overlap_frac = (
-                            cell.bbox.intersection_area_with(cluster.bbox)
-                            / cell.bbox.area()
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "layout"):
+                    assert page.size is not None
+                    clusters = []
+                    for ix, pred_item in enumerate(
+                        self.layout_predictor.predict(page.get_image(scale=1.0))
+                    ):
+                        label = DocItemLabel(
+                            pred_item["label"]
+                            .lower()
+                            .replace(" ", "_")
+                            .replace("-", "_")
+                        )  # Temporary, until docling-ibm-model uses docling-core types
+                        cluster = Cluster(
+                            id=ix,
+                            label=label,
+                            confidence=pred_item["confidence"],
+                            bbox=BoundingBox.model_validate(pred_item),
+                            cells=[],
                         )
-                    if overlap_frac > 0.5:
-                        cluster.cells.append(cell)
-            # Pre-sort clusters
-            # clusters = self.sort_clusters_by_cell_order(clusters)
-            # DEBUG code:
-            def draw_clusters_and_cells():
-                image = copy.deepcopy(page.image)
-                draw = ImageDraw.Draw(image)
-                for c in clusters:
-                    x0, y0, x1, y1 = c.bbox.as_tuple()
-                    draw.rectangle([(x0, y0), (x1, y1)], outline="green")
-                    cell_color = (
-                        random.randint(30, 140),
-                        random.randint(30, 140),
-                        random.randint(30, 140),
+                        clusters.append(cluster)
+                    # Map cells to clusters
+                    # TODO: Remove, postprocess should take care of it anyway.
+                    for cell in page.cells:
+                        for cluster in clusters:
+                            if not cell.bbox.area() > 0:
+                                overlap_frac = 0.0
+                            else:
+                                overlap_frac = (
+                                    cell.bbox.intersection_area_with(cluster.bbox)
+                                    / cell.bbox.area()
+                                )
+                            if overlap_frac > 0.5:
+                                cluster.cells.append(cell)
+                    # Pre-sort clusters
+                    # clusters = self.sort_clusters_by_cell_order(clusters)
+                    # DEBUG code:
+                    def draw_clusters_and_cells(show: bool = False):
+                        image = copy.deepcopy(page.image)
+                        if image is not None:
+                            draw = ImageDraw.Draw(image)
+                            for c in clusters:
+                                x0, y0, x1, y1 = c.bbox.as_tuple()
+                                draw.rectangle([(x0, y0), (x1, y1)], outline="green")
+                                cell_color = (
+                                    random.randint(30, 140),
+                                    random.randint(30, 140),
+                                    random.randint(30, 140),
+                                )
+                                for tc in c.cells:  # [:1]:
+                                    x0, y0, x1, y1 = tc.bbox.as_tuple()
+                                    draw.rectangle(
+                                        [(x0, y0), (x1, y1)], outline=cell_color
+                                    )
+                            if show:
+                                image.show()
+                            else:
+                                out_path: Path = (
+                                    Path(settings.debug.debug_output_path)
+                                    / f"debug_{conv_res.input.file.stem}"
+                                )
+                                out_path.mkdir(parents=True, exist_ok=True)
+                                out_file = (
+                                    out_path / f"layout_page_{page.page_no:05}.png"
+                                )
+                                image.save(str(out_file), format="png")
+                    # draw_clusters_and_cells()
+                    clusters, page.cells = self.postprocess(
+                        clusters, page.cells, page.size.height
                     )
-                    for tc in c.cells:  # [:1]:
-                        x0, y0, x1, y1 = tc.bbox.as_tuple()
-                        draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-                image.show()
-            # draw_clusters_and_cells()
-            clusters, page.cells = self.postprocess(
-                clusters, page.cells, page.size.height
-            )
-            # draw_clusters_and_cells()
+                    page.predictions.layout = LayoutPrediction(clusters=clusters)
-            page.predictions.layout = LayoutPrediction(clusters=clusters)
+                if settings.debug.visualize_layout:
+                    draw_clusters_and_cells()
-            yield page
+                yield page

docling/models/page_assemble_model.py CHANGED Viewed

@@ -2,22 +2,31 @@ import logging
 import re
 from typing import Iterable, List
+from pydantic import BaseModel
 from docling.datamodel.base_models import (
     AssembledUnit,
     FigureElement,
     Page,
     PageElement,
-    TableElement,
+    Table,
     TextElement,
 )
+from docling.datamodel.document import ConversionResult
+from docling.models.base_model import BasePageModel
 from docling.models.layout_model import LayoutModel
+from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
-class PageAssembleModel:
-    def __init__(self, config):
-        self.config = config
+class PageAssembleOptions(BaseModel):
+    keep_images: bool = False
+class PageAssembleModel(BasePageModel):
+    def __init__(self, options: PageAssembleOptions):
+        self.options = options
     def sanitize_text(self, lines):
         if len(lines) <= 1:
@@ -44,105 +53,122 @@ class PageAssembleModel:
         return sanitized_text.strip()  # Strip any leading or trailing whitespace
-    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
         for page in page_batch:
-            # assembles some JSON output page by page.
-            elements: List[PageElement] = []
-            headers: List[PageElement] = []
-            body: List[PageElement] = []
-            for cluster in page.predictions.layout.clusters:
-                # _log.info("Cluster label seen:", cluster.label)
-                if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
-                    textlines = [
-                        cell.text.replace("\x02", "-").strip()
-                        for cell in cluster.cells
-                        if len(cell.text.strip()) > 0
-                    ]
-                    text = self.sanitize_text(textlines)
-                    text_el = TextElement(
-                        label=cluster.label,
-                        id=cluster.id,
-                        text=text,
-                        page_no=page.page_no,
-                        cluster=cluster,
-                    )
-                    elements.append(text_el)
-                    if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
-                        headers.append(text_el)
-                    else:
-                        body.append(text_el)
-                elif cluster.label == LayoutModel.TABLE_LABEL:
-                    tbl = None
-                    if page.predictions.tablestructure:
-                        tbl = page.predictions.tablestructure.table_map.get(
-                            cluster.id, None
-                        )
-                    if (
-                        not tbl
-                    ):  # fallback: add table without structure, if it isn't present
-                        tbl = TableElement(
-                            label=cluster.label,
-                            id=cluster.id,
-                            text="",
-                            otsl_seq=[],
-                            table_cells=[],
-                            cluster=cluster,
-                            page_no=page.page_no,
-                        )
-                    elements.append(tbl)
-                    body.append(tbl)
-                elif cluster.label == LayoutModel.FIGURE_LABEL:
-                    fig = None
-                    if page.predictions.figures_classification:
-                        fig = page.predictions.figures_classification.figure_map.get(
-                            cluster.id, None
-                        )
-                    if (
-                        not fig
-                    ):  # fallback: add figure without classification, if it isn't present
-                        fig = FigureElement(
-                            label=cluster.label,
-                            id=cluster.id,
-                            text="",
-                            data=None,
-                            cluster=cluster,
-                            page_no=page.page_no,
-                        )
-                    elements.append(fig)
-                    body.append(fig)
-                elif cluster.label == LayoutModel.FORMULA_LABEL:
-                    equation = None
-                    if page.predictions.equations_prediction:
-                        equation = (
-                            page.predictions.equations_prediction.equation_map.get(
-                                cluster.id, None
-                            )
-                        )
-                    if not equation:  # fallback: add empty formula, if it isn't present
-                        text = self.sanitize_text(
-                            [
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "page_assemble"):
+                    assert page.predictions.layout is not None
+                    # assembles some JSON output page by page.
+                    elements: List[PageElement] = []
+                    headers: List[PageElement] = []
+                    body: List[PageElement] = []
+                    for cluster in page.predictions.layout.clusters:
+                        # _log.info("Cluster label seen:", cluster.label)
+                        if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
+                            textlines = [
                                 cell.text.replace("\x02", "-").strip()
                                 for cell in cluster.cells
                                 if len(cell.text.strip()) > 0
                             ]
-                        )
-                        equation = TextElement(
-                            label=cluster.label,
-                            id=cluster.id,
-                            cluster=cluster,
-                            page_no=page.page_no,
-                            text=text,
-                        )
-                    elements.append(equation)
-                    body.append(equation)
-            page.assembled = AssembledUnit(
-                elements=elements, headers=headers, body=body
-            )
-            yield page
+                            text = self.sanitize_text(textlines)
+                            text_el = TextElement(
+                                label=cluster.label,
+                                id=cluster.id,
+                                text=text,
+                                page_no=page.page_no,
+                                cluster=cluster,
+                            )
+                            elements.append(text_el)
+                            if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
+                                headers.append(text_el)
+                            else:
+                                body.append(text_el)
+                        elif cluster.label == LayoutModel.TABLE_LABEL:
+                            tbl = None
+                            if page.predictions.tablestructure:
+                                tbl = page.predictions.tablestructure.table_map.get(
+                                    cluster.id, None
+                                )
+                            if (
+                                not tbl
+                            ):  # fallback: add table without structure, if it isn't present
+                                tbl = Table(
+                                    label=cluster.label,
+                                    id=cluster.id,
+                                    text="",
+                                    otsl_seq=[],
+                                    table_cells=[],
+                                    cluster=cluster,
+                                    page_no=page.page_no,
+                                )
+                            elements.append(tbl)
+                            body.append(tbl)
+                        elif cluster.label == LayoutModel.FIGURE_LABEL:
+                            fig = None
+                            if page.predictions.figures_classification:
+                                fig = page.predictions.figures_classification.figure_map.get(
+                                    cluster.id, None
+                                )
+                            if (
+                                not fig
+                            ):  # fallback: add figure without classification, if it isn't present
+                                fig = FigureElement(
+                                    label=cluster.label,
+                                    id=cluster.id,
+                                    text="",
+                                    data=None,
+                                    cluster=cluster,
+                                    page_no=page.page_no,
+                                )
+                            elements.append(fig)
+                            body.append(fig)
+                        elif cluster.label == LayoutModel.FORMULA_LABEL:
+                            equation = None
+                            if page.predictions.equations_prediction:
+                                equation = page.predictions.equations_prediction.equation_map.get(
+                                    cluster.id, None
+                                )
+                            if (
+                                not equation
+                            ):  # fallback: add empty formula, if it isn't present
+                                text = self.sanitize_text(
+                                    [
+                                        cell.text.replace("\x02", "-").strip()
+                                        for cell in cluster.cells
+                                        if len(cell.text.strip()) > 0
+                                    ]
+                                )
+                                equation = TextElement(
+                                    label=cluster.label,
+                                    id=cluster.id,
+                                    cluster=cluster,
+                                    page_no=page.page_no,
+                                    text=text,
+                                )
+                            elements.append(equation)
+                            body.append(equation)
+                    page.assembled = AssembledUnit(
+                        elements=elements, headers=headers, body=body
+                    )
+                    # Remove page images (can be disabled)
+                    if not self.options.keep_images:
+                        page._image_cache = {}
+                    # Unload backend
+                    page._backend.unload()
+                yield page

docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl