PyPI - docling - Versions diffs - 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl - Mend

docling 2.0.0-py3-none-any.whl → 2.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

docling/backend/abstract_backend.py +1 -0
docling/backend/asciidoc_backend.py +435 -0
docling/backend/docling_parse_backend.py +3 -3
docling/backend/docling_parse_v2_backend.py +11 -3
docling/backend/html_backend.py +8 -1
docling/backend/md_backend.py +293 -0
docling/backend/mspowerpoint_backend.py +62 -39
docling/backend/msword_backend.py +3 -10
docling/datamodel/base_models.py +15 -9
docling/datamodel/document.py +49 -12
docling/datamodel/pipeline_options.py +3 -0
docling/document_converter.py +18 -0
docling/models/base_ocr_model.py +9 -1
docling/models/ds_glm_model.py +16 -7
docling/models/easyocr_model.py +42 -40
docling/models/layout_model.py +63 -59
docling/models/page_assemble_model.py +105 -97
docling/models/page_preprocessing_model.py +7 -3
docling/models/table_structure_model.py +94 -85
docling/models/tesseract_ocr_cli_model.py +56 -52
docling/models/tesseract_ocr_model.py +50 -45
docling/pipeline/standard_pdf_pipeline.py +7 -7
{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/METADATA +10 -9
docling-2.2.0.dist-info/RECORD +44 -0
docling-2.0.0.dist-info/RECORD +0 -42
{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/LICENSE +0 -0
{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/WHEEL +0 -0
{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/entry_points.txt +0 -0

docling/document_converter.py CHANGED Viewed

@@ -8,8 +8,10 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type
 from pydantic import BaseModel, ConfigDict, model_validator, validate_call
 from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
@@ -52,6 +54,16 @@ class PowerpointFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
+class MarkdownFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
+class AsciiDocFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = AsciiDocBackend
 class HTMLFormatOption(FormatOption):
     pipeline_cls: Type = SimplePipeline
     backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
@@ -74,6 +86,12 @@ _format_to_default_options = {
     InputFormat.PPTX: FormatOption(
         pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
     ),
+    InputFormat.MD: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
+    ),
+    InputFormat.ASCIIDOC: FormatOption(
+        pipeline_cls=SimplePipeline, backend=AsciiDocBackend
+    ),
     InputFormat.HTML: FormatOption(
         pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
     ),

docling/models/base_ocr_model.py CHANGED Viewed

@@ -69,7 +69,7 @@ class BaseOcrModel:
         coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
         # return full-page rectangle if sufficiently covered with bitmaps
-        if coverage > BITMAP_COVERAGE_TRESHOLD:
+        if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
             return [
                 BoundingBox(
                     l=0,
@@ -81,6 +81,14 @@ class BaseOcrModel:
             ]
         # return individual rectangles if the bitmap coverage is smaller
         else:  # coverage <= BITMAP_COVERAGE_TRESHOLD:
+            # skip OCR if the bitmap area on the page is smaller than the options threshold
+            ocr_rects = [
+                rect
+                for rect in ocr_rects
+                if rect.area() / (page.size.width * page.size.height)
+                > self.options.bitmap_area_threshold
+            ]
             return ocr_rects
     # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.

docling/models/ds_glm_model.py CHANGED Viewed

@@ -5,15 +5,23 @@ from typing import List, Union
 from deepsearch_glm.nlp_utils import init_nlp_model
 from deepsearch_glm.utils.doc_utils import to_docling_document
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
-from docling_core.types import BaseText
-from docling_core.types import Document as DsDocument
-from docling_core.types import DocumentDescription as DsDocumentDescription
-from docling_core.types import FileInfoObject as DsFileInfoObject
-from docling_core.types import PageDimensions, PageReference, Prov, Ref
-from docling_core.types import Table as DsSchemaTable
 from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
-from docling_core.types.legacy_doc.base import Figure, TableCell
+from docling_core.types.legacy_doc.base import (
+    Figure,
+    PageDimensions,
+    PageReference,
+    Prov,
+    Ref,
+)
+from docling_core.types.legacy_doc.base import Table as DsSchemaTable
+from docling_core.types.legacy_doc.base import TableCell
+from docling_core.types.legacy_doc.document import BaseText
+from docling_core.types.legacy_doc.document import (
+    CCSDocumentDescription as DsDocumentDescription,
+)
+from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
+from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from PIL import ImageDraw
 from pydantic import BaseModel, ConfigDict
@@ -202,6 +210,7 @@ class GlmModel:
         page_dimensions = [
             PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
             for p in conv_res.pages
+            if p.size is not None
         ]
         ds_doc: DsDocument = DsDocument(

docling/models/easyocr_model.py CHANGED Viewed

@@ -41,48 +41,50 @@ class EasyOcrModel(BaseOcrModel):
         for page in page_batch:
             assert page._backend is not None
-            ocr_rects = self.get_ocr_rects(page)
-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
-                im = numpy.array(high_res_image)
-                result = self.reader.readtext(im)
-                del high_res_image
-                del im
-                cells = [
-                    OcrCell(
-                        id=ix,
-                        text=line[1],
-                        confidence=line[2],
-                        bbox=BoundingBox.from_tuple(
-                            coord=(
-                                (line[0][0][0] / self.scale) + ocr_rect.l,
-                                (line[0][0][1] / self.scale) + ocr_rect.t,
-                                (line[0][2][0] / self.scale) + ocr_rect.l,
-                                (line[0][2][1] / self.scale) + ocr_rect.t,
-                            ),
-                            origin=CoordOrigin.TOPLEFT,
-                        ),
+            if not page._backend.is_valid():
+                yield page
+            else:
+                ocr_rects = self.get_ocr_rects(page)
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
                     )
-                    for ix, line in enumerate(result)
-                ]
-                all_ocr_cells.extend(cells)
+                    im = numpy.array(high_res_image)
+                    result = self.reader.readtext(im)
+                    del high_res_image
+                    del im
+                    cells = [
+                        OcrCell(
+                            id=ix,
+                            text=line[1],
+                            confidence=line[2],
+                            bbox=BoundingBox.from_tuple(
+                                coord=(
+                                    (line[0][0][0] / self.scale) + ocr_rect.l,
+                                    (line[0][0][1] / self.scale) + ocr_rect.t,
+                                    (line[0][2][0] / self.scale) + ocr_rect.l,
+                                    (line[0][2][1] / self.scale) + ocr_rect.t,
+                                ),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                        for ix, line in enumerate(result)
+                    ]
+                    all_ocr_cells.extend(cells)
-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                ## Remove OCR cells which overlap with programmatic cells.
+                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
-            page.cells.extend(filtered_ocr_cells)
+                page.cells.extend(filtered_ocr_cells)
-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                # DEBUG code:
+                # self.draw_ocr_rects_and_cells(page, ocr_rects)
-            yield page
+                yield page

docling/models/layout_model.py CHANGED Viewed

@@ -273,68 +273,72 @@ class LayoutModel(BasePageModel):
     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
         for page in page_batch:
-            assert page.size is not None
-            clusters = []
-            for ix, pred_item in enumerate(
-                self.layout_predictor.predict(page.get_image(scale=1.0))
-            ):
-                label = DocItemLabel(
-                    pred_item["label"].lower().replace(" ", "_").replace("-", "_")
-                )  # Temporary, until docling-ibm-model uses docling-core types
-                cluster = Cluster(
-                    id=ix,
-                    label=label,
-                    confidence=pred_item["confidence"],
-                    bbox=BoundingBox.model_validate(pred_item),
-                    cells=[],
-                )
-                clusters.append(cluster)
-            # Map cells to clusters
-            # TODO: Remove, postprocess should take care of it anyway.
-            for cell in page.cells:
-                for cluster in clusters:
-                    if not cell.bbox.area() > 0:
-                        overlap_frac = 0.0
-                    else:
-                        overlap_frac = (
-                            cell.bbox.intersection_area_with(cluster.bbox)
-                            / cell.bbox.area()
-                        )
-                    if overlap_frac > 0.5:
-                        cluster.cells.append(cell)
-            # Pre-sort clusters
-            # clusters = self.sort_clusters_by_cell_order(clusters)
-            # DEBUG code:
-            def draw_clusters_and_cells():
-                image = copy.deepcopy(page.image)
-                draw = ImageDraw.Draw(image)
-                for c in clusters:
-                    x0, y0, x1, y1 = c.bbox.as_tuple()
-                    draw.rectangle([(x0, y0), (x1, y1)], outline="green")
-                    cell_color = (
-                        random.randint(30, 140),
-                        random.randint(30, 140),
-                        random.randint(30, 140),
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                assert page.size is not None
+                clusters = []
+                for ix, pred_item in enumerate(
+                    self.layout_predictor.predict(page.get_image(scale=1.0))
+                ):
+                    label = DocItemLabel(
+                        pred_item["label"].lower().replace(" ", "_").replace("-", "_")
+                    )  # Temporary, until docling-ibm-model uses docling-core types
+                    cluster = Cluster(
+                        id=ix,
+                        label=label,
+                        confidence=pred_item["confidence"],
+                        bbox=BoundingBox.model_validate(pred_item),
+                        cells=[],
                     )
-                    for tc in c.cells:  # [:1]:
-                        x0, y0, x1, y1 = tc.bbox.as_tuple()
-                        draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-                image.show()
+                    clusters.append(cluster)
+                # Map cells to clusters
+                # TODO: Remove, postprocess should take care of it anyway.
+                for cell in page.cells:
+                    for cluster in clusters:
+                        if not cell.bbox.area() > 0:
+                            overlap_frac = 0.0
+                        else:
+                            overlap_frac = (
+                                cell.bbox.intersection_area_with(cluster.bbox)
+                                / cell.bbox.area()
+                            )
+                        if overlap_frac > 0.5:
+                            cluster.cells.append(cell)
+                # Pre-sort clusters
+                # clusters = self.sort_clusters_by_cell_order(clusters)
+                # DEBUG code:
+                def draw_clusters_and_cells():
+                    image = copy.deepcopy(page.image)
+                    draw = ImageDraw.Draw(image)
+                    for c in clusters:
+                        x0, y0, x1, y1 = c.bbox.as_tuple()
+                        draw.rectangle([(x0, y0), (x1, y1)], outline="green")
+                        cell_color = (
+                            random.randint(30, 140),
+                            random.randint(30, 140),
+                            random.randint(30, 140),
+                        )
+                        for tc in c.cells:  # [:1]:
+                            x0, y0, x1, y1 = tc.bbox.as_tuple()
+                            draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+                    image.show()
-            # draw_clusters_and_cells()
+                # draw_clusters_and_cells()
-            clusters, page.cells = self.postprocess(
-                clusters, page.cells, page.size.height
-            )
+                clusters, page.cells = self.postprocess(
+                    clusters, page.cells, page.size.height
+                )
-            # draw_clusters_and_cells()
+                # draw_clusters_and_cells()
-            page.predictions.layout = LayoutPrediction(clusters=clusters)
+                page.predictions.layout = LayoutPrediction(clusters=clusters)
-            yield page
+                yield page

docling/models/page_assemble_model.py CHANGED Viewed

@@ -54,111 +54,119 @@ class PageAssembleModel(BasePageModel):
     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
         for page in page_batch:
             assert page._backend is not None
-            assert page.predictions.layout is not None
-            # assembles some JSON output page by page.
-            elements: List[PageElement] = []
-            headers: List[PageElement] = []
-            body: List[PageElement] = []
-            for cluster in page.predictions.layout.clusters:
-                # _log.info("Cluster label seen:", cluster.label)
-                if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
-                    textlines = [
-                        cell.text.replace("\x02", "-").strip()
-                        for cell in cluster.cells
-                        if len(cell.text.strip()) > 0
-                    ]
-                    text = self.sanitize_text(textlines)
-                    text_el = TextElement(
-                        label=cluster.label,
-                        id=cluster.id,
-                        text=text,
-                        page_no=page.page_no,
-                        cluster=cluster,
-                    )
-                    elements.append(text_el)
-                    if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
-                        headers.append(text_el)
-                    else:
-                        body.append(text_el)
-                elif cluster.label == LayoutModel.TABLE_LABEL:
-                    tbl = None
-                    if page.predictions.tablestructure:
-                        tbl = page.predictions.tablestructure.table_map.get(
-                            cluster.id, None
-                        )
-                    if (
-                        not tbl
-                    ):  # fallback: add table without structure, if it isn't present
-                        tbl = Table(
-                            label=cluster.label,
-                            id=cluster.id,
-                            text="",
-                            otsl_seq=[],
-                            table_cells=[],
-                            cluster=cluster,
-                            page_no=page.page_no,
-                        )
+            if not page._backend.is_valid():
+                yield page
+            else:
+                assert page.predictions.layout is not None
-                    elements.append(tbl)
-                    body.append(tbl)
-                elif cluster.label == LayoutModel.FIGURE_LABEL:
-                    fig = None
-                    if page.predictions.figures_classification:
-                        fig = page.predictions.figures_classification.figure_map.get(
-                            cluster.id, None
-                        )
-                    if (
-                        not fig
-                    ):  # fallback: add figure without classification, if it isn't present
-                        fig = FigureElement(
+                # assembles some JSON output page by page.
+                elements: List[PageElement] = []
+                headers: List[PageElement] = []
+                body: List[PageElement] = []
+                for cluster in page.predictions.layout.clusters:
+                    # _log.info("Cluster label seen:", cluster.label)
+                    if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
+                        textlines = [
+                            cell.text.replace("\x02", "-").strip()
+                            for cell in cluster.cells
+                            if len(cell.text.strip()) > 0
+                        ]
+                        text = self.sanitize_text(textlines)
+                        text_el = TextElement(
                             label=cluster.label,
                             id=cluster.id,
-                            text="",
-                            data=None,
-                            cluster=cluster,
+                            text=text,
                             page_no=page.page_no,
+                            cluster=cluster,
                         )
-                    elements.append(fig)
-                    body.append(fig)
-                elif cluster.label == LayoutModel.FORMULA_LABEL:
-                    equation = None
-                    if page.predictions.equations_prediction:
-                        equation = (
-                            page.predictions.equations_prediction.equation_map.get(
+                        elements.append(text_el)
+                        if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
+                            headers.append(text_el)
+                        else:
+                            body.append(text_el)
+                    elif cluster.label == LayoutModel.TABLE_LABEL:
+                        tbl = None
+                        if page.predictions.tablestructure:
+                            tbl = page.predictions.tablestructure.table_map.get(
                                 cluster.id, None
                             )
-                        )
-                    if not equation:  # fallback: add empty formula, if it isn't present
-                        text = self.sanitize_text(
-                            [
-                                cell.text.replace("\x02", "-").strip()
-                                for cell in cluster.cells
-                                if len(cell.text.strip()) > 0
-                            ]
-                        )
-                        equation = TextElement(
-                            label=cluster.label,
-                            id=cluster.id,
-                            cluster=cluster,
-                            page_no=page.page_no,
-                            text=text,
-                        )
-                    elements.append(equation)
-                    body.append(equation)
+                        if (
+                            not tbl
+                        ):  # fallback: add table without structure, if it isn't present
+                            tbl = Table(
+                                label=cluster.label,
+                                id=cluster.id,
+                                text="",
+                                otsl_seq=[],
+                                table_cells=[],
+                                cluster=cluster,
+                                page_no=page.page_no,
+                            )
+                        elements.append(tbl)
+                        body.append(tbl)
+                    elif cluster.label == LayoutModel.FIGURE_LABEL:
+                        fig = None
+                        if page.predictions.figures_classification:
+                            fig = (
+                                page.predictions.figures_classification.figure_map.get(
+                                    cluster.id, None
+                                )
+                            )
+                        if (
+                            not fig
+                        ):  # fallback: add figure without classification, if it isn't present
+                            fig = FigureElement(
+                                label=cluster.label,
+                                id=cluster.id,
+                                text="",
+                                data=None,
+                                cluster=cluster,
+                                page_no=page.page_no,
+                            )
+                        elements.append(fig)
+                        body.append(fig)
+                    elif cluster.label == LayoutModel.FORMULA_LABEL:
+                        equation = None
+                        if page.predictions.equations_prediction:
+                            equation = (
+                                page.predictions.equations_prediction.equation_map.get(
+                                    cluster.id, None
+                                )
+                            )
+                        if (
+                            not equation
+                        ):  # fallback: add empty formula, if it isn't present
+                            text = self.sanitize_text(
+                                [
+                                    cell.text.replace("\x02", "-").strip()
+                                    for cell in cluster.cells
+                                    if len(cell.text.strip()) > 0
+                                ]
+                            )
+                            equation = TextElement(
+                                label=cluster.label,
+                                id=cluster.id,
+                                cluster=cluster,
+                                page_no=page.page_no,
+                                text=text,
+                            )
+                        elements.append(equation)
+                        body.append(equation)
-            page.assembled = AssembledUnit(
-                elements=elements, headers=headers, body=body
-            )
+                page.assembled = AssembledUnit(
+                    elements=elements, headers=headers, body=body
+                )
-            # Remove page images (can be disabled)
-            if not self.options.keep_images:
-                page._image_cache = {}
+                # Remove page images (can be disabled)
+                if not self.options.keep_images:
+                    page._image_cache = {}
-            # Unload backend
-            page._backend.unload()
+                # Unload backend
+                page._backend.unload()
-            yield page
+                yield page

docling/models/page_preprocessing_model.py CHANGED Viewed

@@ -17,9 +17,13 @@ class PagePreprocessingModel(BasePageModel):
     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
         for page in page_batch:
-            page = self._populate_page_images(page)
-            page = self._parse_page_cells(page)
-            yield page
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                page = self._populate_page_images(page)
+                page = self._parse_page_cells(page)
+                yield page
     # Generate the page image and store it in the page object
     def _populate_page_images(self, page: Page) -> Page:

docling 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

docling 2.0.0-py3-none-any.whl → 2.2.0-py3-none-any.whl