PyPI - docling - Versions diffs - 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl - Mend

docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

docling/backend/abstract_backend.py +32 -37
docling/backend/docling_parse_backend.py +16 -12
docling/backend/docling_parse_v2_backend.py +240 -0
docling/backend/html_backend.py +425 -0
docling/backend/mspowerpoint_backend.py +375 -0
docling/backend/msword_backend.py +509 -0
docling/backend/pdf_backend.py +78 -0
docling/backend/pypdfium2_backend.py +15 -10
docling/cli/main.py +61 -60
docling/datamodel/base_models.py +73 -193
docling/datamodel/document.py +379 -324
docling/datamodel/pipeline_options.py +16 -0
docling/datamodel/settings.py +1 -0
docling/document_converter.py +215 -252
docling/models/base_model.py +25 -0
docling/models/base_ocr_model.py +19 -6
docling/models/ds_glm_model.py +220 -22
docling/models/easyocr_model.py +45 -40
docling/models/layout_model.py +130 -114
docling/models/page_assemble_model.py +119 -95
docling/models/page_preprocessing_model.py +61 -0
docling/models/table_structure_model.py +122 -111
docling/models/tesseract_ocr_cli_model.py +65 -58
docling/models/tesseract_ocr_model.py +58 -50
docling/pipeline/base_pipeline.py +190 -0
docling/pipeline/simple_pipeline.py +59 -0
docling/pipeline/standard_pdf_pipeline.py +198 -0
docling/utils/export.py +4 -3
docling/utils/layout_utils.py +17 -11
docling-2.1.0.dist-info/METADATA +149 -0
docling-2.1.0.dist-info/RECORD +42 -0
docling/pipeline/base_model_pipeline.py +0 -18
docling/pipeline/standard_model_pipeline.py +0 -66
docling-1.19.0.dist-info/METADATA +0 -380
docling-1.19.0.dist-info/RECORD +0 -34
{docling-1.19.0.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
{docling-1.19.0.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
{docling-1.19.0.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0

docling/models/ds_glm_model.py CHANGED Viewed

@@ -1,39 +1,237 @@
 import copy
 import random
+from typing import List, Union
 from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.doc_utils import to_legacy_document_format
+from deepsearch_glm.utils.doc_utils import to_docling_document
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
-from docling_core.types import BaseText
-from docling_core.types import Document as DsDocument
-from docling_core.types import Ref
+from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
+from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
+from docling_core.types.legacy_doc.base import (
+    Figure,
+    PageDimensions,
+    PageReference,
+    Prov,
+    Ref,
+)
+from docling_core.types.legacy_doc.base import Table as DsSchemaTable
+from docling_core.types.legacy_doc.base import TableCell
+from docling_core.types.legacy_doc.document import BaseText
+from docling_core.types.legacy_doc.document import (
+    CCSDocumentDescription as DsDocumentDescription,
+)
+from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
+from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from PIL import ImageDraw
+from pydantic import BaseModel, ConfigDict
-from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
-from docling.datamodel.document import ConversionResult
+from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
+from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
+from docling.utils.utils import create_hash
+class GlmOptions(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())
+    model_names: str = ""  # e.g. "language;term;reference"
 class GlmModel:
-    def __init__(self, config):
-        self.config = config
-        self.model_names = self.config.get(
-            "model_names", ""
-        )  # "language;term;reference"
+    def __init__(self, options: GlmOptions):
+        self.options = options
         load_pretrained_nlp_models()
-        # model = init_nlp_model(model_names="language;term;reference")
-        model = init_nlp_model(model_names=self.model_names)
-        self.model = model
+        self.model = init_nlp_model(model_names=self.options.model_names)
+    def _to_legacy_document(self, conv_res) -> DsDocument:
+        title = ""
+        desc: DsDocumentDescription = DsDocumentDescription(logs=[])
+        page_hashes = [
+            PageReference(
+                hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
+                page=p.page_no + 1,
+                model="default",
+            )
+            for p in conv_res.pages
+        ]
+        file_info = DsFileInfoObject(
+            filename=conv_res.input.file.name,
+            document_hash=conv_res.input.document_hash,
+            num_pages=conv_res.input.page_count,
+            page_hashes=page_hashes,
+        )
+        main_text: List[Union[Ref, BaseText]] = []
+        tables: List[DsSchemaTable] = []
+        figures: List[Figure] = []
+        page_no_to_page = {p.page_no: p for p in conv_res.pages}
+        for element in conv_res.assembled.elements:
+            # Convert bboxes to lower-left origin.
+            target_bbox = DsBoundingBox(
+                element.cluster.bbox.to_bottom_left_origin(
+                    page_no_to_page[element.page_no].size.height
+                ).as_tuple()
+            )
+            if isinstance(element, TextElement):
+                main_text.append(
+                    BaseText(
+                        text=element.text,
+                        obj_type=layout_label_to_ds_type.get(element.label),
+                        name=element.label,
+                        prov=[
+                            Prov(
+                                bbox=target_bbox,
+                                page=element.page_no + 1,
+                                span=[0, len(element.text)],
+                            )
+                        ],
+                    )
+                )
+            elif isinstance(element, Table):
+                index = len(tables)
+                ref_str = f"#/tables/{index}"
+                main_text.append(
+                    Ref(
+                        name=element.label,
+                        obj_type=layout_label_to_ds_type.get(element.label),
+                        ref=ref_str,
+                    ),
+                )
+                # Initialise empty table data grid (only empty cells)
+                table_data = [
+                    [
+                        TableCell(
+                            text="",
+                            # bbox=[0,0,0,0],
+                            spans=[[i, j]],
+                            obj_type="body",
+                        )
+                        for j in range(element.num_cols)
+                    ]
+                    for i in range(element.num_rows)
+                ]
-    def __call__(self, conv_res: ConversionResult) -> DsDocument:
-        ds_doc = conv_res._to_ds_document()
+                # Overwrite cells in table data for which there is actual cell content.
+                for cell in element.table_cells:
+                    for i in range(
+                        min(cell.start_row_offset_idx, element.num_rows),
+                        min(cell.end_row_offset_idx, element.num_rows),
+                    ):
+                        for j in range(
+                            min(cell.start_col_offset_idx, element.num_cols),
+                            min(cell.end_col_offset_idx, element.num_cols),
+                        ):
+                            celltype = "body"
+                            if cell.column_header:
+                                celltype = "col_header"
+                            elif cell.row_header:
+                                celltype = "row_header"
+                            elif cell.row_section:
+                                celltype = "row_section"
+                            def make_spans(cell):
+                                for rspan in range(
+                                    min(cell.start_row_offset_idx, element.num_rows),
+                                    min(cell.end_row_offset_idx, element.num_rows),
+                                ):
+                                    for cspan in range(
+                                        min(
+                                            cell.start_col_offset_idx, element.num_cols
+                                        ),
+                                        min(cell.end_col_offset_idx, element.num_cols),
+                                    ):
+                                        yield [rspan, cspan]
+                            spans = list(make_spans(cell))
+                            if cell.bbox is not None:
+                                bbox = cell.bbox.to_bottom_left_origin(
+                                    page_no_to_page[element.page_no].size.height
+                                ).as_tuple()
+                            else:
+                                bbox = None
+                            table_data[i][j] = TableCell(
+                                text=cell.text,
+                                bbox=bbox,
+                                # col=j,
+                                # row=i,
+                                spans=spans,
+                                obj_type=celltype,
+                                # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
+                                # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
+                            )
+                tables.append(
+                    DsSchemaTable(
+                        num_cols=element.num_cols,
+                        num_rows=element.num_rows,
+                        obj_type=layout_label_to_ds_type.get(element.label),
+                        data=table_data,
+                        prov=[
+                            Prov(
+                                bbox=target_bbox,
+                                page=element.page_no + 1,
+                                span=[0, 0],
+                            )
+                        ],
+                    )
+                )
+            elif isinstance(element, FigureElement):
+                index = len(figures)
+                ref_str = f"#/figures/{index}"
+                main_text.append(
+                    Ref(
+                        name=element.label,
+                        obj_type=layout_label_to_ds_type.get(element.label),
+                        ref=ref_str,
+                    ),
+                )
+                figures.append(
+                    Figure(
+                        prov=[
+                            Prov(
+                                bbox=target_bbox,
+                                page=element.page_no + 1,
+                                span=[0, 0],
+                            )
+                        ],
+                        obj_type=layout_label_to_ds_type.get(element.label),
+                        # data=[[]],
+                    )
+                )
+        page_dimensions = [
+            PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
+            for p in conv_res.pages
+            if p.size is not None
+        ]
+        ds_doc: DsDocument = DsDocument(
+            name=title,
+            description=desc,
+            file_info=file_info,
+            main_text=main_text,
+            tables=tables,
+            figures=figures,
+            page_dimensions=page_dimensions,
+        )
+        return ds_doc
+    def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
+        ds_doc = self._to_legacy_document(conv_res)
         ds_doc_dict = ds_doc.model_dump(by_alias=True)
         glm_doc = self.model.apply_on_doc(ds_doc_dict)
-        ds_doc_dict = to_legacy_document_format(
-            glm_doc, ds_doc_dict, update_name_label=True
-        )
-        exported_doc = DsDocument.model_validate(ds_doc_dict)
+        docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
         # DEBUG code:
         def draw_clusters_and_cells(ds_document, page_no):
@@ -48,7 +246,7 @@ class GlmModel:
                     if arr == "tables":
                         prov = ds_document.tables[index].prov[0]
                     elif arr == "figures":
-                        prov = ds_document.figures[index].prov[0]
+                        prov = ds_document.pictures[index].prov[0]
                     else:
                         prov = None
@@ -83,4 +281,4 @@ class GlmModel:
         # draw_clusters_and_cells(ds_doc, 0)
         # draw_clusters_and_cells(exported_doc, 0)
-        return exported_doc
+        return docling_doc

docling/models/easyocr_model.py CHANGED Viewed

@@ -2,8 +2,9 @@ import logging
 from typing import Iterable
 import numpy
+from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.pipeline_options import EasyOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel
@@ -39,47 +40,51 @@ class EasyOcrModel(BaseOcrModel):
             return
         for page in page_batch:
-            ocr_rects = self.get_ocr_rects(page)
-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
-                im = numpy.array(high_res_image)
-                result = self.reader.readtext(im)
-                del high_res_image
-                del im
-                cells = [
-                    OcrCell(
-                        id=ix,
-                        text=line[1],
-                        confidence=line[2],
-                        bbox=BoundingBox.from_tuple(
-                            coord=(
-                                (line[0][0][0] / self.scale) + ocr_rect.l,
-                                (line[0][0][1] / self.scale) + ocr_rect.t,
-                                (line[0][2][0] / self.scale) + ocr_rect.l,
-                                (line[0][2][1] / self.scale) + ocr_rect.t,
-                            ),
-                            origin=CoordOrigin.TOPLEFT,
-                        ),
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                ocr_rects = self.get_ocr_rects(page)
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
                     )
-                    for ix, line in enumerate(result)
-                ]
-                all_ocr_cells.extend(cells)
+                    im = numpy.array(high_res_image)
+                    result = self.reader.readtext(im)
+                    del high_res_image
+                    del im
+                    cells = [
+                        OcrCell(
+                            id=ix,
+                            text=line[1],
+                            confidence=line[2],
+                            bbox=BoundingBox.from_tuple(
+                                coord=(
+                                    (line[0][0][0] / self.scale) + ocr_rect.l,
+                                    (line[0][0][1] / self.scale) + ocr_rect.t,
+                                    (line[0][2][0] / self.scale) + ocr_rect.l,
+                                    (line[0][2][1] / self.scale) + ocr_rect.t,
+                                ),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                        for ix, line in enumerate(result)
+                    ]
+                    all_ocr_cells.extend(cells)
-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                ## Remove OCR cells which overlap with programmatic cells.
+                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
-            page.cells.extend(filtered_ocr_cells)
+                page.cells.extend(filtered_ocr_cells)
-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                # DEBUG code:
+                # self.draw_ocr_rects_and_cells(page, ocr_rects)
-            yield page
+                yield page

docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl