docling 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +55 -0
- docling/backend/pypdfium2_backend.py +223 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/base_models.py +247 -0
- docling/datamodel/document.py +351 -0
- docling/datamodel/settings.py +32 -0
- docling/document_converter.py +207 -0
- docling/models/__init__.py +0 -0
- docling/models/ds_glm_model.py +82 -0
- docling/models/easyocr_model.py +77 -0
- docling/models/layout_model.py +318 -0
- docling/models/page_assemble_model.py +160 -0
- docling/models/table_structure_model.py +114 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/base_model_pipeline.py +18 -0
- docling/pipeline/standard_model_pipeline.py +40 -0
- docling/utils/__init__.py +0 -0
- docling/utils/layout_utils.py +806 -0
- docling/utils/utils.py +41 -0
- docling-0.1.0.dist-info/LICENSE +21 -0
- docling-0.1.0.dist-info/METADATA +130 -0
- docling-0.1.0.dist-info/RECORD +25 -0
- docling-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,351 @@
|
|
1
|
+
import logging
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path, PurePath
|
4
|
+
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
|
5
|
+
|
6
|
+
from deepsearch.documents.core.export import export_to_markdown
|
7
|
+
from docling_core.types import BaseCell, BaseText
|
8
|
+
from docling_core.types import BoundingBox as DsBoundingBox
|
9
|
+
from docling_core.types import Document as DsDocument
|
10
|
+
from docling_core.types import DocumentDescription as DsDocumentDescription
|
11
|
+
from docling_core.types import FileInfoObject as DsFileInfoObject
|
12
|
+
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
13
|
+
from docling_core.types import Table as DsSchemaTable
|
14
|
+
from docling_core.types import TableCell
|
15
|
+
from pydantic import BaseModel
|
16
|
+
|
17
|
+
from docling.backend.abstract_backend import PdfDocumentBackend
|
18
|
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
19
|
+
from docling.datamodel.base_models import (
|
20
|
+
AssembledUnit,
|
21
|
+
ConversionStatus,
|
22
|
+
DocumentStream,
|
23
|
+
FigureElement,
|
24
|
+
Page,
|
25
|
+
TableElement,
|
26
|
+
TextElement,
|
27
|
+
)
|
28
|
+
from docling.datamodel.settings import DocumentLimits
|
29
|
+
from docling.utils.utils import create_file_hash
|
30
|
+
|
31
|
+
_log = logging.getLogger(__name__)

# Mapping from layout-model labels (as produced by the layout model /
# page assembly) to docling-core document item types.
#
# NOTE: "Document Index" previously mapped to "table-of-path_or_stream",
# which is a find/replace artifact (a `contents` -> `path_or_stream`
# rename also hit the string literal "table-of-contents"). Restored to
# the correct DS type name.
layout_label_to_ds_type = {
    "Title": "title",
    "Document Index": "table-of-contents",
    "Section-header": "subtitle-level-1",
    "Checkbox-Selected": "checkbox-selected",
    "Checkbox-Unselected": "checkbox-unselected",
    "Caption": "caption",
    "Page-header": "page-header",
    "Page-footer": "page-footer",
    "Footnote": "footnote",
    "Table": "table",
    "Formula": "equation",
    "List-item": "paragraph",
    "Code": "paragraph",
    "Picture": "figure",
    "Text": "paragraph",
}
|
50
|
+
|
51
|
+
|
52
|
+
class InputDocument(BaseModel):
    """A single input document (PDF) to be converted.

    Wraps either a filesystem path or an in-memory stream, validates it
    against the configured :class:`DocumentLimits`, computes a content
    hash, and opens a PDF backend for page access. On any failure (file
    missing, oversized, too many pages, backend error) the instance is
    left with ``valid=False`` instead of raising.
    """

    # Path of the source file; for streams, a PurePath built from `filename`.
    file: PurePath = None
    # Content hash of the document (set only when the size limit passes).
    document_hash: Optional[str] = None
    # True only if the document was opened and passed all limit checks.
    valid: bool = False
    limits: DocumentLimits = DocumentLimits()

    filesize: Optional[int] = None
    page_count: Optional[int] = None

    _backend: PdfDocumentBackend = None  # Internal PDF backend used

    def __init__(
        self,
        path_or_stream: Union[BytesIO, Path],
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
        pdf_backend=PyPdfiumDocumentBackend,
    ):
        """Open and validate the document.

        :param path_or_stream: source PDF, either a Path or a BytesIO stream.
        :param filename: display name; required when a stream is given
            (NOTE(review): a stream with ``filename=None`` would make
            ``PurePath(filename)`` raise — confirm callers always pass it).
        :param limits: size/page guardrails; defaults to unlimited.
        :param pdf_backend: backend class used to parse the PDF.
        """
        super().__init__()

        self.limits = limits or DocumentLimits()

        try:
            if isinstance(path_or_stream, Path):
                self.file = path_or_stream
                self.filesize = path_or_stream.stat().st_size
                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    # Hash and backend are only created for size-compliant files.
                    self.document_hash = create_file_hash(path_or_stream)
                    self._backend = pdf_backend(path_or_stream=path_or_stream)

            elif isinstance(path_or_stream, BytesIO):
                self.file = PurePath(filename)
                self.filesize = path_or_stream.getbuffer().nbytes

                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
                    self._backend = pdf_backend(path_or_stream=path_or_stream)

            # document_hash doubles as "passed the size check" flag here.
            if self.document_hash and self._backend.page_count() > 0:
                self.page_count = self._backend.page_count()

                if self.page_count <= self.limits.max_num_pages:
                    self.valid = True

        # Errors are swallowed deliberately: the document simply stays
        # invalid (valid=False) and conversion later reports FAILURE.
        except (FileNotFoundError, OSError) as e:
            _log.exception(
                f"File {self.file.name} not found or cannot be opened.", exc_info=e
            )
            # raise
        except RuntimeError as e:
            _log.exception(
                f"An unexpected error occurred while opening the document {self.file.name}",
                exc_info=e,
            )
            # raise
|
111
|
+
|
112
|
+
|
113
|
+
class ConvertedDocument(BaseModel):
    """Conversion result for one :class:`InputDocument`.

    Holds the per-page parse/assembly results, the document-level
    assembled elements, and (after the GLM step) the exported
    docling-core document.
    """

    input: InputDocument

    status: ConversionStatus = ConversionStatus.PENDING  # failure, success
    errors: List[Dict] = []  # structure to keep errors

    pages: List[Page] = []
    assembled: AssembledUnit = None

    output: DsDocument = None

    def to_ds_document(self) -> DsDocument:
        """Export the assembled elements into a docling-core ``Document``.

        Walks ``self.assembled.elements`` in reading order, turning text
        elements into ``BaseText`` entries and table/figure elements into
        ``Ref`` entries in ``main_text`` plus payloads in ``tables`` /
        ``figures``. All bounding boxes are converted to a bottom-left
        coordinate origin using each page's height.
        """
        title = ""
        desc = DsDocumentDescription(logs=[])

        page_hashes = [
            PageReference(hash=p.page_hash, page=p.page_no, model="default")
            for p in self.pages
        ]

        file_info = DsFileInfoObject(
            filename=self.input.file.name,
            document_hash=self.input.document_hash,
            num_pages=self.input.page_count,
            page_hashes=page_hashes,
        )

        main_text = []
        tables = []
        figures = []

        page_no_to_page = {p.page_no: p for p in self.pages}

        for element in self.assembled.elements:
            # Convert bboxes to lower-left origin.
            target_bbox = DsBoundingBox(
                element.cluster.bbox.to_bottom_left_origin(
                    page_no_to_page[element.page_no].size.height
                ).as_tuple()
            )

            if isinstance(element, TextElement):
                main_text.append(
                    BaseText(
                        text=element.text,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        name=element.label,
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no,
                                span=[0, len(element.text)],
                            )
                        ],
                    )
                )
            elif isinstance(element, TableElement):
                # Tables live in `tables`; main_text only carries a reference.
                index = len(tables)
                ref_str = f"#/tables/{index}"
                main_text.append(
                    Ref(
                        name=element.label,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        ref=ref_str,
                    ),
                )

                # Initialise empty table data grid (only empty cells)
                table_data = [
                    [
                        TableCell(
                            text="",
                            spans=[[i, j]],
                            obj_type="body",
                        )
                        for j in range(element.num_cols)
                    ]
                    for i in range(element.num_rows)
                ]

                # Overwrite cells in table data for which there is actual cell content.
                for cell in element.table_cells:
                    # Cell type and span list depend only on the source cell,
                    # not on the grid slot (i, j) — compute them once per cell.
                    # (Previously a `make_spans` helper was re-defined and
                    # re-evaluated inside the innermost loop.)
                    celltype = "body"
                    if cell.column_header:
                        celltype = "col_header"
                    elif cell.row_header:
                        celltype = "row_header"

                    # Row/col indices are clamped to the table dimensions.
                    spans = [
                        [rspan, cspan]
                        for rspan in range(
                            min(cell.start_row_offset_idx, element.num_rows),
                            min(cell.end_row_offset_idx, element.num_rows),
                        )
                        for cspan in range(
                            min(cell.start_col_offset_idx, element.num_cols),
                            min(cell.end_col_offset_idx, element.num_cols),
                        )
                    ]

                    for i in range(
                        min(cell.start_row_offset_idx, element.num_rows),
                        min(cell.end_row_offset_idx, element.num_rows),
                    ):
                        for j in range(
                            min(cell.start_col_offset_idx, element.num_cols),
                            min(cell.end_col_offset_idx, element.num_cols),
                        ):
                            table_data[i][j] = TableCell(
                                text=cell.text,
                                bbox=cell.bbox.to_bottom_left_origin(
                                    page_no_to_page[element.page_no].size.height
                                ).as_tuple(),
                                spans=spans,
                                obj_type=celltype,
                            )

                tables.append(
                    DsSchemaTable(
                        num_cols=element.num_cols,
                        num_rows=element.num_rows,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        data=table_data,
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no,
                                span=[0, 0],
                            )
                        ],
                    )
                )

            elif isinstance(element, FigureElement):
                # Figures live in `figures`; main_text only carries a reference.
                index = len(figures)
                ref_str = f"#/figures/{index}"
                main_text.append(
                    Ref(
                        name=element.label,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        ref=ref_str,
                    ),
                )
                figures.append(
                    BaseCell(
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no,
                                span=[0, 0],
                            )
                        ],
                        obj_type=layout_label_to_ds_type.get(element.label),
                    )
                )

        page_dimensions = [
            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
            for p in self.pages
        ]

        ds_doc = DsDocument(
            name=title,
            description=desc,
            file_info=file_info,
            main_text=main_text,
            tables=tables,
            figures=figures,
            page_dimensions=page_dimensions,
        )

        return ds_doc

    def render_as_dict(self):
        """Return the exported document as a plain dict, or {} if not exported yet."""
        if self.output:
            return self.output.model_dump(by_alias=True, exclude_none=True)
        else:
            return {}

    def render_as_markdown(self):
        """Return the exported document as Markdown, or "" if not exported yet."""
        if self.output:
            return export_to_markdown(
                self.output.model_dump(by_alias=True, exclude_none=True)
            )
        else:
            return ""
|
307
|
+
|
308
|
+
|
309
|
+
class DocumentConversionInput(BaseModel):
    """Describes a batch of documents to convert (paths or in-memory streams)."""

    _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
    limits: Optional[DocumentLimits] = DocumentLimits()

    DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend

    def docs(
        self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
    ) -> Iterable[InputDocument]:
        """Lazily yield an :class:`InputDocument` per registered source.

        Sources that are neither a ``Path`` nor a ``DocumentStream`` are
        silently skipped.
        """
        backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND

        for source in self._path_or_stream_iterator:
            if isinstance(source, DocumentStream):
                yield InputDocument(
                    path_or_stream=source.stream,
                    filename=source.filename,
                    limits=self.limits,
                    pdf_backend=backend,
                )
            elif isinstance(source, Path):
                yield InputDocument(
                    path_or_stream=source, limits=self.limits, pdf_backend=backend
                )

    @classmethod
    def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
        """Build a conversion input from filesystem paths (coerced to ``Path``)."""
        batch = cls(limits=limits)
        batch._path_or_stream_iterator = [Path(p) for p in paths]
        return batch

    @classmethod
    def from_streams(
        cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
    ):
        """Build a conversion input from in-memory document streams."""
        batch = cls(limits=limits)
        batch._path_or_stream_iterator = streams
        return batch
|
@@ -0,0 +1,32 @@
|
|
1
|
+
import sys
|
2
|
+
|
3
|
+
from pydantic import BaseModel
|
4
|
+
from pydantic_settings import BaseSettings
|
5
|
+
|
6
|
+
|
7
|
+
class DocumentLimits(BaseModel):
    """Guardrails applied to a single input document before conversion."""

    # Maximum allowed number of pages; default is effectively "no limit".
    max_num_pages: int = sys.maxsize
    # Maximum allowed file size in bytes; default is effectively "no limit".
    max_file_size: int = sys.maxsize
|
10
|
+
|
11
|
+
|
12
|
+
class BatchConcurrencySettings(BaseModel):
    """Batch sizes and concurrency knobs for the conversion pipeline."""

    # Number of documents processed per batch.
    doc_batch_size: int = 2
    # Parallelism across documents within a batch.
    doc_batch_concurrency: int = 2
    # Number of pages processed per batch within a document.
    page_batch_size: int = 4
    # Parallelism across pages within a page batch.
    page_batch_concurrency: int = 2

    # doc_batch_size: int = 1
    # doc_batch_concurrency: int = 1
    # page_batch_size: int = 1
    # page_batch_concurrency: int = 1

    # model_concurrency: int = 2

    # To force models into single core: export OMP_NUM_THREADS=1
|
26
|
+
|
27
|
+
|
28
|
+
class AppSettings(BaseSettings):
    """Application settings (pydantic BaseSettings, so overridable via environment)."""

    perf: BatchConcurrencySettings


# Module-level settings singleton shared across the pipeline.
settings = AppSettings(perf=BatchConcurrencySettings())
|
@@ -0,0 +1,207 @@
|
|
1
|
+
import functools
|
2
|
+
import logging
|
3
|
+
import time
|
4
|
+
import traceback
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import Iterable, Optional, Type, Union
|
7
|
+
|
8
|
+
from PIL import ImageDraw
|
9
|
+
|
10
|
+
from docling.backend.abstract_backend import PdfDocumentBackend
|
11
|
+
from docling.datamodel.base_models import (
|
12
|
+
AssembledUnit,
|
13
|
+
ConversionStatus,
|
14
|
+
Page,
|
15
|
+
PipelineOptions,
|
16
|
+
)
|
17
|
+
from docling.datamodel.document import (
|
18
|
+
ConvertedDocument,
|
19
|
+
DocumentConversionInput,
|
20
|
+
InputDocument,
|
21
|
+
)
|
22
|
+
from docling.datamodel.settings import settings
|
23
|
+
from docling.models.ds_glm_model import GlmModel
|
24
|
+
from docling.models.page_assemble_model import PageAssembleModel
|
25
|
+
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
26
|
+
from docling.pipeline.standard_model_pipeline import StandardModelPipeline
|
27
|
+
from docling.utils.utils import chunkify, create_hash
|
28
|
+
|
29
|
+
_log = logging.getLogger(__name__)
|
30
|
+
|
31
|
+
|
32
|
+
class DocumentConverter:
    """End-to-end PDF conversion: pages -> model pipeline -> assembled DS document."""

    # Relative locations of model weights inside the artifacts directory.
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(
        self,
        artifacts_path: Optional[Union[Path, str]] = None,
        pipeline_options: PipelineOptions = PipelineOptions(),
        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
    ):
        """Set up the model pipeline and helper models.

        :param artifacts_path: directory with model weights; downloaded from
            HuggingFace Hub when not given.
        :param pipeline_options: options forwarded to the model pipeline.
        :param pdf_backend: backend class used to open input PDFs.
        :param pipeline_cls: pipeline implementation to instantiate.
        """
        if not artifacts_path:
            artifacts_path = self.download_models_hf()

        artifacts_path = Path(artifacts_path)

        self.model_pipeline = pipeline_cls(
            artifacts_path=artifacts_path, pipeline_options=pipeline_options
        )

        self.page_assemble_model = PageAssembleModel(config={})
        self.glm_model = GlmModel(config={})
        self.pdf_backend = pdf_backend

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        """Download the docling model artifacts from HuggingFace Hub.

        :param local_dir: target directory (HF default cache when None).
        :param force: re-download even if already cached.
        :returns: path to the downloaded snapshot.
        """
        # Imported lazily so huggingface_hub is only needed when downloading.
        from huggingface_hub import snapshot_download

        download_path = snapshot_download(
            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
        )

        return Path(download_path)

    def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
        """Convert all documents in `input`, yielding one result per document."""

        for input_batch in chunkify(
            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
        ):
            _log.info(f"Going to convert document batch...")
            # parallel processing only within input_batch
            # with ThreadPoolExecutor(
            #    max_workers=settings.perf.doc_batch_concurrency
            # ) as pool:
            #   yield from pool.map(self.process_document, input_batch)

            # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
            yield from map(self.process_document, input_batch)

    def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
        """Run the full pipeline on one document; never raises, reports status."""
        start_doc_time = time.time()
        converted_doc = ConvertedDocument(input=in_doc)

        if not in_doc.valid:
            converted_doc.status = ConversionStatus.FAILURE
            return converted_doc

        for i in range(0, in_doc.page_count):
            converted_doc.pages.append(Page(page_no=i))

        all_assembled_pages = []

        try:
            # Iterate batches of pages (page_batch_size) in the doc
            for page_batch in chunkify(
                converted_doc.pages, settings.perf.page_batch_size
            ):

                start_pb_time = time.time()
                # Pipeline: the map() stages below are lazy; nothing runs
                # until the assembled_pages loop consumes them.

                # 1. Initialise the page resources
                init_pages = map(
                    functools.partial(self.initialize_page, in_doc), page_batch
                )

                # 2. Populate page image
                pages_with_images = map(
                    functools.partial(self.populate_page_images, in_doc), init_pages
                )

                # 3. Populate programmatic page cells
                pages_with_cells = map(
                    functools.partial(self.parse_page_cells, in_doc),
                    pages_with_images,
                )

                # 4.-6. presumably layout/table/OCR models, applied inside
                # the configured model pipeline — see pipeline_cls.
                pipeline_pages = self.model_pipeline.apply(pages_with_cells)

                # 7. Assemble page elements (per page)
                assembled_pages = self.page_assemble_model(pipeline_pages)

                # exhaust assembled_pages
                for assembled_page in assembled_pages:
                    # Free up mem resources before moving on with next batch
                    assembled_page.image = (
                        None  # Comment this if you want to visualize page images
                    )
                    assembled_page._backend.unload()

                    all_assembled_pages.append(assembled_page)

                end_pb_time = time.time() - start_pb_time
                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")

            # Free up mem resources of PDF backend
            in_doc._backend.unload()

            converted_doc.pages = all_assembled_pages
            self.assemble_doc(converted_doc)

            converted_doc.status = ConversionStatus.SUCCESS

        # Broad catch is intentional: a failed document must not abort the batch.
        except Exception as e:
            converted_doc.status = ConversionStatus.FAILURE
            trace = "\n".join(traceback.format_exception(e))
            _log.info(f"Encountered an error during conversion: {trace}")

        end_doc_time = time.time() - start_doc_time
        _log.info(
            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
        )

        return converted_doc

    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        page._backend = doc._backend.load_page(page.page_no)
        page.size = page._backend.get_size()
        # Page hash derives from the document hash plus the page number.
        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))

        return page

    # Generate the page image and store it in the page object
    def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
        page.image = page._backend.get_page_image()

        return page

    # Extract and populate the page cells and store it in the page object
    def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
        page.cells = page._backend.get_text_cells()

        # DEBUG code:
        def draw_text_boxes(image, cells):
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
            image.show()

        # draw_text_boxes(page.image, cells)

        return page

    def assemble_doc(self, converted_doc: ConvertedDocument):
        """Merge per-page assembled elements and run the GLM export step."""
        all_elements = []
        all_headers = []
        all_body = []

        for p in converted_doc.pages:

            for el in p.assembled.body:
                all_body.append(el)
            for el in p.assembled.headers:
                all_headers.append(el)
            for el in p.assembled.elements:
                all_elements.append(el)

        converted_doc.assembled = AssembledUnit(
            elements=all_elements, headers=all_headers, body=all_body
        )

        converted_doc.output = self.glm_model(converted_doc)
|
File without changes
|
@@ -0,0 +1,82 @@
|
|
1
|
+
import copy
|
2
|
+
import random
|
3
|
+
|
4
|
+
from deepsearch_glm.nlp_utils import init_nlp_model
|
5
|
+
from deepsearch_glm.utils.ds_utils import to_legacy_document_format
|
6
|
+
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
7
|
+
from docling_core.types import BaseText
|
8
|
+
from docling_core.types import Document as DsDocument
|
9
|
+
from docling_core.types import Ref
|
10
|
+
from PIL import ImageDraw
|
11
|
+
|
12
|
+
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
|
13
|
+
from docling.datamodel.document import ConvertedDocument
|
14
|
+
|
15
|
+
|
16
|
+
class GlmModel:
    """Post-processing step: run the deepsearch GLM NLP model over the
    exported DS document and convert the result back to the legacy
    document format."""

    def __init__(self, config):
        # `config` is currently unused beyond storage — kept for interface
        # symmetry with the other model classes.
        self.config = config
        load_pretrained_nlp_models()
        model = init_nlp_model(model_names="language;term;reference")
        self.model = model

    def __call__(self, document: ConvertedDocument) -> DsDocument:
        """Apply the GLM model to `document` and return the enriched DS document."""
        ds_doc = document.to_ds_document()
        ds_doc_dict = ds_doc.model_dump(by_alias=True)

        glm_doc = self.model.apply_on_doc(ds_doc_dict)
        # Convert the GLM output back into the legacy DS document schema.
        ds_doc_dict = to_legacy_document_format(
            glm_doc, ds_doc_dict, update_name_label=True
        )

        exported_doc = DsDocument.model_validate(ds_doc_dict)

        # DEBUG code: visualize clusters/cells of one page; requires
        # page images to still be present (see converter's page cleanup).
        def draw_clusters_and_cells(ds_document, page_no):
            clusters_to_draw = []
            image = copy.deepcopy(document.pages[page_no].image)
            for ix, elem in enumerate(ds_document.main_text):
                if isinstance(elem, BaseText):
                    prov = elem.prov[0]
                elif isinstance(elem, Ref):
                    # Resolve "#/tables/N" / "#/figures/N" references.
                    _, arr, index = elem.ref.split("/")
                    index = int(index)
                    if arr == "tables":
                        prov = ds_document.tables[index].prov[0]
                    elif arr == "figures":
                        prov = ds_document.figures[index].prov[0]
                    else:
                        prov = None

                if prov and prov.page == page_no:
                    clusters_to_draw.append(
                        Cluster(
                            id=ix,
                            label=elem.name,
                            bbox=BoundingBox.from_tuple(
                                coord=prov.bbox,
                                origin=CoordOrigin.BOTTOMLEFT,
                            ).to_top_left_origin(document.pages[page_no].size.height),
                        )
                    )

            draw = ImageDraw.Draw(image)
            for c in clusters_to_draw:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
                draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))

                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                for tc in c.cells:  # [:1]:
                    x0, y0, x1, y1 = tc.bbox.as_tuple()
                    draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()

        # draw_clusters_and_cells(ds_doc, 0)
        # draw_clusters_and_cells(exported_doc, 0)

        return exported_doc
|