PyPI - docling - Versions diffs - 1.6.2__py3-none-any.whl - Mend

docling 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

docling/__init__.py +0 -0
docling/backend/__init__.py +0 -0
docling/backend/abstract_backend.py +59 -0
docling/backend/docling_parse_backend.py +207 -0
docling/backend/pypdfium2_backend.py +233 -0
docling/datamodel/__init__.py +0 -0
docling/datamodel/base_models.py +312 -0
docling/datamodel/document.py +363 -0
docling/datamodel/settings.py +32 -0
docling/document_converter.py +276 -0
docling/models/__init__.py +0 -0
docling/models/base_ocr_model.py +124 -0
docling/models/ds_glm_model.py +82 -0
docling/models/easyocr_model.py +70 -0
docling/models/layout_model.py +328 -0
docling/models/page_assemble_model.py +148 -0
docling/models/table_structure_model.py +144 -0
docling/pipeline/__init__.py +0 -0
docling/pipeline/base_model_pipeline.py +17 -0
docling/pipeline/standard_model_pipeline.py +38 -0
docling/utils/__init__.py +0 -0
docling/utils/layout_utils.py +806 -0
docling/utils/utils.py +41 -0
docling-1.6.2.dist-info/LICENSE +21 -0
docling-1.6.2.dist-info/METADATA +192 -0
docling-1.6.2.dist-info/RECORD +27 -0
docling-1.6.2.dist-info/WHEEL +4 -0

docling/document_converter.py ADDED Viewed

@@ -0,0 +1,276 @@
+import functools
+import logging
+import tempfile
+import time
+import traceback
+from pathlib import Path
+from typing import Iterable, Optional, Type, Union
+import requests
+from docling_core.types import Document
+from PIL import ImageDraw
+from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
+from docling.backend.abstract_backend import PdfDocumentBackend
+from docling.datamodel.base_models import (
+    AssembledUnit,
+    AssembleOptions,
+    ConversionStatus,
+    Page,
+    PipelineOptions,
+)
+from docling.datamodel.document import (
+    ConvertedDocument,
+    DocumentConversionInput,
+    InputDocument,
+)
+from docling.datamodel.settings import settings
+from docling.models.ds_glm_model import GlmModel
+from docling.models.page_assemble_model import PageAssembleModel
+from docling.pipeline.base_model_pipeline import BaseModelPipeline
+from docling.pipeline.standard_model_pipeline import StandardModelPipeline
+from docling.utils.utils import chunkify, create_hash
+_log = logging.getLogger(__name__)
+class DocumentConverter:
+    _default_download_filename = "file.pdf"
+    def __init__(
+        self,
+        artifacts_path: Optional[Union[Path, str]] = None,
+        pipeline_options: PipelineOptions = PipelineOptions(),
+        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
+        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
+        assemble_options: AssembleOptions = AssembleOptions(),
+    ):
+        if not artifacts_path:
+            artifacts_path = self.download_models_hf()
+        artifacts_path = Path(artifacts_path)
+        self.model_pipeline = pipeline_cls(
+            artifacts_path=artifacts_path, pipeline_options=pipeline_options
+        )
+        self.page_assemble_model = PageAssembleModel(config={})
+        self.glm_model = GlmModel(config={})
+        self.pdf_backend = pdf_backend
+        self.assemble_options = assemble_options
+    @staticmethod
+    def download_models_hf(
+        local_dir: Optional[Path] = None, force: bool = False
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        download_path = snapshot_download(
+            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
+        )
+        return Path(download_path)
+    def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
+        for input_batch in chunkify(
+            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
+        ):
+            _log.info(f"Going to convert document batch...")
+            # parallel processing only within input_batch
+            # with ThreadPoolExecutor(
+            #    max_workers=settings.perf.doc_batch_concurrency
+            # ) as pool:
+            #   yield from pool.map(self.process_document, input_batch)
+            # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
+            yield from map(self.process_document, input_batch)
+    def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
+        """Convert a single document.
+        Args:
+            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
+        Raises:
+            ValueError: If source is of unexpected type.
+            RuntimeError: If conversion fails.
+        Returns:
+            Document: The converted document object.
+        """
+        with tempfile.TemporaryDirectory() as temp_dir:
+            try:
+                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
+                res = requests.get(http_url, stream=True)
+                res.raise_for_status()
+                fname = None
+                # try to get filename from response header
+                if cont_disp := res.headers.get("Content-Disposition"):
+                    for par in cont_disp.strip().split(";"):
+                        # currently only handling directive "filename" (not "*filename")
+                        if (split := par.split("=")) and split[0].strip() == "filename":
+                            fname = "=".join(split[1:]).strip().strip("'\"") or None
+                            break
+                # otherwise, use name from URL:
+                if fname is None:
+                    fname = Path(http_url.path).name or self._default_download_filename
+                local_path = Path(temp_dir) / fname
+                with open(local_path, "wb") as f:
+                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
+                        f.write(chunk)
+            except ValidationError:
+                try:
+                    local_path = TypeAdapter(Path).validate_python(source)
+                except ValidationError:
+                    raise ValueError(
+                        f"Unexpected file path type encountered: {type(source)}"
+                    )
+            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
+            converted_docs_iter = self.convert(conv_inp)
+            converted_doc: ConvertedDocument = next(converted_docs_iter)
+        if converted_doc.status not in {
+            ConversionStatus.SUCCESS,
+            ConversionStatus.SUCCESS_WITH_ERRORS,
+        }:
+            raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
+        doc = converted_doc.to_ds_document()
+        return doc
+    def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
+        start_doc_time = time.time()
+        converted_doc = ConvertedDocument(input=in_doc)
+        if not in_doc.valid:
+            converted_doc.status = ConversionStatus.FAILURE
+            return converted_doc
+        for i in range(0, in_doc.page_count):
+            converted_doc.pages.append(Page(page_no=i))
+        all_assembled_pages = []
+        try:
+            # Iterate batches of pages (page_batch_size) in the doc
+            for page_batch in chunkify(
+                converted_doc.pages, settings.perf.page_batch_size
+            ):
+                start_pb_time = time.time()
+                # Pipeline
+                # 1. Initialise the page resources
+                init_pages = map(
+                    functools.partial(self.initialize_page, in_doc), page_batch
+                )
+                # 2. Populate page image
+                pages_with_images = map(
+                    functools.partial(self.populate_page_images, in_doc), init_pages
+                )
+                # 3. Populate programmatic page cells
+                pages_with_cells = map(
+                    functools.partial(self.parse_page_cells, in_doc),
+                    pages_with_images,
+                )
+                # 4. Run pipeline stages
+                pipeline_pages = self.model_pipeline.apply(pages_with_cells)
+                # 5. Assemble page elements (per page)
+                assembled_pages = self.page_assemble_model(pipeline_pages)
+                # exhaust assembled_pages
+                for assembled_page in assembled_pages:
+                    # Free up mem resources before moving on with next batch
+                    # Remove page images (can be disabled)
+                    if self.assemble_options.images_scale is None:
+                        assembled_page._image_cache = {}
+                    # Unload backend
+                    assembled_page._backend.unload()
+                    all_assembled_pages.append(assembled_page)
+                end_pb_time = time.time() - start_pb_time
+                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
+            # Free up mem resources of PDF backend
+            in_doc._backend.unload()
+            converted_doc.pages = all_assembled_pages
+            self.assemble_doc(converted_doc)
+            converted_doc.status = ConversionStatus.SUCCESS
+        except Exception as e:
+            converted_doc.status = ConversionStatus.FAILURE
+            trace = "\n".join(traceback.format_exception(e))
+            _log.info(f"Encountered an error during conversion: {trace}")
+        end_doc_time = time.time() - start_doc_time
+        _log.info(
+            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
+        )
+        return converted_doc
+    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
+    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+        page._backend = doc._backend.load_page(page.page_no)
+        page.size = page._backend.get_size()
+        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
+        return page
+    # Generate the page image and store it in the page object
+    def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
+        # default scale
+        page.get_image(scale=1.0)
+        # user requested scales
+        if self.assemble_options.images_scale is not None:
+            page._default_image_scale = self.assemble_options.images_scale
+            page.get_image(
+                scale=self.assemble_options.images_scale
+            )  # this will trigger storing the image in the internal cache
+        return page
+    # Extract and populate the page cells and store it in the page object
+    def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
+        page.cells = page._backend.get_text_cells()
+        # DEBUG code:
+        def draw_text_boxes(image, cells):
+            draw = ImageDraw.Draw(image)
+            for c in cells:
+                x0, y0, x1, y1 = c.bbox.as_tuple()
+                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
+            image.show()
+        # draw_text_boxes(page.get_image(scale=1.0), cells)
+        return page
+    def assemble_doc(self, converted_doc: ConvertedDocument):
+        all_elements = []
+        all_headers = []
+        all_body = []
+        for p in converted_doc.pages:
+            for el in p.assembled.body:
+                all_body.append(el)
+            for el in p.assembled.headers:
+                all_headers.append(el)
+            for el in p.assembled.elements:
+                all_elements.append(el)
+        converted_doc.assembled = AssembledUnit(
+            elements=all_elements, headers=all_headers, body=all_body
+        )
+        converted_doc.output = self.glm_model(converted_doc)

docling/models/__init__.py ADDED Viewed

File without changes

docling/models/base_ocr_model.py ADDED Viewed

@@ -0,0 +1,124 @@
+import copy
+import logging
+from abc import abstractmethod
+from typing import Iterable, List, Tuple
+import numpy
+import numpy as np
+from PIL import Image, ImageDraw
+from rtree import index
+from scipy.ndimage import find_objects, label
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+_log = logging.getLogger(__name__)
+class BaseOcrModel:
+    def __init__(self, config):
+        self.config = config
+        self.enabled = config["enabled"]
+    # Computes the optimum amount and coordinates of rectangles to OCR on a given page
+    def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
+        BITMAP_COVERAGE_TRESHOLD = 0.75
+        def find_ocr_rects(size, bitmap_rects):
+            image = Image.new(
+                "1", (round(size.width), round(size.height))
+            )  # '1' mode is binary
+            # Draw all bitmap rects into a binary image
+            draw = ImageDraw.Draw(image)
+            for rect in bitmap_rects:
+                x0, y0, x1, y1 = rect.as_tuple()
+                x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
+                draw.rectangle([(x0, y0), (x1, y1)], fill=1)
+            np_image = np.array(image)
+            # Find the connected components
+            labeled_image, num_features = label(
+                np_image > 0
+            )  # Label black (0 value) regions
+            # Find enclosing bounding boxes for each connected component.
+            slices = find_objects(labeled_image)
+            bounding_boxes = [
+                BoundingBox(
+                    l=slc[1].start,
+                    t=slc[0].start,
+                    r=slc[1].stop - 1,
+                    b=slc[0].stop - 1,
+                    coord_origin=CoordOrigin.TOPLEFT,
+                )
+                for slc in slices
+            ]
+            # Compute area fraction on page covered by bitmaps
+            area_frac = np.sum(np_image > 0) / (size.width * size.height)
+            return (area_frac, bounding_boxes)  # fraction covered  # boxes
+        bitmap_rects = page._backend.get_bitmap_rects()
+        coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
+        # return full-page rectangle if sufficiently covered with bitmaps
+        if coverage > BITMAP_COVERAGE_TRESHOLD:
+            return [
+                BoundingBox(
+                    l=0,
+                    t=0,
+                    r=page.size.width,
+                    b=page.size.height,
+                    coord_origin=CoordOrigin.TOPLEFT,
+                )
+            ]
+        # return individual rectangles if the bitmap coverage is smaller
+        elif coverage < BITMAP_COVERAGE_TRESHOLD:
+            return ocr_rects
+    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
+    def filter_ocr_cells(self, ocr_cells, programmatic_cells):
+        # Create R-tree index for programmatic cells
+        p = index.Property()
+        p.dimension = 2
+        idx = index.Index(properties=p)
+        for i, cell in enumerate(programmatic_cells):
+            idx.insert(i, cell.bbox.as_tuple())
+        def is_overlapping_with_existing_cells(ocr_cell):
+            # Query the R-tree to get overlapping rectangles
+            possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
+            return (
+                len(possible_matches_index) > 0
+            )  # this is a weak criterion but it works.
+        filtered_ocr_cells = [
+            rect for rect in ocr_cells if not is_overlapping_with_existing_cells(rect)
+        ]
+        return filtered_ocr_cells
+    def draw_ocr_rects_and_cells(self, page, ocr_rects):
+        image = copy.deepcopy(page.image)
+        draw = ImageDraw.Draw(image, "RGBA")
+        # Draw OCR rectangles as yellow filled rect
+        for rect in ocr_rects:
+            x0, y0, x1, y1 = rect.as_tuple()
+            shade_color = (255, 255, 0, 40)  # transparent yellow
+            draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
+        # Draw OCR and programmatic cells
+        for tc in page.cells:
+            x0, y0, x1, y1 = tc.bbox.as_tuple()
+            color = "red"
+            if isinstance(tc, OcrCell):
+                color = "magenta"
+            draw.rectangle([(x0, y0), (x1, y1)], outline=color)
+        image.show()
+    @abstractmethod
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        pass

docling/models/ds_glm_model.py ADDED Viewed

@@ -0,0 +1,82 @@
+import copy
+import random
+from deepsearch_glm.nlp_utils import init_nlp_model
+from deepsearch_glm.utils.ds_utils import to_legacy_document_format
+from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
+from docling_core.types import BaseText
+from docling_core.types import Document as DsDocument
+from docling_core.types import Ref
+from PIL import ImageDraw
+from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
+from docling.datamodel.document import ConvertedDocument
+class GlmModel:
+    def __init__(self, config):
+        self.config = config
+        load_pretrained_nlp_models()
+        model = init_nlp_model(model_names="language;term;reference")
+        self.model = model
+    def __call__(self, document: ConvertedDocument) -> DsDocument:
+        ds_doc = document.to_ds_document()
+        ds_doc_dict = ds_doc.model_dump(by_alias=True)
+        glm_doc = self.model.apply_on_doc(ds_doc_dict)
+        ds_doc_dict = to_legacy_document_format(
+            glm_doc, ds_doc_dict, update_name_label=True
+        )
+        exported_doc = DsDocument.model_validate(ds_doc_dict)
+        # DEBUG code:
+        def draw_clusters_and_cells(ds_document, page_no):
+            clusters_to_draw = []
+            image = copy.deepcopy(document.pages[page_no].image)
+            for ix, elem in enumerate(ds_document.main_text):
+                if isinstance(elem, BaseText):
+                    prov = elem.prov[0]
+                elif isinstance(elem, Ref):
+                    _, arr, index = elem.ref.split("/")
+                    index = int(index)
+                    if arr == "tables":
+                        prov = ds_document.tables[index].prov[0]
+                    elif arr == "figures":
+                        prov = ds_document.figures[index].prov[0]
+                    else:
+                        prov = None
+                if prov and prov.page == page_no:
+                    clusters_to_draw.append(
+                        Cluster(
+                            id=ix,
+                            label=elem.name,
+                            bbox=BoundingBox.from_tuple(
+                                coord=prov.bbox,
+                                origin=CoordOrigin.BOTTOMLEFT,
+                            ).to_top_left_origin(document.pages[page_no].size.height),
+                        )
+                    )
+            draw = ImageDraw.Draw(image)
+            for c in clusters_to_draw:
+                x0, y0, x1, y1 = c.bbox.as_tuple()
+                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
+                draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
+                cell_color = (
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                )
+                for tc in c.cells:  # [:1]:
+                    x0, y0, x1, y1 = tc.bbox.as_tuple()
+                    draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+            image.show()
+        # draw_clusters_and_cells(ds_doc, 0)
+        # draw_clusters_and_cells(exported_doc, 0)
+        return exported_doc

docling/models/easyocr_model.py ADDED Viewed

@@ -0,0 +1,70 @@
+import logging
+from typing import Iterable
+import numpy
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.models.base_ocr_model import BaseOcrModel
+_log = logging.getLogger(__name__)
+class EasyOcrModel(BaseOcrModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        if self.enabled:
+            import easyocr
+            self.reader = easyocr.Reader(config["lang"])
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        if not self.enabled:
+            yield from page_batch
+            return
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+                im = numpy.array(high_res_image)
+                result = self.reader.readtext(im)
+                del high_res_image
+                del im
+                cells = [
+                    OcrCell(
+                        id=ix,
+                        text=line[1],
+                        confidence=line[2],
+                        bbox=BoundingBox.from_tuple(
+                            coord=(
+                                (line[0][0][0] / self.scale) + ocr_rect.l,
+                                (line[0][0][1] / self.scale) + ocr_rect.t,
+                                (line[0][2][0] / self.scale) + ocr_rect.l,
+                                (line[0][2][1] / self.scale) + ocr_rect.t,
+                            ),
+                            origin=CoordOrigin.TOPLEFT,
+                        ),
+                    )
+                    for ix, line in enumerate(result)
+                ]
+                all_ocr_cells.extend(cells)
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+            page.cells.extend(filtered_ocr_cells)
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+            yield page