PyPI - docling - Versions diffs - 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl - Mend

docling 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

docling/backend/abstract_backend.py +4 -0
docling/backend/docling_parse_backend.py +38 -4
docling/backend/pypdfium2_backend.py +18 -2
docling/datamodel/base_models.py +56 -17
docling/datamodel/document.py +16 -1
docling/document_converter.py +12 -8
docling/models/base_ocr_model.py +124 -0
docling/models/easyocr_model.py +39 -46
docling/models/layout_model.py +3 -1
docling/models/table_structure_model.py +4 -9
docling/pipeline/base_model_pipeline.py +0 -1
docling/pipeline/standard_model_pipeline.py +1 -3
{docling-1.4.0.dist-info → docling-1.6.0.dist-info}/METADATA +13 -10
docling-1.6.0.dist-info/RECORD +27 -0
docling-1.4.0.dist-info/RECORD +0 -26
{docling-1.4.0.dist-info → docling-1.6.0.dist-info}/LICENSE +0 -0
{docling-1.4.0.dist-info → docling-1.6.0.dist-info}/WHEEL +0 -0

docling/backend/abstract_backend.py CHANGED Viewed

@@ -18,6 +18,10 @@ class PdfPageBackend(ABC):
     def get_text_cells(self) -> Iterable["Cell"]:
         pass
+    @abstractmethod
+    def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
+        pass
     @abstractmethod
     def get_page_image(
         self, scale: int = 1, cropbox: Optional["BoundingBox"] = None

docling/backend/docling_parse_backend.py CHANGED Viewed

@@ -1,7 +1,9 @@
+import logging
 import random
+import time
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import Iterable, Optional, Union
 import pypdfium2 as pdfium
 from docling_parse.docling_parse import pdf_parser
@@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
 from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+_log = logging.getLogger(__name__)
 class DoclingParsePageBackend(PdfPageBackend):
     def __init__(self, page_obj: PdfPage, docling_page_obj):
@@ -39,7 +43,7 @@ class DoclingParsePageBackend(PdfPageBackend):
                 r=x1 * scale * page_size.width / parser_width,
                 t=y1 * scale * page_size.height / parser_height,
                 coord_origin=CoordOrigin.BOTTOMLEFT,
-            ).to_top_left_origin(page_size.height * scale)
+            ).to_top_left_origin(page_height=page_size.height * scale)
             overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
@@ -62,6 +66,12 @@ class DoclingParsePageBackend(PdfPageBackend):
         for i in range(len(self._dpage["cells"])):
             rect = self._dpage["cells"][i]["box"]["device"]
             x0, y0, x1, y1 = rect
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
             text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
             cells.append(
                 Cell(
@@ -80,7 +90,9 @@ class DoclingParsePageBackend(PdfPageBackend):
             cell_counter += 1
         def draw_clusters_and_cells():
-            image = self.get_page_image()
+            image = (
+                self.get_page_image()
+            )  # make new image to avoid drawing on the saved ones
             draw = ImageDraw.Draw(image)
             for c in cells:
                 x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -102,6 +114,20 @@ class DoclingParsePageBackend(PdfPageBackend):
         return cells
+    def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 32 * 32
+        for i in range(len(self._dpage["images"])):
+            bitmap = self._dpage["images"][i]
+            cropbox = BoundingBox.from_tuple(
+                bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
+            ).to_top_left_origin(self.get_size().height)
+            if cropbox.area() > AREA_THRESHOLD:
+                cropbox = cropbox.scaled(scale=scale)
+                yield cropbox
     def get_page_image(
         self, scale: int = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
@@ -151,15 +177,23 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
         self._pdoc = pdfium.PdfDocument(path_or_stream)
         # Parsing cells with docling_parser call
         parser = pdf_parser()
+        start_pb_time = time.time()
         if isinstance(path_or_stream, BytesIO):
             self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
         else:
             self._parser_doc = parser.find_cells(str(path_or_stream))
+        end_pb_time = time.time() - start_pb_time
+        _log.info(
+            f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
+        )
     def page_count(self) -> int:
         return len(self._parser_doc["pages"])
-    def load_page(self, page_no: int) -> PdfPage:
+    def load_page(self, page_no: int) -> DoclingParsePageBackend:
         return DoclingParsePageBackend(
             self._pdoc[page_no], self._parser_doc["pages"][page_no]
         )

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -4,6 +4,7 @@ from pathlib import Path
 from typing import Iterable, List, Optional, Union
 import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
@@ -17,6 +18,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
         self._ppage = page_obj
         self.text_page = None
+    def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 32 * 32
+        for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
+            pos = obj.get_pos()
+            cropbox = BoundingBox.from_tuple(
+                pos, origin=CoordOrigin.BOTTOMLEFT
+            ).to_top_left_origin(page_height=self.get_size().height)
+            if cropbox.area() > AREA_THRESHOLD:
+                cropbox = cropbox.scaled(scale=scale)
+                yield cropbox
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         if not self.text_page:
             self.text_page = self._ppage.get_textpage()
@@ -134,7 +148,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
             return merged_cells
         def draw_clusters_and_cells():
-            image = self.get_page_image()
+            image = (
+                self.get_page_image()
+            )  # make new image to avoid drawing on the saved ones
             draw = ImageDraw.Draw(image)
             for c in cells:
                 x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -206,7 +222,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
     def page_count(self) -> int:
         return len(self._pdoc)
-    def load_page(self, page_no: int) -> PdfPage:
+    def load_page(self, page_no: int) -> PyPdfiumPageBackend:
         return PyPdfiumPageBackend(self._pdoc[page_no])
     def is_valid(self) -> bool:

docling/datamodel/base_models.py CHANGED Viewed

@@ -1,10 +1,12 @@
 import copy
+import warnings
 from enum import Enum, auto
 from io import BytesIO
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict, model_validator
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from typing_extensions import Self
 from docling.backend.abstract_backend import PdfPageBackend
@@ -66,13 +68,21 @@ class BoundingBox(BaseModel):
     @classmethod
     def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
         if origin == CoordOrigin.TOPLEFT:
-            return BoundingBox(
-                l=coord[0], t=coord[1], r=coord[2], b=coord[3], coord_origin=origin
-            )
+            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b < t:
+                b, t = t, b
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
         elif origin == CoordOrigin.BOTTOMLEFT:
-            return BoundingBox(
-                l=coord[0], b=coord[1], r=coord[2], t=coord[3], coord_origin=origin
-            )
+            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b > t:
+                b, t = t, b
+            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
     def area(self) -> float:
         return (self.r - self.l) * (self.b - self.t)
@@ -234,14 +244,30 @@ class Page(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
     page_no: int
-    page_hash: str = None
-    size: PageSize = None
-    image: Image = None
+    page_hash: Optional[str] = None
+    size: Optional[PageSize] = None
     cells: List[Cell] = None
     predictions: PagePredictions = PagePredictions()
-    assembled: AssembledUnit = None
+    assembled: Optional[AssembledUnit] = None
-    _backend: PdfPageBackend = None  # Internal PDF backend
+    _backend: Optional[PdfPageBackend] = (
+        None  # Internal PDF backend. By default it is cleared during assembling.
+    )
+    _default_image_scale: float = 1.0  # Default image scale for external usage.
+    _image_cache: Dict[float, Image] = (
+        {}
+    )  # Cache of images in different scales. By default it is cleared during assembling.
+    def get_image(self, scale: float = 1.0) -> Optional[Image]:
+        if self._backend is None:
+            return self._image_cache.get(scale, None)
+        if not scale in self._image_cache:
+            self._image_cache[scale] = self._backend.get_page_image(scale=scale)
+        return self._image_cache[scale]
+    @property
+    def image(self) -> Optional[Image]:
+        return self.get_image(scale=self._default_image_scale)
 class DocumentStream(BaseModel):
@@ -262,12 +288,25 @@ class TableStructureOptions(BaseModel):
 class PipelineOptions(BaseModel):
     do_table_structure: bool = True  # True: perform table structure extraction
-    do_ocr: bool = False  # True: perform OCR, replace programmatic PDF text
+    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
     table_structure_options: TableStructureOptions = TableStructureOptions()
 class AssembleOptions(BaseModel):
-    keep_page_images: bool = (
-        False  # False: page images are removed in the assemble step
-    )
+    keep_page_images: Annotated[
+        bool,
+        Field(
+            deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
+        ),
+    ] = False  # False: page images are removed in the assemble step
+    images_scale: Optional[float] = None  # if set, the scale for generated images
+    @model_validator(mode="after")
+    def set_page_images_from_deprecated(self) -> Self:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", DeprecationWarning)
+            default_scale = 1.0
+            if self.keep_page_images and self.images_scale is None:
+                self.images_scale = default_scale
+        return self

docling/datamodel/document.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
 from docling_core.types import BaseCell, BaseText
 from docling_core.types import BoundingBox as DsBoundingBox
@@ -21,6 +21,7 @@ from docling.datamodel.base_models import (
     DocumentStream,
     FigureElement,
     Page,
+    PageElement,
     TableElement,
     TextElement,
 )
@@ -302,6 +303,20 @@ class ConvertedDocument(BaseModel):
         else:
             return ""
+    def render_element_images(
+        self, element_types: Tuple[PageElement] = (FigureElement,)
+    ):
+        for element in self.assembled.elements:
+            if isinstance(element, element_types):
+                page_ix = element.page_no
+                scale = self.pages[page_ix]._default_image_scale
+                crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
+                    page_height=self.pages[page_ix].size.height * scale
+                )
+                cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
+                yield element, cropped_im
 class DocumentConversionInput(BaseModel):

docling/document_converter.py CHANGED Viewed

@@ -35,8 +35,6 @@ _log = logging.getLogger(__name__)
 class DocumentConverter:
-    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
-    _table_model_path = "model_artifacts/tableformer"
     _default_download_filename = "file.pdf"
     def __init__(
@@ -188,10 +186,8 @@ class DocumentConverter:
                     # Free up mem resources before moving on with next batch
                     # Remove page images (can be disabled)
-                    if not self.assemble_options.keep_page_images:
-                        assembled_page.image = (
-                            None  # Comment this if you want to visualize page images
-                        )
+                    if self.assemble_options.images_scale is None:
+                        assembled_page._image_cache = {}
                     # Unload backend
                     assembled_page._backend.unload()
@@ -231,7 +227,15 @@ class DocumentConverter:
     # Generate the page image and store it in the page object
     def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        page.image = page._backend.get_page_image()
+        # default scale
+        page.get_image(scale=1.0)
+        # user requested scales
+        if self.assemble_options.images_scale is not None:
+            page._default_image_scale = self.assemble_options.images_scale
+            page.get_image(
+                scale=self.assemble_options.images_scale
+            )  # this will trigger storing the image in the internal cache
         return page
@@ -247,7 +251,7 @@ class DocumentConverter:
                 draw.rectangle([(x0, y0), (x1, y1)], outline="red")
             image.show()
-        # draw_text_boxes(page.image, cells)
+        # draw_text_boxes(page.get_image(scale=1.0), cells)
         return page

docling/models/base_ocr_model.py ADDED Viewed

@@ -0,0 +1,124 @@
+import copy
+import logging
+from abc import abstractmethod
+from typing import Iterable, List, Tuple
+import numpy
+import numpy as np
+from PIL import Image, ImageDraw
+from rtree import index
+from scipy.ndimage import find_objects, label
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+_log = logging.getLogger(__name__)
+class BaseOcrModel:
+    def __init__(self, config):
+        self.config = config
+        self.enabled = config["enabled"]
+    # Computes the optimum amount and coordinates of rectangles to OCR on a given page
+    def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
+        BITMAP_COVERAGE_TRESHOLD = 0.75
+        def find_ocr_rects(size, bitmap_rects):
+            image = Image.new(
+                "1", (round(size.width), round(size.height))
+            )  # '1' mode is binary
+            # Draw all bitmap rects into a binary image
+            draw = ImageDraw.Draw(image)
+            for rect in bitmap_rects:
+                x0, y0, x1, y1 = rect.as_tuple()
+                x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
+                draw.rectangle([(x0, y0), (x1, y1)], fill=1)
+            np_image = np.array(image)
+            # Find the connected components
+            labeled_image, num_features = label(
+                np_image > 0
+            )  # Label black (0 value) regions
+            # Find enclosing bounding boxes for each connected component.
+            slices = find_objects(labeled_image)
+            bounding_boxes = [
+                BoundingBox(
+                    l=slc[1].start,
+                    t=slc[0].start,
+                    r=slc[1].stop - 1,
+                    b=slc[0].stop - 1,
+                    coord_origin=CoordOrigin.TOPLEFT,
+                )
+                for slc in slices
+            ]
+            # Compute area fraction on page covered by bitmaps
+            area_frac = np.sum(np_image > 0) / (size.width * size.height)
+            return (area_frac, bounding_boxes)  # fraction covered  # boxes
+        bitmap_rects = page._backend.get_bitmap_rects()
+        coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
+        # return full-page rectangle if sufficiently covered with bitmaps
+        if coverage > BITMAP_COVERAGE_TRESHOLD:
+            return [
+                BoundingBox(
+                    l=0,
+                    t=0,
+                    r=page.size.width,
+                    b=page.size.height,
+                    coord_origin=CoordOrigin.TOPLEFT,
+                )
+            ]
+        # return individual rectangles if the bitmap coverage is smaller
+        elif coverage < BITMAP_COVERAGE_TRESHOLD:
+            return ocr_rects
+    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
+    def filter_ocr_cells(self, ocr_cells, programmatic_cells):
+        # Create R-tree index for programmatic cells
+        p = index.Property()
+        p.dimension = 2
+        idx = index.Index(properties=p)
+        for i, cell in enumerate(programmatic_cells):
+            idx.insert(i, cell.bbox.as_tuple())
+        def is_overlapping_with_existing_cells(ocr_cell):
+            # Query the R-tree to get overlapping rectangles
+            possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
+            return (
+                len(possible_matches_index) > 0
+            )  # this is a weak criterion but it works.
+        filtered_ocr_cells = [
+            rect for rect in ocr_cells if not is_overlapping_with_existing_cells(rect)
+        ]
+        return filtered_ocr_cells
+    def draw_ocr_rects_and_cells(self, page, ocr_rects):
+        image = copy.deepcopy(page.image)
+        draw = ImageDraw.Draw(image, "RGBA")
+        # Draw OCR rectangles as yellow filled rect
+        for rect in ocr_rects:
+            x0, y0, x1, y1 = rect.as_tuple()
+            shade_color = (255, 255, 0, 40)  # transparent yellow
+            draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
+        # Draw OCR and programmatic cells
+        for tc in page.cells:
+            x0, y0, x1, y1 = tc.bbox.as_tuple()
+            color = "red"
+            if isinstance(tc, OcrCell):
+                color = "magenta"
+            draw.rectangle([(x0, y0), (x1, y1)], outline=color)
+        image.show()
+    @abstractmethod
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        pass

docling/models/easyocr_model.py CHANGED Viewed

@@ -1,20 +1,18 @@
-import copy
 import logging
-import random
 from typing import Iterable
 import numpy
-from PIL import ImageDraw
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.models.base_ocr_model import BaseOcrModel
 _log = logging.getLogger(__name__)
-class EasyOcrModel:
+class EasyOcrModel(BaseOcrModel):
     def __init__(self, config):
-        self.config = config
-        self.enabled = config["enabled"]
+        super().__init__(config)
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
         if self.enabled:
@@ -29,49 +27,44 @@ class EasyOcrModel:
             return
         for page in page_batch:
-            # rects = page._fpage.
-            high_res_image = page._backend.get_page_image(scale=self.scale)
-            im = numpy.array(high_res_image)
-            result = self.reader.readtext(im)
-            del high_res_image
-            del im
-            cells = [
-                OcrCell(
-                    id=ix,
-                    text=line[1],
-                    confidence=line[2],
-                    bbox=BoundingBox.from_tuple(
-                        coord=(
-                            line[0][0][0] / self.scale,
-                            line[0][0][1] / self.scale,
-                            line[0][2][0] / self.scale,
-                            line[0][2][1] / self.scale,
-                        ),
-                        origin=CoordOrigin.TOPLEFT,
-                    ),
+            ocr_rects = self.get_ocr_rects(page)
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
                 )
-                for ix, line in enumerate(result)
-            ]
+                im = numpy.array(high_res_image)
+                result = self.reader.readtext(im)
+                del high_res_image
+                del im
+                cells = [
+                    OcrCell(
+                        id=ix,
+                        text=line[1],
+                        confidence=line[2],
+                        bbox=BoundingBox.from_tuple(
+                            coord=(
+                                (line[0][0][0] / self.scale) + ocr_rect.l,
+                                (line[0][0][1] / self.scale) + ocr_rect.t,
+                                (line[0][2][0] / self.scale) + ocr_rect.l,
+                                (line[0][2][1] / self.scale) + ocr_rect.t,
+                            ),
+                            origin=CoordOrigin.TOPLEFT,
+                        ),
+                    )
+                    for ix, line in enumerate(result)
+                ]
+                all_ocr_cells.extend(cells)
-            page.cells = cells  # For now, just overwrites all digital cells.
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
-            # DEBUG code:
-            def draw_clusters_and_cells():
-                image = copy.deepcopy(page.image)
-                draw = ImageDraw.Draw(image)
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                for tc in cells:
-                    x0, y0, x1, y1 = tc.bbox.as_tuple()
-                    draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-                image.show()
+            page.cells.extend(filtered_ocr_cells)
-            # draw_clusters_and_cells()
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
             yield page

docling/models/layout_model.py CHANGED Viewed

@@ -267,7 +267,9 @@ class LayoutModel:
     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
         for page in page_batch:
             clusters = []
-            for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
+            for ix, pred_item in enumerate(
+                self.layout_predictor.predict(page.get_image(scale=1.0))
+            ):
                 cluster = Cluster(
                     id=ix,
                     label=pred_item["label"],

docling/models/table_structure_model.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import copy
-import random
 from typing import Iterable, List
 import numpy
@@ -34,7 +33,9 @@ class TableStructureModel:
             self.scale = 2.0  # Scale up table input images to 144 dpi
     def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
-        image = page._backend.get_page_image()
+        image = (
+            page._backend.get_page_image()
+        )  # make new image to avoid drawing on the saved ones
         draw = ImageDraw.Draw(image)
         for table_element in tbl_list:
@@ -94,13 +95,7 @@ class TableStructureModel:
                 "width": page.size.width * self.scale,
                 "height": page.size.height * self.scale,
             }
-            # add image to page input.
-            if self.scale == 1.0:
-                page_input["image"] = numpy.asarray(page.image)
-            else:  # render new page image on the fly at desired scale
-                page_input["image"] = numpy.asarray(
-                    page._backend.get_page_image(scale=self.scale)
-                )
+            page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
             table_clusters, table_bboxes = zip(*in_tables)

docling/pipeline/base_model_pipeline.py CHANGED Viewed

@@ -1,4 +1,3 @@
-from abc import abstractmethod
 from pathlib import Path
 from typing import Iterable

docling/pipeline/standard_model_pipeline.py CHANGED Viewed

@@ -1,10 +1,8 @@
 from pathlib import Path
-from typing import Iterable
-from docling.datamodel.base_models import Page, PipelineOptions
+from docling.datamodel.base_models import PipelineOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
-from docling.models.page_assemble_model import PageAssembleModel
 from docling.models.table_structure_model import TableStructureModel
 from docling.pipeline.base_model_pipeline import BaseModelPipeline

{docling-1.4.0.dist-info → docling-1.6.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.4.0
+Version: 1.6.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -19,20 +19,20 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Provides-Extra: easyocr
 Provides-Extra: ocr
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.19.0,<1)
 Requires-Dist: docling-core (>=1.1.2,<2.0.0)
 Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
 Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
-Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
+Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "ocr"
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
 Requires-Dist: requests (>=2.32.3,<3.0.0)
+Requires-Dist: rtree (>=1.3.0,<2.0.0)
 Project-URL: Repository, https://github.com/DS4SD/docling
 Description-Content-Type: text/markdown
@@ -44,6 +44,7 @@ Description-Content-Type: text/markdown
 # Docling
+[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
 [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
 ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
 [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -172,13 +173,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
 If you use Docling in your projects, please consider citing the following:
 ```bib
-@software{Docling,
-author = {Deep Search Team},
-month = {7},
-title = {{Docling}},
-url = {https://github.com/DS4SD/docling},
-version = {main},
-year = {2024}
+@techreport{Docling,
+  author = {Deep Search Team},
+  month = {8},
+  title = {{Docling Technical Report}},
+  url={https://arxiv.org/abs/2408.09869},
+  eprint={2408.09869},
+  doi = "10.48550/arXiv.2408.09869",
+  version = {1.0.0},
+  year = {2024}
 }
 ```

docling-1.6.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,27 @@
+docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/backend/abstract_backend.py,sha256=ZfEHaBPGM1cmqrhaEoU3MHhnHU11NhOnhtFEIbVMYDo,1221
+docling/backend/docling_parse_backend.py,sha256=TN7Ln3Lkc8k0v6HzxA2iUGc8f2iqMw0I-3eryLQkpdw,6924
+docling/backend/pypdfium2_backend.py,sha256=xUiIYgd7i22YDx4-W2hfPUaQFszW0gcT6pavG5qZ8LE,8062
+docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/datamodel/base_models.py,sha256=5VHit5h7OleKnbhvy-sWDxQLizEdNrGUBrypyzwHyAE,8604
+docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
+docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
+docling/document_converter.py,sha256=UFSELvUSWsr8s0VByu4lNuzu7bn7zZauJTL3FTSLSBg,10371
+docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
+docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
+docling/models/easyocr_model.py,sha256=ABIqALvtNNrDQ47fXaZ0lDFhOwKsYGUUlAPnIsFZgZA,2232
+docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
+docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
+docling/models/table_structure_model.py,sha256=5jzTlpM-GdCSq4l0vD1W6aSPTJXeTcXEnNuPxnw-DlA,5437
+docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/pipeline/base_model_pipeline.py,sha256=AC5NTR0xLy5JIZqsTINkKEHeCPqpyvJpuE_bcnZhyvI,529
+docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
+docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
+docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
+docling-1.6.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
+docling-1.6.0.dist-info/METADATA,sha256=iMNzQ5wFtqHCTYat46cOq9JK0nhYKr1N6_PuEuah5D4,7227
+docling-1.6.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-1.6.0.dist-info/RECORD,,

docling-1.4.0.dist-info/RECORD DELETED Viewed

@@ -1,26 +0,0 @@
-docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
-docling/backend/docling_parse_backend.py,sha256=bgsmnwDmroBwuOwkEUzlN9KMEIFJ1xUaCZW6rsr5G-c,5924
-docling/backend/pypdfium2_backend.py,sha256=tv6JxyTkTdT2qr2ghsQgYA2zgpCDxKYSdHVBTAR7FSk,7411
-docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=irZLAHdsROOOwRbywKIA0mk3H8GrLwtGjOgTV6G0QoU,7004
-docling/datamodel/document.py,sha256=lZHXINmPWvpzrV3PTilgJs1blqTMCnJdLEww_qfcqdE,12533
-docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
-docling/document_converter.py,sha256=dMucsq6M_nwPsC1ChogVwJgNDv8sJuFklQWWinDZaug,10246
-docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
-docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
-docling/models/layout_model.py,sha256=3mOgNvCYPh99_oLxJy-ZaIqGOFgG5bcIQ0tTubW656Q,11204
-docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
-docling/models/table_structure_model.py,sha256=xUmfunZNYC30P0fRdESdztqy1FVlMzlhJjLBp-xcn4A,5638
-docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/pipeline/base_model_pipeline.py,sha256=ozHdJak0yQAxQf7pQN_C480vI35A2e5KL5Qq1xSkq5c,560
-docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFvpdUvyVw-gNmBlM,1541
-docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
-docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-1.4.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
-docling-1.4.0.dist-info/METADATA,sha256=Hu8pvrxpc0b1qzQvvzI_ijRAQWjOfcfNl4_1Zb7oyoc,7042
-docling-1.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-1.4.0.dist-info/RECORD,,

{docling-1.4.0.dist-info → docling-1.6.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-1.4.0.dist-info → docling-1.6.0.dist-info}/WHEEL RENAMED Viewed

File without changes

docling 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

docling 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl