PyPI - docling - Versions diffs - 1.1.2__tar.gz → 1.5.0__tar.gz - Mend

docling 1.1.2 (tar.gz) → 1.5.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

{docling-1.1.2 → docling-1.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.1.2
+Version: 1.5.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -21,9 +21,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: easyocr
 Provides-Extra: ocr
+Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.19.0,<1)
 Requires-Dist: docling-core (>=1.1.2,<2.0.0)
-Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
+Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
+Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
 Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -42,6 +44,7 @@ Description-Content-Type: text/markdown
 # Docling
+[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
 [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
 ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
 [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -92,17 +95,21 @@ print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotate
 ### Convert a batch of documents
-For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
 From a local repo clone, you can run it with:
 ```
-python examples/convert.py
+python examples/batch_convert.py
 ```
 The output of the above command will be written to `./scratch`.
 ### Adjust pipeline features
+The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
+one can adjust the conversion pipeline and features.
 #### Control pipeline options
 You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
@@ -166,13 +173,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
 If you use Docling in your projects, please consider citing the following:
 ```bib
-@software{Docling,
-author = {Deep Search Team},
-month = {7},
-title = {{Docling}},
-url = {https://github.com/DS4SD/docling},
-version = {main},
-year = {2024}
+@techreport{Docling,
+  author = {Deep Search Team},
+  month = {8},
+  title = {{Docling Technical Report}},
+  url={https://arxiv.org/abs/2408.09869},
+  eprint={2408.09869},
+  doi = "10.48550/arXiv.2408.09869",
+  version = {1.0.0},
+  year = {2024}
 }
 ```

{docling-1.1.2 → docling-1.5.0}/README.md RENAMED Viewed

@@ -6,6 +6,7 @@
 # Docling
+[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
 [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
 ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
 [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -56,17 +57,21 @@ print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotate
 ### Convert a batch of documents
-For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
 From a local repo clone, you can run it with:
 ```
-python examples/convert.py
+python examples/batch_convert.py
 ```
 The output of the above command will be written to `./scratch`.
 ### Adjust pipeline features
+The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
+one can adjust the conversion pipeline and features.
 #### Control pipeline options
 You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
@@ -130,13 +135,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
 If you use Docling in your projects, please consider citing the following:
 ```bib
-@software{Docling,
-author = {Deep Search Team},
-month = {7},
-title = {{Docling}},
-url = {https://github.com/DS4SD/docling},
-version = {main},
-year = {2024}
+@techreport{Docling,
+  author = {Deep Search Team},
+  month = {8},
+  title = {{Docling Technical Report}},
+  url={https://arxiv.org/abs/2408.09869},
+  eprint={2408.09869},
+  doi = "10.48550/arXiv.2408.09869",
+  version = {1.0.0},
+  year = {2024}
 }
 ```

{docling-1.1.2 → docling-1.5.0}/docling/backend/abstract_backend.py RENAMED Viewed

@@ -35,7 +35,7 @@ class PdfPageBackend(ABC):
 class PdfDocumentBackend(ABC):
     @abstractmethod
-    def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
+    def __init__(self, path_or_stream: Union[BytesIO, Path]):
         pass
     @abstractmethod

docling-1.5.0/docling/backend/docling_parse_backend.py ADDED Viewed

@@ -0,0 +1,187 @@
+import logging
+import random
+import time
+from io import BytesIO
+from pathlib import Path
+from typing import Iterable, List, Optional, Union
+import pypdfium2 as pdfium
+from docling_parse.docling_parse import pdf_parser
+from PIL import Image, ImageDraw
+from pypdfium2 import PdfPage
+from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+_log = logging.getLogger(__name__)
+class DoclingParsePageBackend(PdfPageBackend):
+    def __init__(self, page_obj: PdfPage, docling_page_obj):
+        super().__init__(page_obj)
+        self._ppage = page_obj
+        self._dpage = docling_page_obj
+        self.text_page = None
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        # Find intersecting cells on the page
+        text_piece = ""
+        page_size = self.get_size()
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+        for i in range(len(self._dpage["cells"])):
+            rect = self._dpage["cells"][i]["box"]["device"]
+            x0, y0, x1, y1 = rect
+            cell_bbox = BoundingBox(
+                l=x0 * scale * page_size.width / parser_width,
+                b=y0 * scale * page_size.height / parser_height,
+                r=x1 * scale * page_size.width / parser_width,
+                t=y1 * scale * page_size.height / parser_height,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            ).to_top_left_origin(page_size.height * scale)
+            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
+        return text_piece
+    def get_text_cells(self) -> Iterable[Cell]:
+        cells = []
+        cell_counter = 0
+        page_size = self.get_size()
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
+        for i in range(len(self._dpage["cells"])):
+            rect = self._dpage["cells"][i]["box"]["device"]
+            x0, y0, x1, y1 = rect
+            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
+            cells.append(
+                Cell(
+                    id=cell_counter,
+                    text=text_piece,
+                    bbox=BoundingBox(
+                        # l=x0, b=y0, r=x1, t=y1,
+                        l=x0 * page_size.width / parser_width,
+                        b=y0 * page_size.height / parser_height,
+                        r=x1 * page_size.width / parser_width,
+                        t=y1 * page_size.height / parser_height,
+                        coord_origin=CoordOrigin.BOTTOMLEFT,
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+            cell_counter += 1
+        def draw_clusters_and_cells():
+            image = (
+                self.get_page_image()
+            )  # make new image to avoid drawing on the saved ones
+            draw = ImageDraw.Draw(image)
+            for c in cells:
+                x0, y0, x1, y1 = c.bbox.as_tuple()
+                cell_color = (
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                )
+                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+            image.show()
+        # before merge:
+        # draw_clusters_and_cells()
+        # cells = merge_horizontal_cells(cells)
+        # after merge:
+        # draw_clusters_and_cells()
+        return cells
+    def get_page_image(
+        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        page_size = self.get_size()
+        if not cropbox:
+            cropbox = BoundingBox(
+                l=0,
+                r=page_size.width,
+                t=0,
+                b=page_size.height,
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
+            padbox = BoundingBox(
+                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
+            )
+        else:
+            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox.r = page_size.width - padbox.r
+            padbox.t = page_size.height - padbox.t
+        image = (
+            self._ppage.render(
+                scale=scale * 1.5,
+                rotation=0,  # no additional rotation
+                crop=padbox.as_tuple(),
+            )
+            .to_pil()
+            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
+        )  # We resize the image from 1.5x the given scale to make it sharper.
+        return image
+    def get_size(self) -> PageSize:
+        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
+    def unload(self):
+        self._ppage = None
+        self._dpage = None
+        self.text_page = None
+class DoclingParseDocumentBackend(PdfDocumentBackend):
+    def __init__(self, path_or_stream: Union[BytesIO, Path]):
+        super().__init__(path_or_stream)
+        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        # Parsing cells with docling_parser call
+        parser = pdf_parser()
+        start_pb_time = time.time()
+        if isinstance(path_or_stream, BytesIO):
+            self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
+        else:
+            self._parser_doc = parser.find_cells(str(path_or_stream))
+        end_pb_time = time.time() - start_pb_time
+        _log.info(
+            f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
+        )
+    def page_count(self) -> int:
+        return len(self._parser_doc["pages"])
+    def load_page(self, page_no: int) -> PdfPage:
+        return DoclingParsePageBackend(
+            self._pdoc[page_no], self._parser_doc["pages"][page_no]
+        )
+    def is_valid(self) -> bool:
+        return self.page_count() > 0
+    def unload(self):
+        self._pdoc.close()
+        self._pdoc = None
+        self._parser_doc = None

{docling-1.1.2 → docling-1.5.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
             return merged_cells
         def draw_clusters_and_cells():
-            image = self.get_page_image()
+            image = (
+                self.get_page_image()
+            )  # make new image to avoid drawing on the saved ones
             draw = ImageDraw.Draw(image)
             for c in cells:
                 x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -199,7 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
+    def __init__(self, path_or_stream: Union[BytesIO, Path]):
         super().__init__(path_or_stream)
         self._pdoc = pdfium.PdfDocument(path_or_stream)

{docling-1.1.2 → docling-1.5.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -1,10 +1,12 @@
 import copy
+import warnings
 from enum import Enum, auto
 from io import BytesIO
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict, model_validator
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from typing_extensions import Self
 from docling.backend.abstract_backend import PdfPageBackend
@@ -234,14 +236,30 @@ class Page(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
     page_no: int
-    page_hash: str = None
-    size: PageSize = None
-    image: Image = None
+    page_hash: Optional[str] = None
+    size: Optional[PageSize] = None
     cells: List[Cell] = None
     predictions: PagePredictions = PagePredictions()
-    assembled: AssembledUnit = None
+    assembled: Optional[AssembledUnit] = None
-    _backend: PdfPageBackend = None  # Internal PDF backend
+    _backend: Optional[PdfPageBackend] = (
+        None  # Internal PDF backend. By default it is cleared during assembling.
+    )
+    _default_image_scale: float = 1.0  # Default image scale for external usage.
+    _image_cache: Dict[float, Image] = (
+        {}
+    )  # Cache of images in different scales. By default it is cleared during assembling.
+    def get_image(self, scale: float = 1.0) -> Optional[Image]:
+        if self._backend is None:
+            return self._image_cache.get(scale, None)
+        if not scale in self._image_cache:
+            self._image_cache[scale] = self._backend.get_page_image(scale=scale)
+        return self._image_cache[scale]
+    @property
+    def image(self) -> Optional[Image]:
+        return self.get_image(scale=self._default_image_scale)
 class DocumentStream(BaseModel):
@@ -265,3 +283,22 @@ class PipelineOptions(BaseModel):
     do_ocr: bool = False  # True: perform OCR, replace programmatic PDF text
     table_structure_options: TableStructureOptions = TableStructureOptions()
+class AssembleOptions(BaseModel):
+    keep_page_images: Annotated[
+        bool,
+        Field(
+            deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
+        ),
+    ] = False  # False: page images are removed in the assemble step
+    images_scale: Optional[float] = None  # if set, the scale for generated images
+    @model_validator(mode="after")
+    def set_page_images_from_deprecated(self) -> Self:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", DeprecationWarning)
+            default_scale = 1.0
+            if self.keep_page_images and self.images_scale is None:
+                self.images_scale = default_scale
+        return self

{docling-1.1.2 → docling-1.5.0}/docling/datamodel/document.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
 from docling_core.types import BaseCell, BaseText
 from docling_core.types import BoundingBox as DsBoundingBox
@@ -14,13 +14,14 @@ from docling_core.types import TableCell
 from pydantic import BaseModel
 from docling.backend.abstract_backend import PdfDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import (
     AssembledUnit,
     ConversionStatus,
     DocumentStream,
     FigureElement,
     Page,
+    PageElement,
     TableElement,
     TextElement,
 )
@@ -64,7 +65,7 @@ class InputDocument(BaseModel):
         path_or_stream: Union[BytesIO, Path],
         filename: Optional[str] = None,
         limits: Optional[DocumentLimits] = None,
-        pdf_backend=PyPdfiumDocumentBackend,
+        pdf_backend=DoclingParseDocumentBackend,
     ):
         super().__init__()
@@ -302,13 +303,27 @@ class ConvertedDocument(BaseModel):
         else:
             return ""
+    def render_element_images(
+        self, element_types: Tuple[PageElement] = (FigureElement,)
+    ):
+        for element in self.assembled.elements:
+            if isinstance(element, element_types):
+                page_ix = element.page_no
+                scale = self.pages[page_ix]._default_image_scale
+                crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
+                    page_height=self.pages[page_ix].size.height * scale
+                )
+                cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
+                yield element, cropped_im
 class DocumentConversionInput(BaseModel):
     _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()
-    DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
+    DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
     def docs(
         self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None

{docling-1.1.2 → docling-1.5.0}/docling/document_converter.py RENAMED Viewed

@@ -14,6 +14,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
 from docling.backend.abstract_backend import PdfDocumentBackend
 from docling.datamodel.base_models import (
     AssembledUnit,
+    AssembleOptions,
     ConversionStatus,
     Page,
     PipelineOptions,
@@ -44,6 +45,7 @@ class DocumentConverter:
         pipeline_options: PipelineOptions = PipelineOptions(),
         pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
         pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
+        assemble_options: AssembleOptions = AssembleOptions(),
     ):
         if not artifacts_path:
             artifacts_path = self.download_models_hf()
@@ -57,6 +59,7 @@ class DocumentConverter:
         self.page_assemble_model = PageAssembleModel(config={})
         self.glm_model = GlmModel(config={})
         self.pdf_backend = pdf_backend
+        self.assemble_options = assemble_options
     @staticmethod
     def download_models_hf(
@@ -174,17 +177,21 @@ class DocumentConverter:
                     pages_with_images,
                 )
+                # 4. Run pipeline stages
                 pipeline_pages = self.model_pipeline.apply(pages_with_cells)
-                # 7. Assemble page elements (per page)
+                # 5. Assemble page elements (per page)
                 assembled_pages = self.page_assemble_model(pipeline_pages)
                 # exhaust assembled_pages
                 for assembled_page in assembled_pages:
                     # Free up mem resources before moving on with next batch
-                    assembled_page.image = (
-                        None  # Comment this if you want to visualize page images
-                    )
+                    # Remove page images (can be disabled)
+                    if self.assemble_options.images_scale is None:
+                        assembled_page._image_cache = {}
+                    # Unload backend
                     assembled_page._backend.unload()
                     all_assembled_pages.append(assembled_page)
@@ -222,7 +229,15 @@ class DocumentConverter:
     # Generate the page image and store it in the page object
     def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        page.image = page._backend.get_page_image()
+        # default scale
+        page.get_image(scale=1.0)
+        # user requested scales
+        if self.assemble_options.images_scale is not None:
+            page._default_image_scale = self.assemble_options.images_scale
+            page.get_image(
+                scale=self.assemble_options.images_scale
+            )  # this will trigger storing the image in the internal cache
         return page
@@ -238,7 +253,7 @@ class DocumentConverter:
                 draw.rectangle([(x0, y0), (x1, y1)], outline="red")
             image.show()
-        # draw_text_boxes(page.image, cells)
+        # draw_text_boxes(page.get_image(scale=1.0), cells)
         return page

{docling-1.1.2 → docling-1.5.0}/docling/models/easyocr_model.py RENAMED Viewed

@@ -30,7 +30,7 @@ class EasyOcrModel:
         for page in page_batch:
             # rects = page._fpage.
-            high_res_image = page._backend.get_page_image(scale=self.scale)
+            high_res_image = page.get_image(scale=self.scale)
             im = numpy.array(high_res_image)
             result = self.reader.readtext(im)

{docling-1.1.2 → docling-1.5.0}/docling/models/layout_model.py RENAMED Viewed

@@ -69,6 +69,10 @@ class LayoutModel:
             "Key-Value Region": 0.45,
         }
+        CLASS_REMAPPINGS = {
+            "Document Index": "Table",
+        }
         _log.debug("================= Start postprocess function ====================")
         start_time = time.time()
         # Apply Confidence Threshold to cluster predictions
@@ -79,6 +83,10 @@ class LayoutModel:
             confidence = CLASS_THRESHOLDS[cluster.label]
             if cluster.confidence >= confidence:
                 # annotation["created_by"] = "high_conf_pred"
+                # Remap class labels where needed.
+                if cluster.label in CLASS_REMAPPINGS.keys():
+                    cluster.label = CLASS_REMAPPINGS[cluster.label]
                 clusters_out.append(cluster)
         # map to dictionary clusters and cells, with bottom left origin
@@ -259,7 +267,9 @@ class LayoutModel:
     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
         for page in page_batch:
             clusters = []
-            for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
+            for ix, pred_item in enumerate(
+                self.layout_predictor.predict(page.get_image(scale=1.0))
+            ):
                 cluster = Cluster(
                     id=ix,
                     label=pred_item["label"],

{docling-1.1.2 → docling-1.5.0}/docling/models/table_structure_model.py RENAMED Viewed

@@ -34,7 +34,9 @@ class TableStructureModel:
             self.scale = 2.0  # Scale up table input images to 144 dpi
     def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
-        image = page._backend.get_page_image()
+        image = (
+            page._backend.get_page_image()
+        )  # make new image to avoid drawing on the saved ones
         draw = ImageDraw.Draw(image)
         for table_element in tbl_list:
@@ -94,13 +96,7 @@ class TableStructureModel:
                 "width": page.size.width * self.scale,
                 "height": page.size.height * self.scale,
             }
-            # add image to page input.
-            if self.scale == 1.0:
-                page_input["image"] = numpy.asarray(page.image)
-            else:  # render new page image on the fly at desired scale
-                page_input["image"] = numpy.asarray(
-                    page._backend.get_page_image(scale=self.scale)
-                )
+            page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
             table_clusters, table_bboxes = zip(*in_tables)

{docling-1.1.2 → docling-1.5.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "1.1.2"  # DO NOT EDIT, updated automatically
+version = "1.5.0"  # DO NOT EDIT, updated automatically
 description = "Docling PDF conversion package"
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -24,7 +24,7 @@ packages = [{include = "docling"}]
 python = "^3.10"
 pydantic = "^2.0.0"
 docling-core = "^1.1.2"
-docling-ibm-models = "^1.1.0"
+docling-ibm-models = "^1.1.1"
 deepsearch-glm = ">=0.19.0,<1"
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
@@ -32,6 +32,8 @@ pydantic-settings = "^2.3.0"
 huggingface_hub = ">=0.23,<1"
 requests = "^2.32.3"
 easyocr = { version = "^1.7", optional = true }
+docling-parse = "^0.2.0"
+certifi = ">=2024.7.4"
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}