PyPI - docling - Versions diffs - 2.15.0__py3-none-any.whl → 2.16.0__py3-none-any.whl - Mend

docling 2.15.0__py3-none-any.whl → 2.16.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (34)

docling/backend/abstract_backend.py +0 -1
docling/backend/asciidoc_backend.py +0 -1
docling/backend/docling_parse_backend.py +2 -2
docling/backend/docling_parse_v2_backend.py +2 -2
docling/backend/html_backend.py +1 -1
docling/backend/json/__init__.py +0 -0
docling/backend/json/docling_json_backend.py +58 -0
docling/backend/md_backend.py +44 -27
docling/backend/msexcel_backend.py +50 -38
docling/backend/msword_backend.py +0 -1
docling/backend/pdf_backend.py +0 -2
docling/backend/pypdfium2_backend.py +2 -2
docling/datamodel/base_models.py +30 -3
docling/datamodel/document.py +2 -0
docling/datamodel/pipeline_options.py +7 -10
docling/document_converter.py +4 -0
docling/models/base_model.py +62 -6
docling/models/base_ocr_model.py +15 -12
docling/models/code_formula_model.py +245 -0
docling/models/document_picture_classifier.py +187 -0
docling/models/layout_model.py +10 -86
docling/models/page_assemble_model.py +1 -33
docling/models/tesseract_ocr_cli_model.py +0 -1
docling/models/tesseract_ocr_model.py +63 -15
docling/pipeline/base_pipeline.py +40 -17
docling/pipeline/standard_pdf_pipeline.py +31 -2
docling/utils/glm_utils.py +4 -1
docling/utils/visualization.py +80 -0
{docling-2.15.0.dist-info → docling-2.16.0.dist-info}/METADATA +7 -7
docling-2.16.0.dist-info/RECORD +61 -0
docling-2.15.0.dist-info/RECORD +0 -56
{docling-2.15.0.dist-info → docling-2.16.0.dist-info}/LICENSE +0 -0
{docling-2.15.0.dist-info → docling-2.16.0.dist-info}/WHEEL +0 -0
{docling-2.15.0.dist-info → docling-2.16.0.dist-info}/entry_points.txt +0 -0

docling/models/layout_model.py CHANGED Viewed

@@ -1,28 +1,21 @@
 import copy
 import logging
-import random
-import time
 from pathlib import Path
-from typing import Iterable, List
+from typing import Iterable
-from docling_core.types.doc import CoordOrigin, DocItemLabel
+from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
-from PIL import Image, ImageDraw, ImageFont
-from docling.datamodel.base_models import (
-    BoundingBox,
-    Cell,
-    Cluster,
-    LayoutPrediction,
-    Page,
-)
+from PIL import Image
+from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options import AcceleratorOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.layout_postprocessor import LayoutPostprocessor
 from docling.utils.profiling import TimeRecorder
+from docling.utils.visualization import draw_clusters
 _log = logging.getLogger(__name__)
@@ -40,7 +33,7 @@ class LayoutModel(BasePageModel):
         DocItemLabel.PAGE_FOOTER,
         DocItemLabel.CODE,
         DocItemLabel.LIST_ITEM,
-        # "Formula",
+        DocItemLabel.FORMULA,
     ]
     PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
@@ -82,78 +75,9 @@ class LayoutModel(BasePageModel):
         left_image = copy.deepcopy(page.image)
         right_image = copy.deepcopy(page.image)
-        # Function to draw clusters on an image
-        def draw_clusters(image, clusters):
-            draw = ImageDraw.Draw(image, "RGBA")
-            # Create a smaller font for the labels
-            try:
-                font = ImageFont.truetype("arial.ttf", 12)
-            except OSError:
-                # Fallback to default font if arial is not available
-                font = ImageFont.load_default()
-            for c_tl in clusters:
-                all_clusters = [c_tl, *c_tl.children]
-                for c in all_clusters:
-                    # Draw cells first (underneath)
-                    cell_color = (0, 0, 0, 40)  # Transparent black for cells
-                    for tc in c.cells:
-                        cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
-                        cx0 *= scale_x
-                        cx1 *= scale_x
-                        cy0 *= scale_x
-                        cy1 *= scale_y
-                        draw.rectangle(
-                            [(cx0, cy0), (cx1, cy1)],
-                            outline=None,
-                            fill=cell_color,
-                        )
-                    # Draw cluster rectangle
-                    x0, y0, x1, y1 = c.bbox.as_tuple()
-                    x0 *= scale_x
-                    x1 *= scale_x
-                    y0 *= scale_x
-                    y1 *= scale_y
-                    cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
-                    cluster_outline_color = (
-                        *list(DocItemLabel.get_color(c.label)),
-                        255,
-                    )
-                    draw.rectangle(
-                        [(x0, y0), (x1, y1)],
-                        outline=cluster_outline_color,
-                        fill=cluster_fill_color,
-                    )
-                    # Add label name and confidence
-                    label_text = f"{c.label.name} ({c.confidence:.2f})"
-                    # Create semi-transparent background for text
-                    text_bbox = draw.textbbox((x0, y0), label_text, font=font)
-                    text_bg_padding = 2
-                    draw.rectangle(
-                        [
-                            (
-                                text_bbox[0] - text_bg_padding,
-                                text_bbox[1] - text_bg_padding,
-                            ),
-                            (
-                                text_bbox[2] + text_bg_padding,
-                                text_bbox[3] + text_bg_padding,
-                            ),
-                        ],
-                        fill=(255, 255, 255, 180),  # Semi-transparent white
-                    )
-                    # Draw text
-                    draw.text(
-                        (x0, y0),
-                        label_text,
-                        fill=(0, 0, 0, 255),  # Solid black
-                        font=font,
-                    )
         # Draw clusters on both images
-        draw_clusters(left_image, left_clusters)
-        draw_clusters(right_image, right_clusters)
+        draw_clusters(left_image, left_clusters, scale_x, scale_y)
+        draw_clusters(right_image, right_clusters, scale_x, scale_y)
         # Combine the images side by side
         combined_width = left_image.width * 2
         combined_height = left_image.height

docling/models/page_assemble_model.py CHANGED Viewed

@@ -22,7 +22,7 @@ _log = logging.getLogger(__name__)
 class PageAssembleOptions(BaseModel):
-    keep_images: bool = False
+    pass
 class PageAssembleModel(BasePageModel):
@@ -135,31 +135,6 @@ class PageAssembleModel(BasePageModel):
                                 )
                             elements.append(fig)
                             body.append(fig)
-                        elif cluster.label == LayoutModel.FORMULA_LABEL:
-                            equation = None
-                            if page.predictions.equations_prediction:
-                                equation = page.predictions.equations_prediction.equation_map.get(
-                                    cluster.id, None
-                                )
-                            if (
-                                not equation
-                            ):  # fallback: add empty formula, if it isn't present
-                                text = self.sanitize_text(
-                                    [
-                                        cell.text.replace("\x02", "-").strip()
-                                        for cell in cluster.cells
-                                        if len(cell.text.strip()) > 0
-                                    ]
-                                )
-                                equation = TextElement(
-                                    label=cluster.label,
-                                    id=cluster.id,
-                                    cluster=cluster,
-                                    page_no=page.page_no,
-                                    text=text,
-                                )
-                            elements.append(equation)
-                            body.append(equation)
                         elif cluster.label in LayoutModel.CONTAINER_LABELS:
                             container_el = ContainerElement(
                                 label=cluster.label,
@@ -174,11 +149,4 @@ class PageAssembleModel(BasePageModel):
                         elements=elements, headers=headers, body=body
                     )
-                    # Remove page images (can be disabled)
-                    if not self.options.keep_images:
-                        page._image_cache = {}
-                    # Unload backend
-                    page._backend.unload()
                 yield page

docling/models/tesseract_ocr_cli_model.py CHANGED Viewed

@@ -20,7 +20,6 @@ _log = logging.getLogger(__name__)
 class TesseractOcrCliModel(BaseOcrModel):
     def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
         super().__init__(enabled=enabled, options=options)
         self.options: TesseractCliOcrOptions

docling/models/tesseract_ocr_model.py CHANGED Viewed

@@ -54,43 +54,56 @@ class TesseractOcrModel(BaseOcrModel):
             # Initialize the tesseractAPI
             _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)
+            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
+            if any([l.startswith("script/") for l in tesserocr_languages]):
+                self.script_prefix = "script/"
+            else:
+                self.script_prefix = ""
+            tesserocr_kwargs = {
+                "psm": tesserocr.PSM.AUTO,
+                "init": True,
+                "oem": tesserocr.OEM.DEFAULT,
+            }
             if self.options.path is not None:
+                tesserocr_kwargs["path"] = self.options.path
+            if lang == "auto":
                 self.reader = tesserocr.PyTessBaseAPI(
-                    path=self.options.path,
-                    lang=lang,
-                    psm=tesserocr.PSM.AUTO,
-                    init=True,
-                    oem=tesserocr.OEM.DEFAULT,
+                    **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
                 )
             else:
                 self.reader = tesserocr.PyTessBaseAPI(
-                    lang=lang,
-                    psm=tesserocr.PSM.AUTO,
-                    init=True,
-                    oem=tesserocr.OEM.DEFAULT,
+                    **{"lang": lang} | tesserocr_kwargs,
                 )
             self.reader_RIL = tesserocr.RIL
     def __del__(self):
         if self.reader is not None:
             # Finalize the tesseractAPI
             self.reader.End()
+        for script in self.script_readers:
+            self.script_readers[script].End()
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
             return
+        import tesserocr
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
                     assert self.reader is not None
                     ocr_rects = self.get_ocr_rects(page)
@@ -106,20 +119,55 @@ class TesseractOcrModel(BaseOcrModel):
                         # Retrieve text snippets with their bounding boxes
                         self.reader.SetImage(high_res_image)
-                        boxes = self.reader.GetComponentImages(
+                        if self.options.lang == ["auto"]:
+                            osd = self.reader.DetectOrientationScript()
+                            # No text, probably
+                            if osd is None:
+                                continue
+                            script = osd["script_name"]
+                            if script == "Katakana" or script == "Hiragana":
+                                script = "Japanese"
+                            elif script == "Han":
+                                script = "HanS"
+                            elif script == "Korean":
+                                script = "Hangul"
+                            _log.debug(
+                                f'Using model for the detected script "{script}"'
+                            )
+                            if script not in self.script_readers:
+                                self.script_readers[script] = tesserocr.PyTessBaseAPI(
+                                    path=self.reader.GetDatapath(),
+                                    lang=f"{self.script_prefix}{script}",
+                                    psm=tesserocr.PSM.AUTO,
+                                    init=True,
+                                    oem=tesserocr.OEM.DEFAULT,
+                                )
+                            local_reader = self.script_readers[script]
+                            local_reader.SetImage(high_res_image)
+                        else:
+                            local_reader = self.reader
+                        boxes = local_reader.GetComponentImages(
                             self.reader_RIL.TEXTLINE, True
                         )
                         cells = []
                         for ix, (im, box, _, _) in enumerate(boxes):
                             # Set the area of interest. Tesseract uses Bottom-Left for the origin
-                            self.reader.SetRectangle(
+                            local_reader.SetRectangle(
                                 box["x"], box["y"], box["w"], box["h"]
                             )
                             # Extract text within the bounding box
-                            text = self.reader.GetUTF8Text().strip()
-                            confidence = self.reader.MeanTextConf()
+                            text = local_reader.GetUTF8Text().strip()
+                            confidence = local_reader.MeanTextConf()
                             left = box["x"] / self.scale
                             bottom = box["y"] / self.scale
                             right = (box["x"] + box["w"]) / self.scale

docling/pipeline/base_pipeline.py CHANGED Viewed

@@ -3,7 +3,7 @@ import logging
 import time
 import traceback
 from abc import ABC, abstractmethod
-from typing import Callable, Iterable, List
+from typing import Any, Callable, Iterable, List
 from docling_core.types.doc import DoclingDocument, NodeItem
@@ -18,7 +18,7 @@ from docling.datamodel.base_models import (
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
-from docling.models.base_model import BaseEnrichmentModel
+from docling.models.base_model import GenericEnrichmentModel
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import chunkify
@@ -28,8 +28,9 @@ _log = logging.getLogger(__name__)
 class BasePipeline(ABC):
     def __init__(self, pipeline_options: PipelineOptions):
         self.pipeline_options = pipeline_options
+        self.keep_images = False
         self.build_pipe: List[Callable] = []
-        self.enrichment_pipe: List[BaseEnrichmentModel] = []
+        self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
     def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
         conv_res = ConversionResult(input=in_doc)
@@ -40,7 +41,7 @@ class BasePipeline(ABC):
                 conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
             ):
                 # These steps are building and assembling the structure of the
-                # output DoclingDocument
+                # output DoclingDocument.
                 conv_res = self._build_document(conv_res)
                 conv_res = self._assemble_document(conv_res)
                 # From this stage, all operations should rely only on conv_res.output
@@ -50,6 +51,8 @@ class BasePipeline(ABC):
             conv_res.status = ConversionStatus.FAILURE
             if raises_on_error:
                 raise e
+        finally:
+            self._unload(conv_res)
         return conv_res
@@ -62,21 +65,22 @@ class BasePipeline(ABC):
     def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
-        def _filter_elements(
-            doc: DoclingDocument, model: BaseEnrichmentModel
+        def _prepare_elements(
+            conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
         ) -> Iterable[NodeItem]:
-            for element, _level in doc.iterate_items():
-                if model.is_processable(doc=doc, element=element):
-                    yield element
+            for doc_element, _level in conv_res.document.iterate_items():
+                prepared_element = model.prepare_element(
+                    conv_res=conv_res, element=doc_element
+                )
+                if prepared_element is not None:
+                    yield prepared_element
         with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
             for model in self.enrichment_pipe:
                 for element_batch in chunkify(
-                    _filter_elements(conv_res.document, model),
+                    _prepare_elements(conv_res, model),
                     settings.perf.elements_batch_size,
                 ):
-                    # TODO: currently we assume the element itself is modified, because
-                    # we don't have an interface to save the element back to the document
                     for element in model(
                         doc=conv_res.document, element_batch=element_batch
                     ):  # Must exhaust!
@@ -88,6 +92,9 @@ class BasePipeline(ABC):
     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
         pass
+    def _unload(self, conv_res: ConversionResult):
+        pass
     @classmethod
     @abstractmethod
     def get_default_options(cls) -> PipelineOptions:
@@ -107,6 +114,10 @@ class BasePipeline(ABC):
 class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
+    def __init__(self, pipeline_options: PipelineOptions):
+        super().__init__(pipeline_options)
+        self.keep_backend = False
     def _apply_on_pages(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -148,7 +159,14 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                     pipeline_pages = self._apply_on_pages(conv_res, init_pages)
                     for p in pipeline_pages:  # Must exhaust!
-                        pass
+                        # Cleanup cached images
+                        if not self.keep_images:
+                            p._image_cache = {}
+                        # Cleanup page backends
+                        if not self.keep_backend and p._backend is not None:
+                            p._backend.unload()
                     end_batch_time = time.monotonic()
                     total_elapsed_time += end_batch_time - start_batch_time
@@ -177,10 +195,15 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                 )
                 raise e
-            finally:
-                # Always unload the PDF backend, even in case of failure
-                if conv_res.input._backend:
-                    conv_res.input._backend.unload()
+        return conv_res
+    def _unload(self, conv_res: ConversionResult) -> ConversionResult:
+        for page in conv_res.pages:
+            if page._backend is not None:
+                page._backend.unload()
+        if conv_res.input._backend:
+            conv_res.input._backend.unload()
         return conv_res

docling/pipeline/standard_pdf_pipeline.py CHANGED Viewed

@@ -18,6 +18,11 @@ from docling.datamodel.pipeline_options import (
     TesseractOcrOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
+from docling.models.document_picture_classifier import (
+    DocumentPictureClassifier,
+    DocumentPictureClassifierOptions,
+)
 from docling.models.ds_glm_model import GlmModel, GlmOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
@@ -50,7 +55,7 @@ class StandardPdfPipeline(PaginatedPipeline):
         else:
             self.artifacts_path = Path(pipeline_options.artifacts_path)
-        keep_images = (
+        self.keep_images = (
             self.pipeline_options.generate_page_images
             or self.pipeline_options.generate_picture_images
             or self.pipeline_options.generate_table_images
@@ -87,13 +92,37 @@ class StandardPdfPipeline(PaginatedPipeline):
                 accelerator_options=pipeline_options.accelerator_options,
             ),
             # Page assemble
-            PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
+            PageAssembleModel(options=PageAssembleOptions()),
         ]
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
+            # Code Formula Enrichment Model
+            CodeFormulaModel(
+                enabled=pipeline_options.do_code_enrichment
+                or pipeline_options.do_formula_enrichment,
+                artifacts_path=pipeline_options.artifacts_path,
+                options=CodeFormulaModelOptions(
+                    do_code_enrichment=pipeline_options.do_code_enrichment,
+                    do_formula_enrichment=pipeline_options.do_formula_enrichment,
+                ),
+                accelerator_options=pipeline_options.accelerator_options,
+            ),
+            # Document Picture Classifier
+            DocumentPictureClassifier(
+                enabled=pipeline_options.do_picture_classification,
+                artifacts_path=pipeline_options.artifacts_path,
+                options=DocumentPictureClassifierOptions(),
+                accelerator_options=pipeline_options.accelerator_options,
+            ),
         ]
+        if (
+            self.pipeline_options.do_formula_enrichment
+            or self.pipeline_options.do_code_enrichment
+        ):
+            self.keep_backend = True
     @staticmethod
     def download_models_hf(
         local_dir: Optional[Path] = None, force: bool = False

docling/utils/glm_utils.py CHANGED Viewed

@@ -270,7 +270,6 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
             container_el = doc.add_group(label=group_label)
             _add_child_elements(container_el, doc, obj, pelem)
         elif "text" in obj:
             text = obj["text"][span_i:span_j]
@@ -304,6 +303,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
                 current_list = None
                 doc.add_heading(text=text, prov=prov)
+            elif label == DocItemLabel.CODE:
+                current_list = None
+                doc.add_code(text=text, prov=prov)
             else:
                 current_list = None

docling/utils/visualization.py ADDED Viewed

@@ -0,0 +1,80 @@
+from docling_core.types.doc import DocItemLabel
+from PIL import Image, ImageDraw, ImageFont
+from PIL.ImageFont import FreeTypeFont
+from docling.datamodel.base_models import Cluster
+def draw_clusters(
+    image: Image.Image, clusters: list[Cluster], scale_x: float, scale_y: float
+) -> None:
+    """
+    Draw clusters on an image
+    """
+    draw = ImageDraw.Draw(image, "RGBA")
+    # Create a smaller font for the labels
+    font: ImageFont.ImageFont | FreeTypeFont
+    try:
+        font = ImageFont.truetype("arial.ttf", 12)
+    except OSError:
+        # Fallback to default font if arial is not available
+        font = ImageFont.load_default()
+    for c_tl in clusters:
+        all_clusters = [c_tl, *c_tl.children]
+        for c in all_clusters:
+            # Draw cells first (underneath)
+            cell_color = (0, 0, 0, 40)  # Transparent black for cells
+            for tc in c.cells:
+                cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
+                cx0 *= scale_x
+                cx1 *= scale_x
+                cy0 *= scale_x
+                cy1 *= scale_y
+                draw.rectangle(
+                    [(cx0, cy0), (cx1, cy1)],
+                    outline=None,
+                    fill=cell_color,
+                )
+            # Draw cluster rectangle
+            x0, y0, x1, y1 = c.bbox.as_tuple()
+            x0 *= scale_x
+            x1 *= scale_x
+            y0 *= scale_x
+            y1 *= scale_y
+            cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
+            cluster_outline_color = (
+                *list(DocItemLabel.get_color(c.label)),
+                255,
+            )
+            draw.rectangle(
+                [(x0, y0), (x1, y1)],
+                outline=cluster_outline_color,
+                fill=cluster_fill_color,
+            )
+            # Add label name and confidence
+            label_text = f"{c.label.name} ({c.confidence:.2f})"
+            # Create semi-transparent background for text
+            text_bbox = draw.textbbox((x0, y0), label_text, font=font)
+            text_bg_padding = 2
+            draw.rectangle(
+                [
+                    (
+                        text_bbox[0] - text_bg_padding,
+                        text_bbox[1] - text_bg_padding,
+                    ),
+                    (
+                        text_bbox[2] + text_bg_padding,
+                        text_bbox[3] + text_bg_padding,
+                    ),
+                ],
+                fill=(255, 255, 255, 180),  # Semi-transparent white
+            )
+            # Draw text
+            draw.text(
+                (x0, y0),
+                label_text,
+                fill=(0, 0, 0, 255),  # Solid black
+                font=font,
+            )

{docling-2.15.0.dist-info → docling-2.16.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.15.0
+Version: 2.16.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -26,9 +26,9 @@ Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
-Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
-Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
+Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
+Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
+Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,13 +39,14 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
 Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
 Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
+Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
 Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
-Requires-Dist: requests (>=2.32.3,<3.0.0)
+Requires-Dist: requests (>=2.32.2,<3.0.0)
 Requires-Dist: rtree (>=1.3.0,<2.0.0)
 Requires-Dist: scipy (>=1.6.0,<2.0.0)
 Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
@@ -84,7 +85,7 @@ Docling parses documents and exports them to the desired format with ease and sp
 * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
-* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
+* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 OCR support for scanned PDFs
 * 💻 Simple and convenient CLI
@@ -94,7 +95,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
 * ♾️ Equation & code extraction
 * 📝 Metadata extraction, including title, authors, references & language
-* 🦜🔗 Native LangChain extension
 ## Installation

docling 2.15.0__py3-none-any.whl → 2.16.0__py3-none-any.whl

docling 2.15.0__py3-none-any.whl → 2.16.0__py3-none-any.whl