PyPI - docling - Versions diffs - 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl - Mend

docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

docling/backend/abstract_backend.py +0 -1
docling/backend/asciidoc_backend.py +0 -1
docling/backend/docling_parse_backend.py +1 -1
docling/backend/docling_parse_v2_backend.py +1 -1
docling/backend/html_backend.py +4 -3
docling/backend/json/__init__.py +0 -0
docling/backend/json/docling_json_backend.py +58 -0
docling/backend/md_backend.py +49 -36
docling/backend/msexcel_backend.py +50 -38
docling/backend/msword_backend.py +0 -1
docling/backend/pdf_backend.py +0 -2
docling/backend/pypdfium2_backend.py +1 -1
docling/backend/xml/uspto_backend.py +25 -25
docling/cli/main.py +18 -3
docling/datamodel/base_models.py +30 -3
docling/datamodel/document.py +4 -0
docling/datamodel/pipeline_options.py +7 -9
docling/document_converter.py +4 -0
docling/models/base_model.py +62 -6
docling/models/code_formula_model.py +245 -0
docling/models/document_picture_classifier.py +187 -0
docling/models/layout_model.py +10 -86
docling/models/page_assemble_model.py +1 -33
docling/models/rapid_ocr_model.py +1 -0
docling/models/tesseract_ocr_cli_model.py +72 -5
docling/models/tesseract_ocr_model.py +68 -20
docling/pipeline/base_pipeline.py +40 -17
docling/pipeline/standard_pdf_pipeline.py +31 -2
docling/utils/glm_utils.py +4 -1
docling/utils/ocr_utils.py +9 -0
docling/utils/visualization.py +80 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/METADATA +17 -13
docling-2.17.0.dist-info/RECORD +62 -0
docling-2.15.1.dist-info/RECORD +0 -56
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/LICENSE +0 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/WHEEL +0 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/entry_points.txt +0 -0

docling/models/tesseract_ocr_cli_model.py CHANGED Viewed

@@ -4,7 +4,7 @@ import logging
 import os
 import tempfile
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -14,13 +14,13 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.ocr_utils import map_tesseract_script
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
 class TesseractOcrCliModel(BaseOcrModel):
     def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
         super().__init__(enabled=enabled, options=options)
         self.options: TesseractCliOcrOptions
@@ -29,10 +29,13 @@ class TesseractOcrCliModel(BaseOcrModel):
         self._name: Optional[str] = None
         self._version: Optional[str] = None
+        self._tesseract_languages: Optional[List[str]] = None
+        self._script_prefix: Optional[str] = None
         if self.enabled:
             try:
                 self._get_name_and_version()
+                self._set_languages_and_prefix()
             except Exception as exc:
                 raise RuntimeError(
@@ -74,12 +77,20 @@ class TesseractOcrCliModel(BaseOcrModel):
         return name, version
     def _run_tesseract(self, ifilename: str):
+        r"""
+        Run tesseract CLI
+        """
         cmd = [self.options.tesseract_cmd]
-        if self.options.lang is not None and len(self.options.lang) > 0:
+        if "auto" in self.options.lang:
+            lang = self._detect_language(ifilename)
+            if lang is not None:
+                cmd.append("-l")
+                cmd.append(lang)
+        elif self.options.lang is not None and len(self.options.lang) > 0:
             cmd.append("-l")
             cmd.append("+".join(self.options.lang))
         if self.options.path is not None:
             cmd.append("--tessdata-dir")
             cmd.append(self.options.path)
@@ -107,6 +118,63 @@ class TesseractOcrCliModel(BaseOcrModel):
         return df_filtered
+    def _detect_language(self, ifilename: str):
+        r"""
+        Run tesseract in PSM 0 mode to detect the language
+        :param ifilename: path of the image file passed to the tesseract CLI.
+        :returns: the script prefix joined with the mapped script name when a
+            script is reported and that language is among the installed ones,
+            otherwise ``None``.
+        """
+        assert self._tesseract_languages is not None
+        cmd = [self.options.tesseract_cmd]
+        cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
+        _log.info("command: {}".format(" ".join(cmd)))
+        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
+        output, _ = proc.communicate()
+        decoded_data = output.decode("utf-8")
+        # pd.read_csv raises EmptyDataError on empty input, so an OSD run that
+        # wrote nothing to stdout is reported as "no script detected" instead
+        # of aborting the OCR call.
+        if not decoded_data.strip():
+            _log.warning("Tesseract cannot detect the script of the page")
+            return None
+        # The output is parsed as "key: value" lines
+        df = pd.read_csv(
+            io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
+        )
+        scripts = df.loc[df["key"] == "Script"].value.tolist()
+        if len(scripts) == 0:
+            _log.warning("Tesseract cannot detect the script of the page")
+            return None
+        script = map_tesseract_script(scripts[0].strip())
+        lang = f"{self._script_prefix}{script}"
+        # Check if the detected language has been installed
+        if lang not in self._tesseract_languages:
+            msg = f"Tesseract detected the script '{script}' and language '{lang}'."
+            msg += " However this language is not installed in your system and will be ignored."
+            _log.warning(msg)
+            return None
+        _log.debug(
+            f"Using tesseract model for the detected script '{script}' and language '{lang}'"
+        )
+        return lang
+    def _set_languages_and_prefix(self):
+        r"""
+        Query the tesseract CLI for its installed languages and choose the script prefix
+        Stores the language list in ``self._tesseract_languages`` and the prefix
+        ("script/" or "") in ``self._script_prefix``.
+        """
+        # Ask tesseract for the list of languages
+        cmd = [self.options.tesseract_cmd, "--list-langs"]
+        _log.info("command: {}".format(" ".join(cmd)))
+        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
+        output, _ = proc.communicate()
+        listing = pd.read_csv(io.StringIO(output.decode("utf-8")), header=None)
+        # The first parsed row is dropped (presumably the listing's header line)
+        installed = listing[0].tolist()[1:]
+        self._tesseract_languages = installed
+        # Use the "script/" prefix only if at least one installed language carries it
+        has_script_entries = any(
+            [name.startswith("script/") for name in installed]
+        )
+        self._script_prefix = "script/" if has_script_entries else ""
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -121,7 +189,6 @@ class TesseractOcrCliModel(BaseOcrModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
                     ocr_rects = self.get_ocr_rects(page)
                     all_ocr_cells = []

docling/models/tesseract_ocr_model.py CHANGED Viewed

@@ -8,6 +8,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.ocr_utils import map_tesseract_script
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -20,6 +21,7 @@ class TesseractOcrModel(BaseOcrModel):
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
         self.reader = None
+        self.osd_reader = None
         if self.enabled:
             install_errmsg = (
@@ -47,27 +49,38 @@ class TesseractOcrModel(BaseOcrModel):
             except:
                 raise ImportError(install_errmsg)
-            _, tesserocr_languages = tesserocr.get_languages()
-            if not tesserocr_languages:
+            _, self._tesserocr_languages = tesserocr.get_languages()
+            if not self._tesserocr_languages:
                 raise ImportError(missing_langs_errmsg)
             # Initialize the tesseractAPI
             _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)
+            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
+            if any([l.startswith("script/") for l in self._tesserocr_languages]):
+                self.script_prefix = "script/"
+            else:
+                self.script_prefix = ""
+            tesserocr_kwargs = {
+                "psm": tesserocr.PSM.AUTO,
+                "init": True,
+                "oem": tesserocr.OEM.DEFAULT,
+            }
             if self.options.path is not None:
-                self.reader = tesserocr.PyTessBaseAPI(
-                    path=self.options.path,
-                    lang=lang,
-                    psm=tesserocr.PSM.AUTO,
-                    init=True,
-                    oem=tesserocr.OEM.DEFAULT,
+                tesserocr_kwargs["path"] = self.options.path
+            if lang == "auto":
+                self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
+                self.osd_reader = tesserocr.PyTessBaseAPI(
+                    **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
                 )
             else:
                 self.reader = tesserocr.PyTessBaseAPI(
-                    lang=lang,
-                    psm=tesserocr.PSM.AUTO,
-                    init=True,
-                    oem=tesserocr.OEM.DEFAULT,
+                    **{"lang": lang} | tesserocr_kwargs,
                 )
             self.reader_RIL = tesserocr.RIL
@@ -75,11 +88,12 @@ class TesseractOcrModel(BaseOcrModel):
         if self.reader is not None:
             # Finalize the tesseractAPI
             self.reader.End()
+        for script in self.script_readers:
+            self.script_readers[script].End()
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
             return
@@ -90,8 +104,8 @@ class TesseractOcrModel(BaseOcrModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
                     assert self.reader is not None
+                    assert self._tesserocr_languages is not None
                     ocr_rects = self.get_ocr_rects(page)
@@ -104,22 +118,56 @@ class TesseractOcrModel(BaseOcrModel):
                             scale=self.scale, cropbox=ocr_rect
                         )
-                        # Retrieve text snippets with their bounding boxes
-                        self.reader.SetImage(high_res_image)
-                        boxes = self.reader.GetComponentImages(
+                        local_reader = self.reader
+                        if "auto" in self.options.lang:
+                            assert self.osd_reader is not None
+                            self.osd_reader.SetImage(high_res_image)
+                            osd = self.osd_reader.DetectOrientationScript()
+                            # No text, probably
+                            if osd is None:
+                                continue
+                            script = osd["script_name"]
+                            script = map_tesseract_script(script)
+                            lang = f"{self.script_prefix}{script}"
+                            # Check if the detected language is present in the system
+                            if lang not in self._tesserocr_languages:
+                                msg = f"Tesseract detected the script '{script}' and language '{lang}'."
+                                msg += " However this language is not installed in your system and will be ignored."
+                                _log.warning(msg)
+                            else:
+                                if script not in self.script_readers:
+                                    import tesserocr
+                                    self.script_readers[script] = (
+                                        tesserocr.PyTessBaseAPI(
+                                            path=self.reader.GetDatapath(),
+                                            lang=lang,
+                                            psm=tesserocr.PSM.AUTO,
+                                            init=True,
+                                            oem=tesserocr.OEM.DEFAULT,
+                                        )
+                                    )
+                                local_reader = self.script_readers[script]
+                        local_reader.SetImage(high_res_image)
+                        boxes = local_reader.GetComponentImages(
                             self.reader_RIL.TEXTLINE, True
                         )
                         cells = []
                         for ix, (im, box, _, _) in enumerate(boxes):
                             # Set the area of interest. Tesseract uses Bottom-Left for the origin
-                            self.reader.SetRectangle(
+                            local_reader.SetRectangle(
                                 box["x"], box["y"], box["w"], box["h"]
                             )
                             # Extract text within the bounding box
-                            text = self.reader.GetUTF8Text().strip()
-                            confidence = self.reader.MeanTextConf()
+                            text = local_reader.GetUTF8Text().strip()
+                            confidence = local_reader.MeanTextConf()
                             left = box["x"] / self.scale
                             bottom = box["y"] / self.scale
                             right = (box["x"] + box["w"]) / self.scale

docling/pipeline/base_pipeline.py CHANGED Viewed

@@ -3,7 +3,7 @@ import logging
 import time
 import traceback
 from abc import ABC, abstractmethod
-from typing import Callable, Iterable, List
+from typing import Any, Callable, Iterable, List
 from docling_core.types.doc import DoclingDocument, NodeItem
@@ -18,7 +18,7 @@ from docling.datamodel.base_models import (
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
-from docling.models.base_model import BaseEnrichmentModel
+from docling.models.base_model import GenericEnrichmentModel
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.utils.utils import chunkify
@@ -28,8 +28,9 @@ _log = logging.getLogger(__name__)
 class BasePipeline(ABC):
     def __init__(self, pipeline_options: PipelineOptions):
         self.pipeline_options = pipeline_options
+        self.keep_images = False
         self.build_pipe: List[Callable] = []
-        self.enrichment_pipe: List[BaseEnrichmentModel] = []
+        self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
     def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
         conv_res = ConversionResult(input=in_doc)
@@ -40,7 +41,7 @@ class BasePipeline(ABC):
                 conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
             ):
                 # These steps are building and assembling the structure of the
-                # output DoclingDocument
+                # output DoclingDocument.
                 conv_res = self._build_document(conv_res)
                 conv_res = self._assemble_document(conv_res)
                 # From this stage, all operations should rely only on conv_res.output
@@ -50,6 +51,8 @@ class BasePipeline(ABC):
             conv_res.status = ConversionStatus.FAILURE
             if raises_on_error:
                 raise e
+        finally:
+            self._unload(conv_res)
         return conv_res
@@ -62,21 +65,22 @@ class BasePipeline(ABC):
     def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
-        def _filter_elements(
-            doc: DoclingDocument, model: BaseEnrichmentModel
+        def _prepare_elements(
+            conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
         ) -> Iterable[NodeItem]:
-            for element, _level in doc.iterate_items():
-                if model.is_processable(doc=doc, element=element):
-                    yield element
+            for doc_element, _level in conv_res.document.iterate_items():
+                prepared_element = model.prepare_element(
+                    conv_res=conv_res, element=doc_element
+                )
+                if prepared_element is not None:
+                    yield prepared_element
         with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
             for model in self.enrichment_pipe:
                 for element_batch in chunkify(
-                    _filter_elements(conv_res.document, model),
+                    _prepare_elements(conv_res, model),
                     settings.perf.elements_batch_size,
                 ):
-                    # TODO: currently we assume the element itself is modified, because
-                    # we don't have an interface to save the element back to the document
                     for element in model(
                         doc=conv_res.document, element_batch=element_batch
                     ):  # Must exhaust!
@@ -88,6 +92,9 @@ class BasePipeline(ABC):
     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
         pass
+    def _unload(self, conv_res: ConversionResult):
+        """Cleanup hook run from the ``finally`` clause of ``execute``; does nothing here."""
+        return None
     @classmethod
     @abstractmethod
     def get_default_options(cls) -> PipelineOptions:
@@ -107,6 +114,10 @@ class BasePipeline(ABC):
 class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
+    def __init__(self, pipeline_options: PipelineOptions):
+        """Initialise the base pipeline state and default to unloading page backends."""
+        super().__init__(pipeline_options=pipeline_options)
+        # When False, each page backend is unloaded as soon as the page has
+        # passed through the pipeline.
+        self.keep_backend = False
     def _apply_on_pages(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -148,7 +159,14 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                     pipeline_pages = self._apply_on_pages(conv_res, init_pages)
                     for p in pipeline_pages:  # Must exhaust!
-                        pass
+                        # Cleanup cached images
+                        if not self.keep_images:
+                            p._image_cache = {}
+                        # Cleanup page backends
+                        if not self.keep_backend and p._backend is not None:
+                            p._backend.unload()
                     end_batch_time = time.monotonic()
                     total_elapsed_time += end_batch_time - start_batch_time
@@ -177,10 +195,15 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                 )
                 raise e
-            finally:
-                # Always unload the PDF backend, even in case of failure
-                if conv_res.input._backend:
-                    conv_res.input._backend.unload()
+        return conv_res
+    def _unload(self, conv_res: ConversionResult) -> ConversionResult:
+        """Unload every page backend that is still set, then the input document backend."""
+        page_backends = (page._backend for page in conv_res.pages)
+        for backend in page_backends:
+            if backend is not None:
+                backend.unload()
+        input_backend = conv_res.input._backend
+        if input_backend:
+            input_backend.unload()
         return conv_res

docling/pipeline/standard_pdf_pipeline.py CHANGED Viewed

@@ -18,6 +18,11 @@ from docling.datamodel.pipeline_options import (
     TesseractOcrOptions,
 )
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
+from docling.models.document_picture_classifier import (
+    DocumentPictureClassifier,
+    DocumentPictureClassifierOptions,
+)
 from docling.models.ds_glm_model import GlmModel, GlmOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
@@ -50,7 +55,7 @@ class StandardPdfPipeline(PaginatedPipeline):
         else:
             self.artifacts_path = Path(pipeline_options.artifacts_path)
-        keep_images = (
+        self.keep_images = (
             self.pipeline_options.generate_page_images
             or self.pipeline_options.generate_picture_images
             or self.pipeline_options.generate_table_images
@@ -87,13 +92,37 @@ class StandardPdfPipeline(PaginatedPipeline):
                 accelerator_options=pipeline_options.accelerator_options,
             ),
             # Page assemble
-            PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
+            PageAssembleModel(options=PageAssembleOptions()),
         ]
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
+            # Code Formula Enrichment Model
+            CodeFormulaModel(
+                enabled=pipeline_options.do_code_enrichment
+                or pipeline_options.do_formula_enrichment,
+                artifacts_path=pipeline_options.artifacts_path,
+                options=CodeFormulaModelOptions(
+                    do_code_enrichment=pipeline_options.do_code_enrichment,
+                    do_formula_enrichment=pipeline_options.do_formula_enrichment,
+                ),
+                accelerator_options=pipeline_options.accelerator_options,
+            ),
+            # Document Picture Classifier
+            DocumentPictureClassifier(
+                enabled=pipeline_options.do_picture_classification,
+                artifacts_path=pipeline_options.artifacts_path,
+                options=DocumentPictureClassifierOptions(),
+                accelerator_options=pipeline_options.accelerator_options,
+            ),
         ]
+        if (
+            self.pipeline_options.do_formula_enrichment
+            or self.pipeline_options.do_code_enrichment
+        ):
+            self.keep_backend = True
     @staticmethod
     def download_models_hf(
         local_dir: Optional[Path] = None, force: bool = False

docling/utils/glm_utils.py CHANGED Viewed

@@ -270,7 +270,6 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
             container_el = doc.add_group(label=group_label)
             _add_child_elements(container_el, doc, obj, pelem)
         elif "text" in obj:
             text = obj["text"][span_i:span_j]
@@ -304,6 +303,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
                 current_list = None
                 doc.add_heading(text=text, prov=prov)
+            elif label == DocItemLabel.CODE:
+                current_list = None
+                doc.add_code(text=text, prov=prov)
             else:
                 current_list = None

docling/utils/ocr_utils.py ADDED Viewed

@@ -0,0 +1,9 @@
+def map_tesseract_script(script: str) -> str:
+    r"""
+    Map a detected script name to the name used when building the tesseract language
+    "Katakana" and "Hiragana" become "Japanese", "Han" becomes "HanS" and
+    "Korean" becomes "Hangul"; any other name is returned unchanged.
+    """
+    aliases = {
+        "Katakana": "Japanese",
+        "Hiragana": "Japanese",
+        "Han": "HanS",
+        "Korean": "Hangul",
+    }
+    return aliases.get(script, script)

docling/utils/visualization.py ADDED Viewed

@@ -0,0 +1,80 @@
+from docling_core.types.doc import DocItemLabel
+from PIL import Image, ImageDraw, ImageFont
+from PIL.ImageFont import FreeTypeFont
+from docling.datamodel.base_models import Cluster
+def draw_clusters(
+    image: Image.Image, clusters: list[Cluster], scale_x: float, scale_y: float
+) -> None:
+    """
+    Draw clusters on an image
+    Paints, in place on ``image``, a translucent rectangle for every cell and
+    every cluster (each top-level cluster followed by its children), plus a
+    text tag with the cluster label and confidence.
+    :param image: image to draw on; it is modified in place.
+    :param clusters: top-level clusters; each is drawn together with its ``children``.
+    :param scale_x: factor applied to horizontal bounding-box coordinates.
+    :param scale_y: factor applied to vertical bounding-box coordinates.
+    """
+    draw = ImageDraw.Draw(image, "RGBA")
+    # Create a smaller font for the labels
+    font: ImageFont.ImageFont | FreeTypeFont
+    try:
+        font = ImageFont.truetype("arial.ttf", 12)
+    except OSError:
+        # Fallback to default font if arial is not available
+        font = ImageFont.load_default()
+    cell_color = (0, 0, 0, 40)  # Transparent black for cells
+    text_bg_padding = 2
+    for c_tl in clusters:
+        all_clusters = [c_tl, *c_tl.children]
+        for c in all_clusters:
+            # Draw cells first (underneath)
+            for tc in c.cells:
+                cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
+                cx0 *= scale_x
+                cx1 *= scale_x
+                # Both vertical coordinates use scale_y; the top edge was
+                # previously scaled with scale_x.
+                cy0 *= scale_y
+                cy1 *= scale_y
+                draw.rectangle(
+                    [(cx0, cy0), (cx1, cy1)],
+                    outline=None,
+                    fill=cell_color,
+                )
+            # Draw cluster rectangle
+            x0, y0, x1, y1 = c.bbox.as_tuple()
+            x0 *= scale_x
+            x1 *= scale_x
+            # Same fix as for the cells: vertical coordinates use scale_y.
+            y0 *= scale_y
+            y1 *= scale_y
+            label_color = list(DocItemLabel.get_color(c.label))
+            cluster_fill_color = (*label_color, 70)
+            cluster_outline_color = (*label_color, 255)
+            draw.rectangle(
+                [(x0, y0), (x1, y1)],
+                outline=cluster_outline_color,
+                fill=cluster_fill_color,
+            )
+            # Add label name and confidence
+            label_text = f"{c.label.name} ({c.confidence:.2f})"
+            # Create semi-transparent background for text
+            text_bbox = draw.textbbox((x0, y0), label_text, font=font)
+            draw.rectangle(
+                [
+                    (
+                        text_bbox[0] - text_bg_padding,
+                        text_bbox[1] - text_bg_padding,
+                    ),
+                    (
+                        text_bbox[2] + text_bg_padding,
+                        text_bbox[3] + text_bg_padding,
+                    ),
+                ],
+                fill=(255, 255, 255, 180),  # Semi-transparent white
+            )
+            # Draw text
+            draw.text(
+                (x0, y0),
+                label_text,
+                fill=(0, 0, 0, 255),  # Solid black
+                font=font,
+            )

{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.15.1
+Version: 2.17.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -26,9 +26,9 @@ Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
-Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
-Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
+Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
+Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
+Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,6 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
 Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
 Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
+Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
@@ -77,22 +78,21 @@ Description-Content-Type: text/markdown
 [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
 [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
-Docling parses documents and exports them to the desired format with ease and speed.
+Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
 ## Features
-* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
-* 📑 Advanced PDF document understanding including page layout, reading order & table structures
-* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
-* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
-* 🔍 OCR support for scanned PDFs
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
+* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
+* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
+* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
+* 🔒 Local execution capabilities for sensitive data and air-gapped environments
+* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
+* 🔍 Extensive OCR support for scanned PDFs and images
 * 💻 Simple and convenient CLI
-Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling!
 ### Coming soon
-* ♾️ Equation & code extraction
 * 📝 Metadata extraction, including title, authors, references & language
 ## Installation
@@ -176,3 +176,7 @@ For individual model usage, please refer to the model licenses found in the orig
 Docling has been brought to you by IBM.
+[supported_formats]: https://ds4sd.github.io/docling/supported_formats/
+[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
+[integrations]: https://ds4sd.github.io/docling/integrations/

docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl

docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl