PyPI - docling - Versions diffs - 2.15.0__tar.gz → 2.15.1__tar.gz - Mend

docling 2.15.0__tar.gz → 2.15.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

{docling-2.15.0 → docling-2.15.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.15.0
+Version: 2.15.1
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -45,7 +45,7 @@ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
 Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
-Requires-Dist: requests (>=2.32.3,<3.0.0)
+Requires-Dist: requests (>=2.32.2,<3.0.0)
 Requires-Dist: rtree (>=1.3.0,<2.0.0)
 Requires-Dist: scipy (>=1.6.0,<2.0.0)
 Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
@@ -84,7 +84,7 @@ Docling parses documents and exports them to the desired format with ease and sp
 * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
-* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
+* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 OCR support for scanned PDFs
 * 💻 Simple and convenient CLI
@@ -94,7 +94,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
 * ♾️ Equation & code extraction
 * 📝 Metadata extraction, including title, authors, references & language
-* 🦜🔗 Native LangChain extension
 ## Installation

{docling-2.15.0 → docling-2.15.1}/README.md RENAMED Viewed

@@ -29,7 +29,7 @@ Docling parses documents and exports them to the desired format with ease and sp
 * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
-* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
+* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 OCR support for scanned PDFs
 * 💻 Simple and convenient CLI
@@ -39,7 +39,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
 * ♾️ Equation & code extraction
 * 📝 Metadata extraction, including title, authors, references & language
-* 🦜🔗 Native LangChain extension
 ## Installation

{docling-2.15.0 → docling-2.15.1}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend):
         return cells
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32
         for i in range(len(self._dpage["images"])):
             bitmap = self._dpage["images"][i]

{docling-2.15.0 → docling-2.15.1}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

@@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
         return cells
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32
         images = self._dpage["sanitized"]["images"]["data"]
         images_header = self._dpage["sanitized"]["images"]["header"]

{docling-2.15.0 → docling-2.15.1}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -39,7 +39,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         return self.valid
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32
         for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
             pos = obj.get_pos()
             cropbox = BoundingBox.from_tuple(

{docling-2.15.0 → docling-2.15.1}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -139,7 +139,7 @@ class EasyOcrOptions(OcrOptions):
     use_gpu: Optional[bool] = None
-    confidence_threshold: float = 0.65
+    confidence_threshold: float = 0.5
     model_storage_directory: Optional[str] = None
     recog_network: Optional[str] = "standard"

{docling-2.15.0 → docling-2.15.1}/docling/models/base_ocr_model.py RENAMED Viewed

@@ -8,7 +8,7 @@ import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from PIL import Image, ImageDraw
 from rtree import index
-from scipy.ndimage import find_objects, label
+from scipy.ndimage import binary_dilation, find_objects, label
 from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
@@ -43,6 +43,12 @@ class BaseOcrModel(BasePageModel):
             np_image = np.array(image)
+            # Dilate the image by 10 pixels to merge nearby bitmap rectangles
+            structure = np.ones(
+                (20, 20)
+            )  # Create a 20x20 structure element (10 pixels in all directions)
+            np_image = binary_dilation(np_image > 0, structure=structure)
             # Find the connected components
             labeled_image, num_features = label(
                 np_image > 0
@@ -72,7 +78,7 @@ class BaseOcrModel(BasePageModel):
             bitmap_rects = []
         coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
-        # return full-page rectangle if sufficiently covered with bitmaps
+        # return full-page rectangle if page is dominantly covered with bitmaps
         if self.options.force_full_page_ocr or coverage > max(
             BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
         ):
@@ -85,17 +91,11 @@ class BaseOcrModel(BasePageModel):
                     coord_origin=CoordOrigin.TOPLEFT,
                 )
             ]
-        # return individual rectangles if the bitmap coverage is smaller
-        else:  # coverage <= BITMAP_COVERAGE_TRESHOLD:
-            # skip OCR if the bitmap area on the page is smaller than the options threshold
-            ocr_rects = [
-                rect
-                for rect in ocr_rects
-                if rect.area() / (page.size.width * page.size.height)
-                > self.options.bitmap_area_threshold
-            ]
+        # return individual rectangles if the bitmap coverage is above the threshold
+        elif coverage > self.options.bitmap_area_threshold:
             return ocr_rects
+        else:  # overall coverage of bitmaps is too low, drop all bitmap rectangles.
+            return []
     # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
     def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
@@ -162,6 +162,9 @@ class BaseOcrModel(BasePageModel):
             x0 *= scale_x
             x1 *= scale_x
+            if y1 <= y0:
+                y1, y0 = y0, y1
             color = "gray"
             if isinstance(tc, OcrCell):
                 color = "magenta"

{docling-2.15.0 → docling-2.15.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.15.0"  # DO NOT EDIT, updated automatically
+version = "2.15.1"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -34,7 +34,7 @@ filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
 pydantic-settings = "^2.3.0"
 huggingface_hub = ">=0.23,<1"
-requests = "^2.32.3"
+requests = "^2.32.2"
 easyocr = "^1.7"
 tesserocr = { version = "^2.7.1", optional = true }
 certifi = ">=2024.7.4"

{docling-2.15.0 → docling-2.15.1}/LICENSE RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/__init__.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/__init__.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/abstract_backend.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/asciidoc_backend.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/html_backend.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/md_backend.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/msexcel_backend.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/mspowerpoint_backend.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/msword_backend.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/pdf_backend.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/xml/__init__.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/xml/pubmed_backend.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/backend/xml/uspto_backend.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/chunking/__init__.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/cli/__init__.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/cli/main.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/datamodel/__init__.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/datamodel/base_models.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/datamodel/document.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/datamodel/settings.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/document_converter.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/exceptions.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/__init__.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/base_model.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/ds_glm_model.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/easyocr_model.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/layout_model.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/ocr_mac_model.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/page_assemble_model.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/page_preprocessing_model.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/rapid_ocr_model.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/table_structure_model.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/tesseract_ocr_cli_model.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/models/tesseract_ocr_model.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/pipeline/__init__.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/pipeline/base_pipeline.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/pipeline/simple_pipeline.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/pipeline/standard_pdf_pipeline.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/py.typed RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/utils/__init__.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/utils/accelerator_utils.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/utils/export.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/utils/glm_utils.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/utils/layout_postprocessor.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/utils/profiling.py RENAMED Viewed

File without changes

{docling-2.15.0 → docling-2.15.1}/docling/utils/utils.py RENAMED Viewed

File without changes

docling 2.15.0__tar.gz → 2.15.1__tar.gz

docling 2.15.0__tar.gz → 2.15.1__tar.gz