PyPI - docling - Versions diffs - 2.4.1__tar.gz → 2.5.0__tar.gz - Mend

docling 2.4.1.tar.gz → 2.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)

{docling-2.4.1 → docling-2.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.4.1
+Version: 2.5.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT

{docling-2.4.1 → docling-2.5.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -120,6 +120,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.handle_header(element, idx, doc)
         elif element.name in ["p"]:
             self.handle_paragraph(element, idx, doc)
+        elif element.name in ["pre"]:
+            self.handle_code(element, idx, doc)
         elif element.name in ["ul", "ol"]:
             self.handle_list(element, idx, doc)
         elif element.name in ["li"]:
@@ -205,6 +207,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 level=hlevel,
             )
+    def handle_code(self, element, idx, doc):
+        """Handles monospace code snippets (pre)."""
+        if element.text is None:
+            return
+        text = element.text.strip()
+        label = DocItemLabel.CODE
+        if len(text) == 0:
+            return
+        doc.add_text(parent=self.parents[self.level], label=label, text=text)
     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""
         if element.text is None:

{docling-2.4.1 → docling-2.5.0}/docling/backend/mspowerpoint_backend.py RENAMED Viewed

@@ -358,41 +358,36 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
             size = Size(width=slide_width, height=slide_height)
             parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
-            # parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
-            # Loop through each shape in the slide
-            for shape in slide.shapes:
+            def handle_shapes(shape, parent_slide, slide_ind, doc):
+                handle_groups(shape, parent_slide, slide_ind, doc)
                 if shape.has_table:
                     # Handle Tables
                     self.handle_tables(shape, parent_slide, slide_ind, doc)
                 if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
-                    # Handle Tables
+                    # Handle Pictures
                     self.handle_pictures(shape, parent_slide, slide_ind, doc)
                 # If shape doesn't have any text, move on to the next shape
                 if not hasattr(shape, "text"):
-                    continue
+                    return
                 if shape.text is None:
-                    continue
+                    return
                 if len(shape.text.strip()) == 0:
-                    continue
+                    return
                 if not shape.has_text_frame:
-                    _log.warn("Warning: shape has text but not text_frame")
-                    continue
-                # if shape.is_placeholder:
-                # Handle Titles (Headers) and Subtitles
-                # Check if the shape is a placeholder (titles are placeholders)
-                # self.handle_title(shape, parent_slide, slide_ind, doc)
-                # self.handle_text_elements(shape, parent_slide, slide_ind, doc)
-                # else:
+                    _log.warning("Warning: shape has text but not text_frame")
+                    return
                 # Handle other text elements, including lists (bullet lists, numbered lists)
                 self.handle_text_elements(shape, parent_slide, slide_ind, doc)
+                return
+            def handle_groups(shape, parent_slide, slide_ind, doc):
+                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+                    for groupedshape in shape.shapes:
+                        handle_shapes(groupedshape, parent_slide, slide_ind, doc)
-                # figures...
-                # doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
+            # Loop through each shape in the slide
+            for shape in slide.shapes:
+                handle_shapes(shape, parent_slide, slide_ind, doc)
         return doc

{docling-2.4.1 → docling-2.5.0}/docling/cli/main.py RENAMED Viewed

@@ -153,6 +153,13 @@ def convert(
             ..., help="If enabled, the bitmap content will be processed using OCR."
         ),
     ] = True,
+    force_ocr: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="Replace any existing text with OCR generated text over the full content.",
+        ),
+    ] = False,
     ocr_engine: Annotated[
         OcrEngine, typer.Option(..., help="The OCR engine to use.")
     ] = OcrEngine.EASYOCR,
@@ -219,11 +226,11 @@ def convert(
     match ocr_engine:
         case OcrEngine.EASYOCR:
-            ocr_options: OcrOptions = EasyOcrOptions()
+            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
         case OcrEngine.TESSERACT_CLI:
-            ocr_options = TesseractCliOcrOptions()
+            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
         case OcrEngine.TESSERACT:
-            ocr_options = TesseractOcrOptions()
+            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
         case _:
             raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

{docling-2.4.1 → docling-2.5.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
 class OcrOptions(BaseModel):
     kind: str
+    force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
     bitmap_area_threshold: float = (
         0.05  # percentage of the area for a bitmap to processed with OCR
     )

{docling-2.4.1 → docling-2.5.0}/docling/datamodel/settings.py RENAMED Viewed

@@ -2,7 +2,7 @@ import sys
 from pathlib import Path
 from pydantic import BaseModel
-from pydantic_settings import BaseSettings
+from pydantic_settings import BaseSettings, SettingsConfigDict
 class DocumentLimits(BaseModel):
@@ -40,6 +40,8 @@ class DebugSettings(BaseModel):
 class AppSettings(BaseSettings):
+    model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
     perf: BatchConcurrencySettings
     debug: DebugSettings

{docling-2.4.1 → docling-2.5.0}/docling/models/base_ocr_model.py RENAMED Viewed

@@ -10,7 +10,7 @@ from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import find_objects, label
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrOptions
 from docling.datamodel.settings import settings
@@ -73,7 +73,9 @@ class BaseOcrModel(BasePageModel):
         coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
         # return full-page rectangle if sufficiently covered with bitmaps
-        if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
+        if self.options.force_full_page_ocr or coverage > max(
+            BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
+        ):
             return [
                 BoundingBox(
                     l=0,
@@ -96,7 +98,7 @@ class BaseOcrModel(BasePageModel):
             return ocr_rects
     # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
-    def filter_ocr_cells(self, ocr_cells, programmatic_cells):
+    def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
         # Create R-tree index for programmatic cells
         p = index.Property()
         p.dimension = 2
@@ -117,6 +119,23 @@ class BaseOcrModel(BasePageModel):
         ]
         return filtered_ocr_cells
+    def post_process_cells(self, ocr_cells, programmatic_cells):
+        r"""
+        Post-process the ocr and programmatic cells and return the final list of of cells
+        """
+        if self.options.force_full_page_ocr:
+            # If a full page OCR is forced, use only the OCR cells
+            cells = [
+                Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
+                for c_ocr in ocr_cells
+            ]
+            return cells
+        ## Remove OCR cells which overlap with programmatic cells.
+        filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
+        programmatic_cells.extend(filtered_ocr_cells)
+        return programmatic_cells
     def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
         image = copy.deepcopy(page.image)
         draw = ImageDraw.Draw(image, "RGBA")

{docling-2.4.1 → docling-2.5.0}/docling/models/easyocr_model.py RENAMED Viewed

@@ -2,9 +2,10 @@ import logging
 from typing import Iterable
 import numpy
+import torch
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import EasyOcrOptions
 from docling.datamodel.settings import settings
@@ -32,6 +33,7 @@ class EasyOcrModel(BaseOcrModel):
             self.reader = easyocr.Reader(
                 lang_list=self.options.lang,
+                gpu=self.options.use_gpu,
                 model_storage_directory=self.options.model_storage_directory,
                 download_enabled=self.options.download_enabled,
             )
@@ -86,12 +88,8 @@ class EasyOcrModel(BaseOcrModel):
                         ]
                         all_ocr_cells.extend(cells)
-                    ## Remove OCR cells which overlap with programmatic cells.
-                    filtered_ocr_cells = self.filter_ocr_cells(
-                        all_ocr_cells, page.cells
-                    )
-                    page.cells.extend(filtered_ocr_cells)
+                    # Post-process the cells
+                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
                 # DEBUG code:
                 if settings.debug.visualize_ocr:

{docling-2.4.1 → docling-2.5.0}/docling/models/tesseract_ocr_cli_model.py RENAMED Viewed

@@ -7,7 +7,7 @@ from typing import Iterable, Optional, Tuple
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.datamodel.settings import settings
@@ -170,12 +170,8 @@ class TesseractOcrCliModel(BaseOcrModel):
                             )
                             all_ocr_cells.append(cell)
-                    ## Remove OCR cells which overlap with programmatic cells.
-                    filtered_ocr_cells = self.filter_ocr_cells(
-                        all_ocr_cells, page.cells
-                    )
-                    page.cells.extend(filtered_ocr_cells)
+                    # Post-process the cells
+                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
                 # DEBUG code:
                 if settings.debug.visualize_ocr:

{docling-2.4.1 → docling-2.5.0}/docling/models/tesseract_ocr_model.py RENAMED Viewed

@@ -3,7 +3,7 @@ from typing import Iterable
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.datamodel.settings import settings
@@ -140,12 +140,8 @@ class TesseractOcrModel(BaseOcrModel):
                         # del high_res_image
                         all_ocr_cells.extend(cells)
-                    ## Remove OCR cells which overlap with programmatic cells.
-                    filtered_ocr_cells = self.filter_ocr_cells(
-                        all_ocr_cells, page.cells
-                    )
-                    page.cells.extend(filtered_ocr_cells)
+                    # Post-process the cells
+                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
                 # DEBUG code:
                 if settings.debug.visualize_ocr:

{docling-2.4.1 → docling-2.5.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.4.1"  # DO NOT EDIT, updated automatically
+version = "2.5.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"