PyPI - docling - Versions diffs - 2.33.0__tar.gz → 2.34.0__tar.gz - Mend

docling-2.33.0.tar.gz → docling-2.34.0.tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (87)

{docling-2.33.0 → docling-2.34.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.33.0
+Version: 2.34.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/docling-project/docling
 License: MIT

{docling-2.33.0 → docling-2.34.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
                 coord_origin=CoordOrigin.BOTTOMLEFT,
             ).to_top_left_origin(page_height=page_size.height * scale)
-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
             if overlap_frac > 0.5:
                 if len(text_piece) > 0:

{docling-2.33.0 → docling-2.34.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

@@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                 coord_origin=CoordOrigin.BOTTOMLEFT,
             ).to_top_left_origin(page_height=page_size.height * scale)
-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
             if overlap_frac > 0.5:
                 if len(text_piece) > 0:

{docling-2.33.0 → docling-2.34.0}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

@@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
                 .scaled(scale)
             )
-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
             if overlap_frac > 0.5:
                 if len(text_piece) > 0:

{docling-2.33.0 → docling-2.34.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -1,6 +1,9 @@
+import math
+from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
+import numpy as np
 from docling_core.types.doc import (
     BoundingBox,
     DocItemLabel,
@@ -16,7 +19,7 @@ from docling_core.types.io import (
     DocumentStream,
 )
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field, computed_field
 if TYPE_CHECKING:
     from docling.backend.pdf_backend import PdfPageBackend
@@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
     choices: List[OpenAiResponseChoice]
     created: int
     usage: OpenAiResponseUsage
+# Create a type alias for score values
+ScoreValue = float
+class QualityGrade(str, Enum):
+    POOR = "poor"
+    FAIR = "fair"
+    GOOD = "good"
+    EXCELLENT = "excellent"
+    UNSPECIFIED = "unspecified"
+class PageConfidenceScores(BaseModel):
+    parse_score: ScoreValue = np.nan
+    layout_score: ScoreValue = np.nan
+    table_score: ScoreValue = np.nan
+    ocr_score: ScoreValue = np.nan
+    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
+        if score < 0.5:
+            return QualityGrade.POOR
+        elif score < 0.8:
+            return QualityGrade.FAIR
+        elif score < 0.9:
+            return QualityGrade.GOOD
+        elif score >= 0.9:
+            return QualityGrade.EXCELLENT
+        return QualityGrade.UNSPECIFIED
+    @computed_field  # type: ignore
+    @property
+    def mean_grade(self) -> QualityGrade:
+        return self._score_to_grade(self.mean_score)
+    @computed_field  # type: ignore
+    @property
+    def low_grade(self) -> QualityGrade:
+        return self._score_to_grade(self.low_score)
+    @computed_field  # type: ignore
+    @property
+    def mean_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [
+                    self.ocr_score,
+                    self.table_score,
+                    self.layout_score,
+                    self.parse_score,
+                ]
+            )
+        )
+    @computed_field  # type: ignore
+    @property
+    def low_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanquantile(
+                [
+                    self.ocr_score,
+                    self.table_score,
+                    self.layout_score,
+                    self.parse_score,
+                ],
+                q=0.05,
+            )
+        )
+class ConfidenceReport(PageConfidenceScores):
+    pages: Dict[int, PageConfidenceScores] = Field(
+        default_factory=lambda: defaultdict(PageConfidenceScores)
+    )
+    @computed_field  # type: ignore
+    @property
+    def mean_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [c.mean_score for c in self.pages.values()],
+            )
+        )
+    @computed_field  # type: ignore
+    @property
+    def low_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [c.low_score for c in self.pages.values()],
+            )
+        )

{docling-2.33.0 → docling-2.34.0}/docling/datamodel/document.py RENAMED Viewed

@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
 )
 from docling_core.utils.file import resolve_source_to_stream
 from docling_core.utils.legacy import docling_document_to_legacy
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from typing_extensions import deprecated
 from docling.backend.abstract_backend import (
@@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
 )
 from docling.datamodel.base_models import (
     AssembledUnit,
+    ConfidenceReport,
     ConversionStatus,
     DocumentStream,
     ErrorItem,
@@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
     pages: List[Page] = []
     assembled: AssembledUnit = AssembledUnit()
     timings: Dict[str, ProfilingItem] = {}
+    confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
     document: DoclingDocument = _EMPTY_DOCLING_DOC

{docling-2.33.0 → docling-2.34.0}/docling/models/layout_model.py RENAMED Viewed

@@ -5,6 +5,7 @@ from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
+import numpy as np
 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import Image
@@ -184,6 +185,14 @@ class LayoutModel(BasePageModel):
                     ).postprocess()
                     # processed_clusters, processed_cells = clusters, page.cells
+                    conv_res.confidence.pages[page.page_no].layout_score = float(
+                        np.mean([c.confidence for c in processed_clusters])
+                    )
+                    conv_res.confidence.pages[page.page_no].ocr_score = float(
+                        np.mean([c.confidence for c in processed_cells if c.from_ocr])
+                    )
                     page.cells = processed_cells
                     page.predictions.layout = LayoutPrediction(
                         clusters=processed_clusters

{docling-2.33.0 → docling-2.34.0}/docling/models/page_assemble_model.py RENAMED Viewed

@@ -3,6 +3,7 @@ import re
 from collections.abc import Iterable
 from typing import List
+import numpy as np
 from pydantic import BaseModel
 from docling.datamodel.base_models import (

{docling-2.33.0 → docling-2.34.0}/docling/models/page_preprocessing_model.py RENAMED Viewed

@@ -1,11 +1,13 @@
+import re
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
+import numpy as np
 from PIL import ImageDraw
 from pydantic import BaseModel
-from docling.datamodel.base_models import Page
+from docling.datamodel.base_models import Page, ScoreValue
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
@@ -21,6 +23,14 @@ class PagePreprocessingModel(BasePageModel):
     def __init__(self, options: PagePreprocessingOptions):
         self.options = options
+        # Pre-compiled regex patterns for efficiency
+        self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
+        self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
+        self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
+        self.SLASH_NUMBER_GARBAGE_RE = re.compile(
+            r"(?:/\w+\s*){2,}"
+        )  # Two or more "/token " sequences
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -60,6 +70,18 @@ class PagePreprocessingModel(BasePageModel):
         if self.options.create_parsed_page:
             page.parsed_page = page._backend.get_segmented_page()
+        # Rate the text quality from the PDF parser, and aggregate on page
+        text_scores = []
+        for c in page.cells:
+            score = self.rate_text_quality(c.text)
+            text_scores.append(score)
+        conv_res.confidence.pages[page.page_no].parse_score = float(
+            np.nanquantile(
+                text_scores, q=0.10
+            )  # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
+        )
         # DEBUG code:
         def draw_text_boxes(image, cells, show: bool = False):
             draw = ImageDraw.Draw(image)
@@ -88,3 +110,30 @@ class PagePreprocessingModel(BasePageModel):
             draw_text_boxes(page.get_image(scale=1.0), page.cells)
         return page
+    def rate_text_quality(self, text: str) -> float:
+        # Hard errors: if any of these patterns are found, return 0.0 immediately.
+        blacklist_chars = ["�"]
+        if (
+            any(text.find(c) >= 0 for c in blacklist_chars)
+            or self.GLYPH_RE.search(text)
+            or self.SLASH_G_RE.search(text)
+            or self.SLASH_NUMBER_GARBAGE_RE.match(
+                text
+            )  # Check if text is mostly slash-number pattern
+        ):
+            return 0.0
+        penalty = 0.0
+        # Apply a penalty only if the fragmented words pattern occurs at least three times.
+        frag_matches = self.FRAG_RE.findall(text)
+        if len(frag_matches) >= 3:
+            penalty += 0.1 * len(frag_matches)
+        # Additional heuristic: if the average token length is below 2, add a penalty.
+        # tokens = text.split()
+        # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
+        #    penalty += 0.2
+        return max(1.0 - penalty, 0.0)

{docling-2.33.0 → docling-2.34.0}/docling/models/tesseract_ocr_cli_model.py RENAMED Viewed

@@ -2,6 +2,7 @@ import csv
 import io
 import logging
 import os
+import subprocess
 import tempfile
 from collections.abc import Iterable
 from pathlib import Path
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling_core.types.doc.page import TextCell
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.ocr_utils import map_tesseract_script
+from docling.utils.ocr_utils import (
+    map_tesseract_script,
+    parse_tesseract_orientation,
+    tesseract_box_to_bounding_rectangle,
+)
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         self._version: Optional[str] = None
         self._tesseract_languages: Optional[List[str]] = None
         self._script_prefix: Optional[str] = None
+        self._is_auto: bool = "auto" in self.options.lang
         if self.enabled:
             try:
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
         return name, version
-    def _run_tesseract(self, ifilename: str):
+    def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
         r"""
         Run tesseract CLI
         """
         cmd = [self.options.tesseract_cmd]
-        if "auto" in self.options.lang:
-            lang = self._detect_language(ifilename)
+        if self._is_auto:
+            lang = self._parse_language(osd)
             if lang is not None:
                 cmd.append("-l")
                 cmd.append(lang)
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
         cmd += [ifilename, "stdout", "tsv"]
         _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
+        output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
         # _log.info(output)
         # Decode the byte string to a regular string
-        decoded_data = output.decode("utf-8")
+        decoded_data = output.stdout.decode("utf-8")
         # _log.info(decoded_data)
         # Read the TSV file generated by Tesseract
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
         return df_filtered
-    def _detect_language(self, ifilename: str):
+    def _perform_osd(self, ifilename: str) -> pd.DataFrame:
         r"""
         Run tesseract in PSM 0 mode to detect the language
         """
-        assert self._tesseract_languages is not None
         cmd = [self.options.tesseract_cmd]
         cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
         _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
-        decoded_data = output.decode("utf-8")
+        output = subprocess.run(cmd, capture_output=True, check=True)
+        decoded_data = output.stdout.decode("utf-8")
         df_detected = pd.read_csv(
             io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
         )
-        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
+        return df_detected
+    def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
+        assert self._tesseract_languages is not None
+        scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
         if len(scripts) == 0:
             _log.warning("Tesseract cannot detect the script of the page")
             return None
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
         cmd = [self.options.tesseract_cmd]
         cmd.append("--list-langs")
         _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
-        decoded_data = output.decode("utf-8")
+        output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
+        decoded_data = output.stdout.decode("utf-8")
         df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
         self._tesseract_languages = df_list[0].tolist()[1:]
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
             yield from page_batch
             return
-        for page in page_batch:
+        for page_i, page in enumerate(page_batch):
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                     ocr_rects = self.get_ocr_rects(page)
                     all_ocr_cells = []
-                    for ocr_rect in ocr_rects:
+                    for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
                         # Skip zero area boxes
                         if ocr_rect.area() == 0:
                             continue
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
                             ) as image_file:
                                 fname = image_file.name
                                 high_res_image.save(image_file)
-                            df_result = self._run_tesseract(fname)
+                            doc_orientation = 0
+                            try:
+                                df_osd = self._perform_osd(fname)
+                                doc_orientation = _parse_orientation(df_osd)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "OSD failed (doc %s, page: %s, "
+                                    "OCR rectangle: %s, processed image file %s):\n %s",
+                                    conv_res.input.file,
+                                    page_i,
+                                    ocr_rect_i,
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                # Skipping if OSD fail when in auto mode, otherwise proceed
+                                # to OCR in the hope OCR will succeed while OSD failed
+                                if self._is_auto:
+                                    continue
+                            if doc_orientation != 0:
+                                high_res_image = high_res_image.rotate(
+                                    -doc_orientation, expand=True
+                                )
+                                high_res_image.save(fname)
+                            try:
+                                df_result = self._run_tesseract(fname, df_osd)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "tesseract OCR failed (doc %s, page: %s, "
+                                    "OCR rectangle: %s, processed image file %s):\n %s",
+                                    conv_res.input.file,
+                                    page_i,
+                                    ocr_rect_i,
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                continue
                         finally:
                             if os.path.exists(fname):
                                 os.remove(fname)
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
                             text = row["text"]
                             conf = row["conf"]
-                            l = float(row["left"])  # noqa: E741
-                            b = float(row["top"])
-                            w = float(row["width"])
-                            h = float(row["height"])
-                            t = b + h
-                            r = l + w
+                            left, top = float(row["left"]), float(row["top"])
+                            right = left + float(row["width"])
+                            bottom = top + row["height"]
+                            bbox = BoundingBox(
+                                l=left,
+                                t=top,
+                                r=right,
+                                b=bottom,
+                                coord_origin=CoordOrigin.TOPLEFT,
+                            )
+                            rect = tesseract_box_to_bounding_rectangle(
+                                bbox,
+                                original_offset=ocr_rect,
+                                scale=self.scale,
+                                orientation=doc_orientation,
+                                im_size=high_res_image.size,
+                            )
                             cell = TextCell(
                                 index=ix,
                                 text=str(text),
                                 orig=str(text),
                                 from_ocr=True,
                                 confidence=conf / 100.0,
-                                rect=BoundingRectangle.from_bounding_box(
-                                    BoundingBox.from_tuple(
-                                        coord=(
-                                            (l / self.scale) + ocr_rect.l,
-                                            (b / self.scale) + ocr_rect.t,
-                                            (r / self.scale) + ocr_rect.l,
-                                            (t / self.scale) + ocr_rect.t,
-                                        ),
-                                        origin=CoordOrigin.TOPLEFT,
-                                    )
-                                ),
+                                rect=rect,
                             )
                             all_ocr_cells.append(cell)
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
     @classmethod
     def get_options_type(cls) -> Type[OcrOptions]:
         return TesseractCliOcrOptions
+def _parse_orientation(df_osd: pd.DataFrame) -> int:
+    orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
+    orientation = parse_tesseract_orientation(orientations[0].strip())
+    return orientation

{docling-2.33.0 → docling-2.34.0}/docling/models/tesseract_ocr_model.py RENAMED Viewed

@@ -1,12 +1,11 @@
 from __future__ import annotations
 import logging
-from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional, Type
+from typing import Iterable, Optional, Type
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling_core.types.doc.page import TextCell
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
@@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.ocr_utils import map_tesseract_script
+from docling.utils.ocr_utils import (
+    map_tesseract_script,
+    parse_tesseract_orientation,
+    tesseract_box_to_bounding_rectangle,
+)
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -38,7 +41,7 @@ class TesseractOcrModel(BaseOcrModel):
             accelerator_options=accelerator_options,
         )
         self.options: TesseractOcrOptions
+        self._is_auto: bool = "auto" in self.options.lang
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
         self.reader = None
         self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
@@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
             if lang == "auto":
                 self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
-                self.osd_reader = tesserocr.PyTessBaseAPI(
-                    **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
-                )
             else:
                 self.reader = tesserocr.PyTessBaseAPI(
                     **{"lang": lang} | tesserocr_kwargs,
                 )
+            self.osd_reader = tesserocr.PyTessBaseAPI(
+                **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
+            )
             self.reader_RIL = tesserocr.RIL
     def __del__(self):
@@ -118,19 +121,20 @@ class TesseractOcrModel(BaseOcrModel):
             yield from page_batch
             return
-        for page in page_batch:
+        for page_i, page in enumerate(page_batch):
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
                     assert self.reader is not None
+                    assert self.osd_reader is not None
                     assert self._tesserocr_languages is not None
                     ocr_rects = self.get_ocr_rects(page)
                     all_ocr_cells = []
-                    for ocr_rect in ocr_rects:
+                    for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
                         # Skip zero area boxes
                         if ocr_rect.area() == 0:
                             continue
@@ -139,16 +143,27 @@ class TesseractOcrModel(BaseOcrModel):
                         )
                         local_reader = self.reader
-                        if "auto" in self.options.lang:
-                            assert self.osd_reader is not None
-                            self.osd_reader.SetImage(high_res_image)
-                            osd = self.osd_reader.DetectOrientationScript()
-                            # No text, probably
-                            if osd is None:
+                        self.osd_reader.SetImage(high_res_image)
+                        osd = self.osd_reader.DetectOrientationScript()
+                        # No text, or Orientation and Script detection failure
+                        if osd is None:
+                            _log.error(
+                                "OSD failed for doc (doc %s, page: %s, "
+                                "OCR rectangle: %s)",
+                                conv_res.input.file,
+                                page_i,
+                                ocr_rect_i,
+                            )
+                            # Skipping if OSD fail when in auto mode, otherwise proceed
+                            # to OCR in the hope OCR will succeed while OSD failed
+                            if self._is_auto:
                                 continue
+                        doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
+                        if doc_orientation != 0:
+                            high_res_image = high_res_image.rotate(
+                                -doc_orientation, expand=True
+                            )
+                        if self._is_auto:
                             script = osd["script_name"]
                             script = map_tesseract_script(script)
                             lang = f"{self.script_prefix}{script}"
@@ -188,11 +203,23 @@ class TesseractOcrModel(BaseOcrModel):
                             # Extract text within the bounding box
                             text = local_reader.GetUTF8Text().strip()
                             confidence = local_reader.MeanTextConf()
-                            left = box["x"] / self.scale
-                            bottom = box["y"] / self.scale
-                            right = (box["x"] + box["w"]) / self.scale
-                            top = (box["y"] + box["h"]) / self.scale
+                            left, top = box["x"], box["y"]
+                            right = left + box["w"]
+                            bottom = top + box["h"]
+                            bbox = BoundingBox(
+                                l=left,
+                                t=top,
+                                r=right,
+                                b=bottom,
+                                coord_origin=CoordOrigin.TOPLEFT,
+                            )
+                            rect = tesseract_box_to_bounding_rectangle(
+                                bbox,
+                                original_offset=ocr_rect,
+                                scale=self.scale,
+                                orientation=doc_orientation,
+                                im_size=high_res_image.size,
+                            )
                             cells.append(
                                 TextCell(
                                     index=ix,
@@ -200,12 +227,7 @@ class TesseractOcrModel(BaseOcrModel):
                                     orig=text,
                                     from_ocr=True,
                                     confidence=confidence,
-                                    rect=BoundingRectangle.from_bounding_box(
-                                        BoundingBox.from_tuple(
-                                            coord=(left, top, right, bottom),
-                                            origin=CoordOrigin.TOPLEFT,
-                                        ),
-                                    ),
+                                    rect=rect,
                                 )
                             )

{docling-2.33.0 → docling-2.34.0}/docling/pipeline/standard_pdf_pipeline.py RENAMED Viewed

@@ -3,11 +3,12 @@ import warnings
 from pathlib import Path
 from typing import Optional, cast
+import numpy as np
 from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import AssembledUnit, Page
+from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import settings
@@ -60,7 +61,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             or self.pipeline_options.generate_table_images
         )
-        self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
+        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
         ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
@@ -197,7 +198,7 @@ class StandardPdfPipeline(PaginatedPipeline):
                 elements=all_elements, headers=all_headers, body=all_body
             )
-            conv_res.document = self.glm_model(conv_res)
+            conv_res.document = self.reading_order_model(conv_res)
             # Generate page images in the output
             if self.pipeline_options.generate_page_images:
@@ -244,6 +245,30 @@ class StandardPdfPipeline(PaginatedPipeline):
                             cropped_im, dpi=int(72 * scale)
                         )
+            # Aggregate confidence values for document:
+            if len(conv_res.pages) > 0:
+                conv_res.confidence.layout_score = float(
+                    np.nanmean(
+                        [c.layout_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
+                conv_res.confidence.parse_score = float(
+                    np.nanquantile(
+                        [c.parse_score for c in conv_res.confidence.pages.values()],
+                        q=0.1,  # parse score should relate to worst 10% of pages.
+                    )
+                )
+                conv_res.confidence.table_score = float(
+                    np.nanmean(
+                        [c.table_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
+                conv_res.confidence.ocr_score = float(
+                    np.nanmean(
+                        [c.ocr_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
         return conv_res
     @classmethod

{docling-2.33.0 → docling-2.34.0}/docling/utils/layout_postprocessor.py RENAMED Viewed

@@ -90,17 +90,12 @@ class SpatialClusterIndex:
         containment_threshold: float,
     ) -> bool:
         """Check if two bboxes overlap sufficiently."""
-        area1, area2 = bbox1.area(), bbox2.area()
-        if area1 <= 0 or area2 <= 0:
+        if bbox1.area() <= 0 or bbox2.area() <= 0:
             return False
-        overlap_area = bbox1.intersection_area_with(bbox2)
-        if overlap_area <= 0:
-            return False
-        iou = overlap_area / (area1 + area2 - overlap_area)
-        containment1 = overlap_area / area1
-        containment2 = overlap_area / area2
+        iou = bbox1.intersection_over_union(bbox2)
+        containment1 = bbox1.intersection_over_self(bbox2)
+        containment2 = bbox2.intersection_over_self(bbox1)
         return (
             iou > overlap_threshold
@@ -321,11 +316,9 @@ class LayoutPostprocessor:
         for special in special_clusters:
             contained = []
             for cluster in self.regular_clusters:
-                overlap = cluster.bbox.intersection_area_with(special.bbox)
-                if overlap > 0:
-                    containment = overlap / cluster.bbox.area()
-                    if containment > 0.8:
-                        contained.append(cluster)
+                containment = cluster.bbox.intersection_over_self(special.bbox)
+                if containment > 0.8:
+                    contained.append(cluster)
             if contained:
                 # Sort contained clusters by minimum cell ID:
@@ -379,9 +372,7 @@ class LayoutPostprocessor:
             for regular in self.regular_clusters:
                 if regular.label == DocItemLabel.TABLE:
                     # Calculate overlap
-                    overlap = regular.bbox.intersection_area_with(wrapper.bbox)
-                    wrapper_area = wrapper.bbox.area()
-                    overlap_ratio = overlap / wrapper_area
+                    overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)
                     conf_diff = wrapper.confidence - regular.confidence
@@ -421,8 +412,7 @@ class LayoutPostprocessor:
         # Rule 2: CODE vs others
         if candidate.label == DocItemLabel.CODE:
             # Calculate how much of the other cluster is contained within the CODE cluster
-            overlap = other.bbox.intersection_area_with(candidate.bbox)
-            containment = overlap / other.bbox.area()
+            containment = other.bbox.intersection_over_self(candidate.bbox)
             if containment > 0.8:  # other is 80% contained within CODE
                 return True
@@ -586,11 +576,9 @@ class LayoutPostprocessor:
                 if cell.rect.to_bounding_box().area() <= 0:
                     continue
-                overlap = cell.rect.to_bounding_box().intersection_area_with(
+                overlap_ratio = cell.rect.to_bounding_box().intersection_over_self(
                     cluster.bbox
                 )
-                overlap_ratio = overlap / cell.rect.to_bounding_box().area()
                 if overlap_ratio > best_overlap:
                     best_overlap = overlap_ratio
                     best_cluster = cluster

docling-2.34.0/docling/utils/ocr_utils.py ADDED Viewed

@@ -0,0 +1,69 @@
+from typing import Optional, Tuple
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle
+from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
+def map_tesseract_script(script: str) -> str:
+    r""" """
+    if script == "Katakana" or script == "Hiragana":
+        script = "Japanese"
+    elif script == "Han":
+        script = "HanS"
+    elif script == "Korean":
+        script = "Hangul"
+    return script
+def parse_tesseract_orientation(orientation: str) -> int:
+    # Tesseract orientation is [0, 90, 180, 270] clockwise, bounding rectangle angles
+    # are [0, 360[ counterclockwise
+    parsed = int(orientation)
+    if parsed not in CLIPPED_ORIENTATIONS:
+        msg = (
+            f"invalid tesseract document orientation {orientation}, "
+            f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
+        )
+        raise ValueError(msg)
+    parsed = -parsed
+    parsed %= 360
+    return parsed
+def tesseract_box_to_bounding_rectangle(
+    bbox: BoundingBox,
+    *,
+    original_offset: Optional[BoundingBox] = None,
+    scale: float,
+    orientation: int,
+    im_size: Tuple[int, int],
+) -> BoundingRectangle:
+    # box is in the top, left, height, width format, top left coordinates
+    rect = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
+    rect = BoundingRectangle(
+        r_x0=rect.r_x0 / scale,
+        r_y0=rect.r_y0 / scale,
+        r_x1=rect.r_x1 / scale,
+        r_y1=rect.r_y1 / scale,
+        r_x2=rect.r_x2 / scale,
+        r_y2=rect.r_y2 / scale,
+        r_x3=rect.r_x3 / scale,
+        r_y3=rect.r_y3 / scale,
+        coord_origin=CoordOrigin.TOPLEFT,
+    )
+    if original_offset is not None:
+        if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
+            msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
+            raise ValueError(msg)
+        if original_offset is not None:
+            rect.r_x0 += original_offset.l
+            rect.r_x1 += original_offset.l
+            rect.r_x2 += original_offset.l
+            rect.r_x3 += original_offset.l
+            rect.r_y0 += original_offset.t
+            rect.r_y1 += original_offset.t
+            rect.r_y2 += original_offset.t
+            rect.r_y3 += original_offset.t
+    return rect

docling-2.34.0/docling/utils/orientation.py ADDED Viewed

@@ -0,0 +1,71 @@
+from typing import Tuple
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle
+CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
+def rotate_bounding_box(
+    bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
+) -> BoundingRectangle:
+    # The box is left top width height in TOPLEFT coordinates
+    # Bounding rectangle start with r_0 at the bottom left whatever the
+    # coordinate system. Then other corners are found rotating counterclockwise
+    bbox = bbox.to_top_left_origin(im_size[1])
+    left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
+    im_h, im_w = im_size
+    angle = angle % 360
+    if angle == 0:
+        r_x0 = left
+        r_y0 = top + height
+        r_x1 = r_x0 + width
+        r_y1 = r_y0
+        r_x2 = r_x0 + width
+        r_y2 = r_y0 - height
+        r_x3 = r_x0
+        r_y3 = r_y0 - height
+    elif angle == 90:
+        r_x0 = im_w - (top + height)
+        r_y0 = left
+        r_x1 = r_x0
+        r_y1 = r_y0 + width
+        r_x2 = r_x0 + height
+        r_y2 = r_y0 + width
+        r_x3 = r_x0
+        r_y3 = r_y0 + width
+    elif angle == 180:
+        r_x0 = im_h - left
+        r_y0 = im_w - (top + height)
+        r_x1 = r_x0 - width
+        r_y1 = r_y0
+        r_x2 = r_x0 - width
+        r_y2 = r_y0 + height
+        r_x3 = r_x0
+        r_y3 = r_y0 + height
+    elif angle == 270:
+        r_x0 = top + height
+        r_y0 = im_h - left
+        r_x1 = r_x0
+        r_y1 = r_y0 - width
+        r_x2 = r_x0 - height
+        r_y2 = r_y0 - width
+        r_x3 = r_x0 - height
+        r_y3 = r_y0
+    else:
+        msg = (
+            f"invalid orientation {angle}, expected values in:"
+            f" {sorted(CLIPPED_ORIENTATIONS)}"
+        )
+        raise ValueError(msg)
+    return BoundingRectangle(
+        r_x0=r_x0,
+        r_y0=r_y0,
+        r_x1=r_x1,
+        r_y1=r_y1,
+        r_x2=r_x2,
+        r_y2=r_y2,
+        r_x3=r_x3,
+        r_y3=r_y3,
+        coord_origin=CoordOrigin.TOPLEFT,
+    )

{docling-2.33.0 → docling-2.34.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.33.0"  # DO NOT EDIT, updated automatically
+version = "2.34.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = [
   "Christoph Auer <cau@zurich.ibm.com>",

docling-2.33.0/docling/utils/ocr_utils.py DELETED Viewed

@@ -1,9 +0,0 @@
-def map_tesseract_script(script: str) -> str:
-    r""" """
-    if script == "Katakana" or script == "Hiragana":
-        script = "Japanese"
-    elif script == "Han":
-        script = "HanS"
-    elif script == "Korean":
-        script = "Hangul"
-    return script

{docling-2.33.0 → docling-2.34.0}/LICENSE RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/README.md RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/abstract_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/asciidoc_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/csv_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/docx/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/docx/latex/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/docx/latex/latex_dict.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/docx/latex/omml.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/html_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/json/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/json/docling_json_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/md_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/msexcel_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/mspowerpoint_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/msword_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/pdf_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/xml/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/xml/jats_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/backend/xml/uspto_backend.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/chunking/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/cli/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/cli/main.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/cli/models.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/cli/tools.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/datamodel/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/datamodel/settings.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/document_converter.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/exceptions.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/api_vlm_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/base_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/base_ocr_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/code_formula_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/document_picture_classifier.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/easyocr_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/factories/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/factories/base_factory.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/factories/ocr_factory.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/factories/picture_description_factory.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/hf_mlx_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/hf_vlm_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/ocr_mac_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/picture_description_api_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/picture_description_base_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/picture_description_vlm_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/plugins/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/plugins/defaults.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/rapid_ocr_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/readingorder_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/models/table_structure_model.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/pipeline/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/pipeline/base_pipeline.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/pipeline/simple_pipeline.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/pipeline/vlm_pipeline.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/py.typed RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/utils/__init__.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/utils/accelerator_utils.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/utils/api_image_request.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/utils/export.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/utils/glm_utils.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/utils/locks.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/utils/model_downloader.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/utils/profiling.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/utils/utils.py RENAMED Viewed

File without changes

{docling-2.33.0 → docling-2.34.0}/docling/utils/visualization.py RENAMED Viewed

File without changes

docling 2.33.0__tar.gz → 2.34.0__tar.gz

docling 2.33.0__tar.gz → 2.34.0__tar.gz