PyPI - docling - Versions diffs - 2.32.0__py3-none-any.whl → 2.34.0__py3-none-any.whl - Mend

docling 2.32.0__py3-none-any.whl → 2.34.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (22) hide show

docling/backend/docling_parse_backend.py +1 -1
docling/backend/docling_parse_v2_backend.py +1 -1
docling/backend/docling_parse_v4_backend.py +1 -1
docling/backend/msword_backend.py +269 -12
docling/backend/pypdfium2_backend.py +6 -1
docling/datamodel/base_models.py +99 -2
docling/datamodel/document.py +11 -2
docling/models/layout_model.py +9 -0
docling/models/page_assemble_model.py +1 -0
docling/models/page_preprocessing_model.py +50 -1
docling/models/tesseract_ocr_cli_model.py +85 -41
docling/models/tesseract_ocr_model.py +52 -30
docling/pipeline/standard_pdf_pipeline.py +28 -3
docling/pipeline/vlm_pipeline.py +19 -21
docling/utils/layout_postprocessor.py +10 -22
docling/utils/ocr_utils.py +60 -0
docling/utils/orientation.py +71 -0
{docling-2.32.0.dist-info → docling-2.34.0.dist-info}/METADATA +2 -2
{docling-2.32.0.dist-info → docling-2.34.0.dist-info}/RECORD +22 -21
{docling-2.32.0.dist-info → docling-2.34.0.dist-info}/LICENSE +0 -0
{docling-2.32.0.dist-info → docling-2.34.0.dist-info}/WHEEL +0 -0
{docling-2.32.0.dist-info → docling-2.34.0.dist-info}/entry_points.txt +0 -0

docling/models/tesseract_ocr_cli_model.py CHANGED Viewed

@@ -2,6 +2,7 @@ import csv
 import io
 import logging
 import os
+import subprocess
 import tempfile
 from collections.abc import Iterable
 from pathlib import Path
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling_core.types.doc.page import TextCell
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.ocr_utils import map_tesseract_script
+from docling.utils.ocr_utils import (
+    map_tesseract_script,
+    parse_tesseract_orientation,
+    tesseract_box_to_bounding_rectangle,
+)
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         self._version: Optional[str] = None
         self._tesseract_languages: Optional[List[str]] = None
         self._script_prefix: Optional[str] = None
+        self._is_auto: bool = "auto" in self.options.lang
         if self.enabled:
             try:
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
         return name, version
-    def _run_tesseract(self, ifilename: str):
+    def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
         r"""
         Run tesseract CLI
         """
         cmd = [self.options.tesseract_cmd]
-        if "auto" in self.options.lang:
-            lang = self._detect_language(ifilename)
+        if self._is_auto:
+            lang = self._parse_language(osd)
             if lang is not None:
                 cmd.append("-l")
                 cmd.append(lang)
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
         cmd += [ifilename, "stdout", "tsv"]
         _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
+        output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
         # _log.info(output)
         # Decode the byte string to a regular string
-        decoded_data = output.decode("utf-8")
+        decoded_data = output.stdout.decode("utf-8")
         # _log.info(decoded_data)
         # Read the TSV file generated by Tesseract
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
         return df_filtered
-    def _detect_language(self, ifilename: str):
+    def _perform_osd(self, ifilename: str) -> pd.DataFrame:
         r"""
         Run tesseract in PSM 0 mode to detect the language
         """
-        assert self._tesseract_languages is not None
         cmd = [self.options.tesseract_cmd]
         cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
         _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
-        decoded_data = output.decode("utf-8")
+        output = subprocess.run(cmd, capture_output=True, check=True)
+        decoded_data = output.stdout.decode("utf-8")
         df_detected = pd.read_csv(
             io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
         )
-        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
+        return df_detected
+    def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
+        assert self._tesseract_languages is not None
+        scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
         if len(scripts) == 0:
             _log.warning("Tesseract cannot detect the script of the page")
             return None
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
         cmd = [self.options.tesseract_cmd]
         cmd.append("--list-langs")
         _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
-        decoded_data = output.decode("utf-8")
+        output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
+        decoded_data = output.stdout.decode("utf-8")
         df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
         self._tesseract_languages = df_list[0].tolist()[1:]
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
             yield from page_batch
             return
-        for page in page_batch:
+        for page_i, page in enumerate(page_batch):
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                     ocr_rects = self.get_ocr_rects(page)
                     all_ocr_cells = []
-                    for ocr_rect in ocr_rects:
+                    for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
                         # Skip zero area boxes
                         if ocr_rect.area() == 0:
                             continue
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
                             ) as image_file:
                                 fname = image_file.name
                                 high_res_image.save(image_file)
-                            df_result = self._run_tesseract(fname)
+                            doc_orientation = 0
+                            try:
+                                df_osd = self._perform_osd(fname)
+                                doc_orientation = _parse_orientation(df_osd)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "OSD failed (doc %s, page: %s, "
+                                    "OCR rectangle: %s, processed image file %s):\n %s",
+                                    conv_res.input.file,
+                                    page_i,
+                                    ocr_rect_i,
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                # Skipping if OSD fail when in auto mode, otherwise proceed
+                                # to OCR in the hope OCR will succeed while OSD failed
+                                if self._is_auto:
+                                    continue
+                            if doc_orientation != 0:
+                                high_res_image = high_res_image.rotate(
+                                    -doc_orientation, expand=True
+                                )
+                                high_res_image.save(fname)
+                            try:
+                                df_result = self._run_tesseract(fname, df_osd)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "tesseract OCR failed (doc %s, page: %s, "
+                                    "OCR rectangle: %s, processed image file %s):\n %s",
+                                    conv_res.input.file,
+                                    page_i,
+                                    ocr_rect_i,
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                continue
                         finally:
                             if os.path.exists(fname):
                                 os.remove(fname)
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
                             text = row["text"]
                             conf = row["conf"]
-                            l = float(row["left"])  # noqa: E741
-                            b = float(row["top"])
-                            w = float(row["width"])
-                            h = float(row["height"])
-                            t = b + h
-                            r = l + w
+                            left, top = float(row["left"]), float(row["top"])
+                            right = left + float(row["width"])
+                            bottom = top + row["height"]
+                            bbox = BoundingBox(
+                                l=left,
+                                t=top,
+                                r=right,
+                                b=bottom,
+                                coord_origin=CoordOrigin.TOPLEFT,
+                            )
+                            rect = tesseract_box_to_bounding_rectangle(
+                                bbox,
+                                original_offset=ocr_rect,
+                                scale=self.scale,
+                                orientation=doc_orientation,
+                                im_size=high_res_image.size,
+                            )
                             cell = TextCell(
                                 index=ix,
                                 text=str(text),
                                 orig=str(text),
                                 from_ocr=True,
                                 confidence=conf / 100.0,
-                                rect=BoundingRectangle.from_bounding_box(
-                                    BoundingBox.from_tuple(
-                                        coord=(
-                                            (l / self.scale) + ocr_rect.l,
-                                            (b / self.scale) + ocr_rect.t,
-                                            (r / self.scale) + ocr_rect.l,
-                                            (t / self.scale) + ocr_rect.t,
-                                        ),
-                                        origin=CoordOrigin.TOPLEFT,
-                                    )
-                                ),
+                                rect=rect,
                             )
                             all_ocr_cells.append(cell)
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
     @classmethod
     def get_options_type(cls) -> Type[OcrOptions]:
         return TesseractCliOcrOptions
+def _parse_orientation(df_osd: pd.DataFrame) -> int:
+    orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
+    orientation = parse_tesseract_orientation(orientations[0].strip())
+    return orientation

docling/models/tesseract_ocr_model.py CHANGED Viewed

@@ -1,12 +1,11 @@
 from __future__ import annotations
 import logging
-from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional, Type
+from typing import Iterable, Optional, Type
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling_core.types.doc.page import TextCell
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
@@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.ocr_utils import map_tesseract_script
+from docling.utils.ocr_utils import (
+    map_tesseract_script,
+    parse_tesseract_orientation,
+    tesseract_box_to_bounding_rectangle,
+)
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -38,7 +41,7 @@ class TesseractOcrModel(BaseOcrModel):
             accelerator_options=accelerator_options,
         )
         self.options: TesseractOcrOptions
+        self._is_auto: bool = "auto" in self.options.lang
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
         self.reader = None
         self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
@@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
             if lang == "auto":
                 self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
-                self.osd_reader = tesserocr.PyTessBaseAPI(
-                    **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
-                )
             else:
                 self.reader = tesserocr.PyTessBaseAPI(
                     **{"lang": lang} | tesserocr_kwargs,
                 )
+            self.osd_reader = tesserocr.PyTessBaseAPI(
+                **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
+            )
             self.reader_RIL = tesserocr.RIL
     def __del__(self):
@@ -118,19 +121,20 @@ class TesseractOcrModel(BaseOcrModel):
             yield from page_batch
             return
-        for page in page_batch:
+        for page_i, page in enumerate(page_batch):
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
                     assert self.reader is not None
+                    assert self.osd_reader is not None
                     assert self._tesserocr_languages is not None
                     ocr_rects = self.get_ocr_rects(page)
                     all_ocr_cells = []
-                    for ocr_rect in ocr_rects:
+                    for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
                         # Skip zero area boxes
                         if ocr_rect.area() == 0:
                             continue
@@ -139,16 +143,27 @@ class TesseractOcrModel(BaseOcrModel):
                         )
                         local_reader = self.reader
-                        if "auto" in self.options.lang:
-                            assert self.osd_reader is not None
-                            self.osd_reader.SetImage(high_res_image)
-                            osd = self.osd_reader.DetectOrientationScript()
-                            # No text, probably
-                            if osd is None:
+                        self.osd_reader.SetImage(high_res_image)
+                        osd = self.osd_reader.DetectOrientationScript()
+                        # No text, or Orientation and Script detection failure
+                        if osd is None:
+                            _log.error(
+                                "OSD failed for doc (doc %s, page: %s, "
+                                "OCR rectangle: %s)",
+                                conv_res.input.file,
+                                page_i,
+                                ocr_rect_i,
+                            )
+                            # Skipping if OSD fail when in auto mode, otherwise proceed
+                            # to OCR in the hope OCR will succeed while OSD failed
+                            if self._is_auto:
                                 continue
+                        doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
+                        if doc_orientation != 0:
+                            high_res_image = high_res_image.rotate(
+                                -doc_orientation, expand=True
+                            )
+                        if self._is_auto:
                             script = osd["script_name"]
                             script = map_tesseract_script(script)
                             lang = f"{self.script_prefix}{script}"
@@ -188,11 +203,23 @@ class TesseractOcrModel(BaseOcrModel):
                             # Extract text within the bounding box
                             text = local_reader.GetUTF8Text().strip()
                             confidence = local_reader.MeanTextConf()
-                            left = box["x"] / self.scale
-                            bottom = box["y"] / self.scale
-                            right = (box["x"] + box["w"]) / self.scale
-                            top = (box["y"] + box["h"]) / self.scale
+                            left, top = box["x"], box["y"]
+                            right = left + box["w"]
+                            bottom = top + box["h"]
+                            bbox = BoundingBox(
+                                l=left,
+                                t=top,
+                                r=right,
+                                b=bottom,
+                                coord_origin=CoordOrigin.TOPLEFT,
+                            )
+                            rect = tesseract_box_to_bounding_rectangle(
+                                bbox,
+                                original_offset=ocr_rect,
+                                scale=self.scale,
+                                orientation=doc_orientation,
+                                im_size=high_res_image.size,
+                            )
                             cells.append(
                                 TextCell(
                                     index=ix,
@@ -200,12 +227,7 @@ class TesseractOcrModel(BaseOcrModel):
                                     orig=text,
                                     from_ocr=True,
                                     confidence=confidence,
-                                    rect=BoundingRectangle.from_bounding_box(
-                                        BoundingBox.from_tuple(
-                                            coord=(left, top, right, bottom),
-                                            origin=CoordOrigin.TOPLEFT,
-                                        ),
-                                    ),
+                                    rect=rect,
                                 )
                             )

docling/pipeline/standard_pdf_pipeline.py CHANGED Viewed

@@ -3,11 +3,12 @@ import warnings
 from pathlib import Path
 from typing import Optional, cast
+import numpy as np
 from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import AssembledUnit, Page
+from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import settings
@@ -60,7 +61,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             or self.pipeline_options.generate_table_images
         )
-        self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
+        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
         ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
@@ -197,7 +198,7 @@ class StandardPdfPipeline(PaginatedPipeline):
                 elements=all_elements, headers=all_headers, body=all_body
             )
-            conv_res.document = self.glm_model(conv_res)
+            conv_res.document = self.reading_order_model(conv_res)
             # Generate page images in the output
             if self.pipeline_options.generate_page_images:
@@ -244,6 +245,30 @@ class StandardPdfPipeline(PaginatedPipeline):
                             cropped_im, dpi=int(72 * scale)
                         )
+            # Aggregate confidence values for document:
+            if len(conv_res.pages) > 0:
+                conv_res.confidence.layout_score = float(
+                    np.nanmean(
+                        [c.layout_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
+                conv_res.confidence.parse_score = float(
+                    np.nanquantile(
+                        [c.parse_score for c in conv_res.confidence.pages.values()],
+                        q=0.1,  # parse score should relate to worst 10% of pages.
+                    )
+                )
+                conv_res.confidence.table_score = float(
+                    np.nanmean(
+                        [c.table_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
+                conv_res.confidence.ocr_score = float(
+                    np.nanmean(
+                        [c.ocr_score for c in conv_res.confidence.pages.values()]
+                    )
+                )
         return conv_res
     @classmethod

docling/pipeline/vlm_pipeline.py CHANGED Viewed

@@ -3,7 +3,7 @@ from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
-# from docling_core.types import DoclingDocument
+from docling_core.types import DoclingDocument
 from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
 from docling_core.types.doc.document import DocTagsDocument
 from PIL import Image as PILImage
@@ -133,28 +133,26 @@ class VlmPipeline(PaginatedPipeline):
                 doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
                     doctags_list_c, image_list_c
                 )
-                conv_res.document.load_from_doctags(doctags_doc)
+                conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
                 # If forced backend text, replace model predicted text with backend one
-                if page.size:
-                    if self.force_backend_text:
-                        scale = self.pipeline_options.images_scale
-                        for element, _level in conv_res.document.iterate_items():
-                            if (
-                                not isinstance(element, TextItem)
-                                or len(element.prov) == 0
-                            ):
-                                continue
-                            crop_bbox = (
-                                element.prov[0]
-                                .bbox.scaled(scale=scale)
-                                .to_top_left_origin(
-                                    page_height=page.size.height * scale
-                                )
-                            )
-                            txt = self.extract_text_from_backend(page, crop_bbox)
-                            element.text = txt
-                            element.orig = txt
+                if self.force_backend_text:
+                    scale = self.pipeline_options.images_scale
+                    for element, _level in conv_res.document.iterate_items():
+                        if not isinstance(element, TextItem) or len(element.prov) == 0:
+                            continue
+                        page_ix = element.prov[0].page_no - 1
+                        page = conv_res.pages[page_ix]
+                        if not page.size:
+                            continue
+                        crop_bbox = (
+                            element.prov[0]
+                            .bbox.scaled(scale=scale)
+                            .to_top_left_origin(page_height=page.size.height * scale)
+                        )
+                        txt = self.extract_text_from_backend(page, crop_bbox)
+                        element.text = txt
+                        element.orig = txt
             elif (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.MARKDOWN

docling/utils/layout_postprocessor.py CHANGED Viewed

@@ -90,17 +90,12 @@ class SpatialClusterIndex:
         containment_threshold: float,
     ) -> bool:
         """Check if two bboxes overlap sufficiently."""
-        area1, area2 = bbox1.area(), bbox2.area()
-        if area1 <= 0 or area2 <= 0:
+        if bbox1.area() <= 0 or bbox2.area() <= 0:
             return False
-        overlap_area = bbox1.intersection_area_with(bbox2)
-        if overlap_area <= 0:
-            return False
-        iou = overlap_area / (area1 + area2 - overlap_area)
-        containment1 = overlap_area / area1
-        containment2 = overlap_area / area2
+        iou = bbox1.intersection_over_union(bbox2)
+        containment1 = bbox1.intersection_over_self(bbox2)
+        containment2 = bbox2.intersection_over_self(bbox1)
         return (
             iou > overlap_threshold
@@ -321,11 +316,9 @@ class LayoutPostprocessor:
         for special in special_clusters:
             contained = []
             for cluster in self.regular_clusters:
-                overlap = cluster.bbox.intersection_area_with(special.bbox)
-                if overlap > 0:
-                    containment = overlap / cluster.bbox.area()
-                    if containment > 0.8:
-                        contained.append(cluster)
+                containment = cluster.bbox.intersection_over_self(special.bbox)
+                if containment > 0.8:
+                    contained.append(cluster)
             if contained:
                 # Sort contained clusters by minimum cell ID:
@@ -379,9 +372,7 @@ class LayoutPostprocessor:
             for regular in self.regular_clusters:
                 if regular.label == DocItemLabel.TABLE:
                     # Calculate overlap
-                    overlap = regular.bbox.intersection_area_with(wrapper.bbox)
-                    wrapper_area = wrapper.bbox.area()
-                    overlap_ratio = overlap / wrapper_area
+                    overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)
                     conf_diff = wrapper.confidence - regular.confidence
@@ -421,8 +412,7 @@ class LayoutPostprocessor:
         # Rule 2: CODE vs others
         if candidate.label == DocItemLabel.CODE:
             # Calculate how much of the other cluster is contained within the CODE cluster
-            overlap = other.bbox.intersection_area_with(candidate.bbox)
-            containment = overlap / other.bbox.area()
+            containment = other.bbox.intersection_over_self(candidate.bbox)
             if containment > 0.8:  # other is 80% contained within CODE
                 return True
@@ -586,11 +576,9 @@ class LayoutPostprocessor:
                 if cell.rect.to_bounding_box().area() <= 0:
                     continue
-                overlap = cell.rect.to_bounding_box().intersection_area_with(
+                overlap_ratio = cell.rect.to_bounding_box().intersection_over_self(
                     cluster.bbox
                 )
-                overlap_ratio = overlap / cell.rect.to_bounding_box().area()
                 if overlap_ratio > best_overlap:
                     best_overlap = overlap_ratio
                     best_cluster = cluster

docling/utils/ocr_utils.py CHANGED Viewed

@@ -1,3 +1,11 @@
+from typing import Optional, Tuple
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import BoundingRectangle
+from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
 def map_tesseract_script(script: str) -> str:
     r""" """
     if script == "Katakana" or script == "Hiragana":
@@ -7,3 +15,55 @@ def map_tesseract_script(script: str) -> str:
     elif script == "Korean":
         script = "Hangul"
     return script
+def parse_tesseract_orientation(orientation: str) -> int:
+    # Tesseract orientation is [0, 90, 180, 270] clockwise, bounding rectangle angles
+    # are [0, 360[ counterclockwise
+    parsed = int(orientation)
+    if parsed not in CLIPPED_ORIENTATIONS:
+        msg = (
+            f"invalid tesseract document orientation {orientation}, "
+            f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
+        )
+        raise ValueError(msg)
+    parsed = -parsed
+    parsed %= 360
+    return parsed
+def tesseract_box_to_bounding_rectangle(
+    bbox: BoundingBox,
+    *,
+    original_offset: Optional[BoundingBox] = None,
+    scale: float,
+    orientation: int,
+    im_size: Tuple[int, int],
+) -> BoundingRectangle:
+    # box is in the top, left, height, width format, top left coordinates
+    rect = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
+    rect = BoundingRectangle(
+        r_x0=rect.r_x0 / scale,
+        r_y0=rect.r_y0 / scale,
+        r_x1=rect.r_x1 / scale,
+        r_y1=rect.r_y1 / scale,
+        r_x2=rect.r_x2 / scale,
+        r_y2=rect.r_y2 / scale,
+        r_x3=rect.r_x3 / scale,
+        r_y3=rect.r_y3 / scale,
+        coord_origin=CoordOrigin.TOPLEFT,
+    )
+    if original_offset is not None:
+        if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
+            msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
+            raise ValueError(msg)
+        if original_offset is not None:
+            rect.r_x0 += original_offset.l
+            rect.r_x1 += original_offset.l
+            rect.r_x2 += original_offset.l
+            rect.r_x3 += original_offset.l
+            rect.r_y0 += original_offset.t
+            rect.r_y1 += original_offset.t
+            rect.r_y2 += original_offset.t
+            rect.r_y3 += original_offset.t
+    return rect

docling 2.32.0__py3-none-any.whl → 2.34.0__py3-none-any.whl

docling 2.32.0__py3-none-any.whl → 2.34.0__py3-none-any.whl