PyPI - docling - Versions diffs - 2.33.0__tar.gz → 2.35.0__tar.gz - Mend

docling 2.33.0__tar.gz → 2.35.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

{docling-2.33.0 → docling-2.35.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.33.0
+Version: 2.35.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/docling-project/docling
 License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platfo
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: click (<8.2.0)
-Requires-Dist: docling-core[chunking] (>=2.29.0,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.31.2,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
 Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)

{docling-2.33.0 → docling-2.35.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
                 coord_origin=CoordOrigin.BOTTOMLEFT,
             ).to_top_left_origin(page_height=page_size.height * scale)
-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
             if overlap_frac > 0.5:
                 if len(text_piece) > 0:

{docling-2.33.0 → docling-2.35.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

@@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                 coord_origin=CoordOrigin.BOTTOMLEFT,
             ).to_top_left_origin(page_height=page_size.height * scale)
-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
             if overlap_frac > 0.5:
                 if len(text_piece) > 0:

{docling-2.33.0 → docling-2.35.0}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

@@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
                 .scaled(scale)
             )
-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
             if overlap_frac > 0.5:
                 if len(text_piece) > 0:

{docling-2.33.0 → docling-2.35.0}/docling/cli/main.py RENAMED Viewed

@@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
 import rich.table
 import typer
+from docling_core.transforms.serializer.html import (
+    HTMLDocSerializer,
+    HTMLOutputStyle,
+    HTMLParams,
+)
+from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
 from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
@@ -156,6 +162,7 @@ def export_documents(
     export_json: bool,
     export_html: bool,
     export_html_split_page: bool,
+    show_layout: bool,
     export_md: bool,
     export_txt: bool,
     export_doctags: bool,
@@ -189,9 +196,27 @@ def export_documents(
             if export_html_split_page:
                 fname = output_dir / f"{doc_filename}.html"
                 _log.info(f"writing HTML output to {fname}")
-                conv_res.document.save_as_html(
-                    filename=fname, image_mode=image_export_mode, split_page_view=True
-                )
+                if show_layout:
+                    ser = HTMLDocSerializer(
+                        doc=conv_res.document,
+                        params=HTMLParams(
+                            image_mode=image_export_mode,
+                            output_style=HTMLOutputStyle.SPLIT_PAGE,
+                        ),
+                    )
+                    visualizer = LayoutVisualizer()
+                    visualizer.params.show_label = False
+                    ser_res = ser.serialize(
+                        visualizer=visualizer,
+                    )
+                    with open(fname, "w") as fw:
+                        fw.write(ser_res.text)
+                else:
+                    conv_res.document.save_as_html(
+                        filename=fname,
+                        image_mode=image_export_mode,
+                        split_page_view=True,
+                    )
             # Export Text format:
             if export_txt:
@@ -250,6 +275,13 @@ def convert(  # noqa: C901
     to_formats: List[OutputFormat] = typer.Option(
         None, "--to", help="Specify output formats. Defaults to Markdown."
     ),
+    show_layout: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="If enabled, the page images will show the bounding-boxes of the items.",
+        ),
+    ] = False,
     headers: str = typer.Option(
         None,
         "--headers",
@@ -596,6 +628,7 @@ def convert(  # noqa: C901
             export_json=export_json,
             export_html=export_html,
             export_html_split_page=export_html_split_page,
+            show_layout=show_layout,
             export_md=export_md,
             export_txt=export_txt,
             export_doctags=export_doctags,

{docling-2.33.0 → docling-2.35.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -1,6 +1,9 @@
+import math
+from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
+import numpy as np
 from docling_core.types.doc import (
     BoundingBox,
     DocItemLabel,
@@ -16,7 +19,7 @@ from docling_core.types.io import (
     DocumentStream,
 )
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field, computed_field
 if TYPE_CHECKING:
     from docling.backend.pdf_backend import PdfPageBackend
@@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
     choices: List[OpenAiResponseChoice]
     created: int
     usage: OpenAiResponseUsage
+# Create a type alias for score values
+ScoreValue = float
+class QualityGrade(str, Enum):
+    POOR = "poor"
+    FAIR = "fair"
+    GOOD = "good"
+    EXCELLENT = "excellent"
+    UNSPECIFIED = "unspecified"
+class PageConfidenceScores(BaseModel):
+    parse_score: ScoreValue = np.nan
+    layout_score: ScoreValue = np.nan
+    table_score: ScoreValue = np.nan
+    ocr_score: ScoreValue = np.nan
+    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
+        if score < 0.5:
+            return QualityGrade.POOR
+        elif score < 0.8:
+            return QualityGrade.FAIR
+        elif score < 0.9:
+            return QualityGrade.GOOD
+        elif score >= 0.9:
+            return QualityGrade.EXCELLENT
+        return QualityGrade.UNSPECIFIED
+    @computed_field  # type: ignore
+    @property
+    def mean_grade(self) -> QualityGrade:
+        return self._score_to_grade(self.mean_score)
+    @computed_field  # type: ignore
+    @property
+    def low_grade(self) -> QualityGrade:
+        return self._score_to_grade(self.low_score)
+    @computed_field  # type: ignore
+    @property
+    def mean_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [
+                    self.ocr_score,
+                    self.table_score,
+                    self.layout_score,
+                    self.parse_score,
+                ]
+            )
+        )
+    @computed_field  # type: ignore
+    @property
+    def low_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanquantile(
+                [
+                    self.ocr_score,
+                    self.table_score,
+                    self.layout_score,
+                    self.parse_score,
+                ],
+                q=0.05,
+            )
+        )
+class ConfidenceReport(PageConfidenceScores):
+    pages: Dict[int, PageConfidenceScores] = Field(
+        default_factory=lambda: defaultdict(PageConfidenceScores)
+    )
+    @computed_field  # type: ignore
+    @property
+    def mean_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [c.mean_score for c in self.pages.values()],
+            )
+        )
+    @computed_field  # type: ignore
+    @property
+    def low_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [c.low_score for c in self.pages.values()],
+            )
+        )

{docling-2.33.0 → docling-2.35.0}/docling/datamodel/document.py RENAMED Viewed

@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
 )
 from docling_core.utils.file import resolve_source_to_stream
 from docling_core.utils.legacy import docling_document_to_legacy
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from typing_extensions import deprecated
 from docling.backend.abstract_backend import (
@@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
 )
 from docling.datamodel.base_models import (
     AssembledUnit,
+    ConfidenceReport,
     ConversionStatus,
     DocumentStream,
     ErrorItem,
@@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
     pages: List[Page] = []
     assembled: AssembledUnit = AssembledUnit()
     timings: Dict[str, ProfilingItem] = {}
+    confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
     document: DoclingDocument = _EMPTY_DOCLING_DOC
@@ -332,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
     ) -> Optional[InputFormat]:
         """Guess the input format of a document by checking part of its content."""
         input_format: Optional[InputFormat] = None
-        content_str = content.decode("utf-8")
         if mime == "application/xml":
+            content_str = content.decode("utf-8")
             match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
             if match_doctype:
                 xml_doctype = match_doctype.group()
@@ -356,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
                     input_format = InputFormat.XML_JATS
         elif mime == "text/plain":
+            content_str = content.decode("utf-8")
             if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
                 input_format = InputFormat.XML_USPTO
@@ -409,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
             else:
                 return "application/xml"
-        if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
+        if re.match(
+            r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
+            content_str,
+            re.DOTALL,
+        ):
             return "text/html"
         p = re.compile(

{docling-2.33.0 → docling-2.35.0}/docling/models/layout_model.py RENAMED Viewed

@@ -5,6 +5,7 @@ from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
+import numpy as np
 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import Image
@@ -184,6 +185,24 @@ class LayoutModel(BasePageModel):
                     ).postprocess()
                     # processed_clusters, processed_cells = clusters, page.cells
+                    with warnings.catch_warnings():
+                        warnings.filterwarnings(
+                            "ignore",
+                            "Mean of empty slice|invalid value encountered in scalar divide",
+                            RuntimeWarning,
+                            "numpy",
+                        )
+                        conv_res.confidence.pages[page.page_no].layout_score = float(
+                            np.mean([c.confidence for c in processed_clusters])
+                        )
+                        conv_res.confidence.pages[page.page_no].ocr_score = float(
+                            np.mean(
+                                [c.confidence for c in processed_cells if c.from_ocr]
+                            )
+                        )
                     page.cells = processed_cells
                     page.predictions.layout = LayoutPrediction(
                         clusters=processed_clusters

{docling-2.33.0 → docling-2.35.0}/docling/models/page_assemble_model.py RENAMED Viewed

@@ -3,6 +3,7 @@ import re
 from collections.abc import Iterable
 from typing import List
+import numpy as np
 from pydantic import BaseModel
 from docling.datamodel.base_models import (

{docling-2.33.0 → docling-2.35.0}/docling/models/page_preprocessing_model.py RENAMED Viewed

@@ -1,7 +1,10 @@
+import re
+import warnings
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
+import numpy as np
 from PIL import ImageDraw
 from pydantic import BaseModel
@@ -21,6 +24,14 @@ class PagePreprocessingModel(BasePageModel):
     def __init__(self, options: PagePreprocessingOptions):
         self.options = options
+        # Pre-compiled regex patterns for efficiency
+        self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
+        self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
+        self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
+        self.SLASH_NUMBER_GARBAGE_RE = re.compile(
+            r"(?:/\w+\s*){2,}"
+        )  # Two or more "/token " sequences
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -60,6 +71,22 @@ class PagePreprocessingModel(BasePageModel):
         if self.options.create_parsed_page:
             page.parsed_page = page._backend.get_segmented_page()
+        # Rate the text quality from the PDF parser, and aggregate on page
+        text_scores = []
+        for c in page.cells:
+            score = self.rate_text_quality(c.text)
+            text_scores.append(score)
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
+            )
+            conv_res.confidence.pages[page.page_no].parse_score = float(
+                np.nanquantile(
+                    text_scores, q=0.10
+                )  # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
+            )
         # DEBUG code:
         def draw_text_boxes(image, cells, show: bool = False):
             draw = ImageDraw.Draw(image)
@@ -88,3 +115,30 @@ class PagePreprocessingModel(BasePageModel):
             draw_text_boxes(page.get_image(scale=1.0), page.cells)
         return page
+    def rate_text_quality(self, text: str) -> float:
+        # Hard errors: if any of these patterns are found, return 0.0 immediately.
+        blacklist_chars = ["�"]
+        if (
+            any(text.find(c) >= 0 for c in blacklist_chars)
+            or self.GLYPH_RE.search(text)
+            or self.SLASH_G_RE.search(text)
+            or self.SLASH_NUMBER_GARBAGE_RE.match(
+                text
+            )  # Check if text is mostly slash-number pattern
+        ):
+            return 0.0
+        penalty = 0.0
+        # Apply a penalty only if the fragmented words pattern occurs at least three times.
+        frag_matches = self.FRAG_RE.findall(text)
+        if len(frag_matches) >= 3:
+            penalty += 0.1 * len(frag_matches)
+        # Additional heuristic: if the average token length is below 2, add a penalty.
+        # tokens = text.split()
+        # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
+        #    penalty += 0.2
+        return max(1.0 - penalty, 0.0)

{docling-2.33.0 → docling-2.35.0}/docling/models/tesseract_ocr_cli_model.py RENAMED Viewed

@@ -2,6 +2,7 @@ import csv
 import io
 import logging
 import os
+import subprocess
 import tempfile
 from collections.abc import Iterable
 from pathlib import Path
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling_core.types.doc.page import TextCell
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.ocr_utils import map_tesseract_script
+from docling.utils.ocr_utils import (
+    map_tesseract_script,
+    parse_tesseract_orientation,
+    tesseract_box_to_bounding_rectangle,
+)
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         self._version: Optional[str] = None
         self._tesseract_languages: Optional[List[str]] = None
         self._script_prefix: Optional[str] = None
+        self._is_auto: bool = "auto" in self.options.lang
         if self.enabled:
             try:
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
         return name, version
-    def _run_tesseract(self, ifilename: str):
+    def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
         r"""
         Run tesseract CLI
         """
         cmd = [self.options.tesseract_cmd]
-        if "auto" in self.options.lang:
-            lang = self._detect_language(ifilename)
+        if self._is_auto:
+            lang = self._parse_language(osd)
             if lang is not None:
                 cmd.append("-l")
                 cmd.append(lang)
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
         cmd += [ifilename, "stdout", "tsv"]
         _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
+        output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
         # _log.info(output)
         # Decode the byte string to a regular string
-        decoded_data = output.decode("utf-8")
+        decoded_data = output.stdout.decode("utf-8")
         # _log.info(decoded_data)
         # Read the TSV file generated by Tesseract
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
         return df_filtered
-    def _detect_language(self, ifilename: str):
+    def _perform_osd(self, ifilename: str) -> pd.DataFrame:
         r"""
         Run tesseract in PSM 0 mode to detect the language
         """
-        assert self._tesseract_languages is not None
         cmd = [self.options.tesseract_cmd]
         cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
         _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
-        decoded_data = output.decode("utf-8")
+        output = subprocess.run(cmd, capture_output=True, check=True)
+        decoded_data = output.stdout.decode("utf-8")
         df_detected = pd.read_csv(
             io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
         )
-        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
+        return df_detected
+    def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
+        assert self._tesseract_languages is not None
+        scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
         if len(scripts) == 0:
             _log.warning("Tesseract cannot detect the script of the page")
             return None
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
         cmd = [self.options.tesseract_cmd]
         cmd.append("--list-langs")
         _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
-        decoded_data = output.decode("utf-8")
+        output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
+        decoded_data = output.stdout.decode("utf-8")
         df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
         self._tesseract_languages = df_list[0].tolist()[1:]
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
             yield from page_batch
             return
-        for page in page_batch:
+        for page_i, page in enumerate(page_batch):
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                     ocr_rects = self.get_ocr_rects(page)
                     all_ocr_cells = []
-                    for ocr_rect in ocr_rects:
+                    for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
                         # Skip zero area boxes
                         if ocr_rect.area() == 0:
                             continue
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
                             ) as image_file:
                                 fname = image_file.name
                                 high_res_image.save(image_file)
-                            df_result = self._run_tesseract(fname)
+                            doc_orientation = 0
+                            try:
+                                df_osd = self._perform_osd(fname)
+                                doc_orientation = _parse_orientation(df_osd)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "OSD failed (doc %s, page: %s, "
+                                    "OCR rectangle: %s, processed image file %s):\n %s",
+                                    conv_res.input.file,
+                                    page_i,
+                                    ocr_rect_i,
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                # Skipping if OSD fail when in auto mode, otherwise proceed
+                                # to OCR in the hope OCR will succeed while OSD failed
+                                if self._is_auto:
+                                    continue
+                            if doc_orientation != 0:
+                                high_res_image = high_res_image.rotate(
+                                    -doc_orientation, expand=True
+                                )
+                                high_res_image.save(fname)
+                            try:
+                                df_result = self._run_tesseract(fname, df_osd)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "tesseract OCR failed (doc %s, page: %s, "
+                                    "OCR rectangle: %s, processed image file %s):\n %s",
+                                    conv_res.input.file,
+                                    page_i,
+                                    ocr_rect_i,
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                continue
                         finally:
                             if os.path.exists(fname):
                                 os.remove(fname)
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
                             text = row["text"]
                             conf = row["conf"]
-                            l = float(row["left"])  # noqa: E741
-                            b = float(row["top"])
-                            w = float(row["width"])
-                            h = float(row["height"])
-                            t = b + h
-                            r = l + w
+                            left, top = float(row["left"]), float(row["top"])
+                            right = left + float(row["width"])
+                            bottom = top + row["height"]
+                            bbox = BoundingBox(
+                                l=left,
+                                t=top,
+                                r=right,
+                                b=bottom,
+                                coord_origin=CoordOrigin.TOPLEFT,
+                            )
+                            rect = tesseract_box_to_bounding_rectangle(
+                                bbox,
+                                original_offset=ocr_rect,
+                                scale=self.scale,
+                                orientation=doc_orientation,
+                                im_size=high_res_image.size,
+                            )
                             cell = TextCell(
                                 index=ix,
                                 text=str(text),
                                 orig=str(text),
                                 from_ocr=True,
                                 confidence=conf / 100.0,
-                                rect=BoundingRectangle.from_bounding_box(
-                                    BoundingBox.from_tuple(
-                                        coord=(
-                                            (l / self.scale) + ocr_rect.l,
-                                            (b / self.scale) + ocr_rect.t,
-                                            (r / self.scale) + ocr_rect.l,
-                                            (t / self.scale) + ocr_rect.t,
-                                        ),
-                                        origin=CoordOrigin.TOPLEFT,
-                                    )
-                                ),
+                                rect=rect,
                             )
                             all_ocr_cells.append(cell)
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
     @classmethod
     def get_options_type(cls) -> Type[OcrOptions]:
         return TesseractCliOcrOptions
+def _parse_orientation(df_osd: pd.DataFrame) -> int:
+    orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
+    orientation = parse_tesseract_orientation(orientations[0].strip())
+    return orientation

docling 2.33.0__tar.gz → 2.35.0__tar.gz

docling 2.33.0__tar.gz → 2.35.0__tar.gz