PyPI - docling - Versions diffs - 2.32.0__tar.gz → 2.34.0__tar.gz - Mend

docling 2.32.0__tar.gz → 2.34.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

{docling-2.32.0 → docling-2.34.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.32.0
+Version: 2.34.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/docling-project/docling
 License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platfo
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: click (<8.2.0)
-Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.29.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
 Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)

{docling-2.32.0 → docling-2.34.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
                 coord_origin=CoordOrigin.BOTTOMLEFT,
             ).to_top_left_origin(page_height=page_size.height * scale)
-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
             if overlap_frac > 0.5:
                 if len(text_piece) > 0:

{docling-2.32.0 → docling-2.34.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

@@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                 coord_origin=CoordOrigin.BOTTOMLEFT,
             ).to_top_left_origin(page_height=page_size.height * scale)
-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
             if overlap_frac > 0.5:
                 if len(text_piece) > 0:

{docling-2.32.0 → docling-2.34.0}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

@@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
                 .scaled(scale)
             )
-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
             if overlap_frac > 0.5:
                 if len(text_piece) > 0:

{docling-2.32.0 → docling-2.34.0}/docling/backend/msword_backend.py RENAMED Viewed

@@ -2,7 +2,7 @@ import logging
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any, List, Optional, Union
 from docling_core.types.doc import (
     DocItemLabel,
@@ -24,7 +24,6 @@ from docx.text.hyperlink import Hyperlink
 from docx.text.paragraph import Paragraph
 from docx.text.run import Run
 from lxml import etree
-from lxml.etree import XPath
 from PIL import Image, UnidentifiedImageError
 from pydantic import AnyUrl
 from typing_extensions import override
@@ -59,6 +58,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.parents: dict[int, Optional[NodeItem]] = {}
         self.numbered_headers: dict[int, int] = {}
         self.equation_bookends: str = "<eq>{EQ}</eq>"
+        # Track processed textbox elements to avoid duplication
+        self.processed_textbox_elements: List[int] = []
+        # Track content hash of processed paragraphs to avoid duplicate content
+        self.processed_paragraph_content: List[str] = []
         for i in range(-1, self.max_levels):
             self.parents[i] = None
@@ -175,10 +179,74 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
                 "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
                 "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
+                "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
+                "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
+                "v": "urn:schemas-microsoft-com:vml",
+                "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
+                "w10": "urn:schemas-microsoft-com:office:word",
+                "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
             }
-            xpath_expr = XPath(".//a:blip", namespaces=namespaces)
+            xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
             drawing_blip = xpath_expr(element)
+            # Check for textbox content - check multiple textbox formats
+            # Only process if the element hasn't been processed before
+            element_id = id(element)
+            if element_id not in self.processed_textbox_elements:
+                # Modern Word textboxes
+                txbx_xpath = etree.XPath(
+                    ".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
+                )
+                textbox_elements = txbx_xpath(element)
+                # No modern textboxes found, check for alternate/legacy textbox formats
+                if not textbox_elements and tag_name in ["drawing", "pict"]:
+                    # Additional checks for textboxes in DrawingML and VML formats
+                    alt_txbx_xpath = etree.XPath(
+                        ".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
+                        namespaces=namespaces,
+                    )
+                    textbox_elements = alt_txbx_xpath(element)
+                    # Check for shape text that's not in a standard textbox
+                    if not textbox_elements:
+                        shape_text_xpath = etree.XPath(
+                            ".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
+                            namespaces=namespaces,
+                        )
+                        shape_text_elements = shape_text_xpath(element)
+                        if shape_text_elements:
+                            # Create custom text elements from shape text
+                            text_content = " ".join(
+                                [t.text for t in shape_text_elements if t.text]
+                            )
+                            if text_content.strip():
+                                _log.debug(f"Found shape text: {text_content[:50]}...")
+                                # Create a paragraph-like element to process with standard handler
+                                level = self._get_level()
+                                shape_group = doc.add_group(
+                                    label=GroupLabel.SECTION,
+                                    parent=self.parents[level - 1],
+                                    name="shape-text",
+                                )
+                                doc.add_text(
+                                    label=DocItemLabel.PARAGRAPH,
+                                    parent=shape_group,
+                                    text=text_content,
+                                )
+                if textbox_elements:
+                    # Mark the parent element as processed
+                    self.processed_textbox_elements.append(element_id)
+                    # Also mark all found textbox elements as processed
+                    for tb_element in textbox_elements:
+                        self.processed_textbox_elements.append(id(tb_element))
+                    _log.debug(
+                        f"Found textbox content with {len(textbox_elements)} elements"
+                    )
+                    self._handle_textbox_content(textbox_elements, docx_obj, doc)
             # Check for Tables
             if element.tag.endswith("tbl"):
                 try:
@@ -291,15 +359,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     @classmethod
     def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
-        has_any_formatting = run.bold or run.italic or run.underline
-        return (
-            Formatting(
-                bold=run.bold or False,
-                italic=run.italic or False,
-                underline=run.underline or False,
-            )
-            if has_any_formatting
-            else None
+        # The .bold and .italic properties are booleans, but .underline can be an enum
+        # like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
+        has_bold = run.bold or False
+        has_italic = run.italic or False
+        # Convert any non-None underline value to True
+        has_underline = bool(run.underline is not None and run.underline)
+        return Formatting(
+            bold=has_bold,
+            italic=has_italic,
+            underline=has_underline,
         )
     def _get_paragraph_elements(self, paragraph: Paragraph):
@@ -355,6 +425,182 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         return paragraph_elements
+    def _get_paragraph_position(self, paragraph_element):
+        """Extract vertical position information from paragraph element."""
+        # First try to directly get the index from w:p element that has an order-related attribute
+        if (
+            hasattr(paragraph_element, "getparent")
+            and paragraph_element.getparent() is not None
+        ):
+            parent = paragraph_element.getparent()
+            # Get all paragraph siblings
+            paragraphs = [
+                p for p in parent.getchildren() if etree.QName(p).localname == "p"
+            ]
+            # Find index of current paragraph within its siblings
+            try:
+                paragraph_index = paragraphs.index(paragraph_element)
+                return paragraph_index  # Use index as position for consistent ordering
+            except ValueError:
+                pass
+        # Look for position hints in element attributes and ancestor elements
+        for elem in (*[paragraph_element], *paragraph_element.iterancestors()):
+            # Check for direct position attributes
+            for attr_name in ["y", "top", "positionY", "y-position", "position"]:
+                value = elem.get(attr_name)
+                if value:
+                    try:
+                        # Remove any non-numeric characters (like 'pt', 'px', etc.)
+                        clean_value = re.sub(r"[^0-9.]", "", value)
+                        if clean_value:
+                            return float(clean_value)
+                    except (ValueError, TypeError):
+                        pass
+            # Check for position in transform attribute
+            transform = elem.get("transform")
+            if transform:
+                # Extract translation component from transform matrix
+                match = re.search(r"translate\([^,]+,\s*([0-9.]+)", transform)
+                if match:
+                    try:
+                        return float(match.group(1))
+                    except ValueError:
+                        pass
+            # Check for anchors or relative position indicators in Word format
+            # 'dist' attributes can indicate relative positioning
+            for attr_name in ["distT", "distB", "anchor", "relativeFrom"]:
+                if elem.get(attr_name) is not None:
+                    return elem.sourceline  # Use the XML source line number as fallback
+        # For VML shapes, look for specific attributes
+        for ns_uri in paragraph_element.nsmap.values():
+            if "vml" in ns_uri:
+                # Try to extract position from style attribute
+                style = paragraph_element.get("style")
+                if style:
+                    match = re.search(r"top:([0-9.]+)pt", style)
+                    if match:
+                        try:
+                            return float(match.group(1))
+                        except ValueError:
+                            pass
+        # If no better position indicator found, use XML source line number as proxy for order
+        return (
+            paragraph_element.sourceline
+            if hasattr(paragraph_element, "sourceline")
+            else None
+        )
+    def _collect_textbox_paragraphs(self, textbox_elements):
+        """Collect and organize paragraphs from textbox elements."""
+        processed_paragraphs = []
+        container_paragraphs = {}
+        for element in textbox_elements:
+            element_id = id(element)
+            # Skip if we've already processed this exact element
+            if element_id in processed_paragraphs:
+                continue
+            tag_name = etree.QName(element).localname
+            processed_paragraphs.append(element_id)
+            # Handle paragraphs directly found (VML textboxes)
+            if tag_name == "p":
+                # Find the containing textbox or shape element
+                container_id = None
+                for ancestor in element.iterancestors():
+                    if any(ns in ancestor.tag for ns in ["textbox", "shape", "txbx"]):
+                        container_id = id(ancestor)
+                        break
+                if container_id not in container_paragraphs:
+                    container_paragraphs[container_id] = []
+                container_paragraphs[container_id].append(
+                    (element, self._get_paragraph_position(element))
+                )
+            # Handle txbxContent elements (Word DrawingML textboxes)
+            elif tag_name == "txbxContent":
+                paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
+                container_id = id(element)
+                if container_id not in container_paragraphs:
+                    container_paragraphs[container_id] = []
+                for p in paragraphs:
+                    p_id = id(p)
+                    if p_id not in processed_paragraphs:
+                        processed_paragraphs.append(p_id)
+                        container_paragraphs[container_id].append(
+                            (p, self._get_paragraph_position(p))
+                        )
+            else:
+                # Try to extract any paragraphs from unknown elements
+                paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
+                container_id = id(element)
+                if container_id not in container_paragraphs:
+                    container_paragraphs[container_id] = []
+                for p in paragraphs:
+                    p_id = id(p)
+                    if p_id not in processed_paragraphs:
+                        processed_paragraphs.append(p_id)
+                        container_paragraphs[container_id].append(
+                            (p, self._get_paragraph_position(p))
+                        )
+        return container_paragraphs
+    def _handle_textbox_content(
+        self,
+        textbox_elements: list,
+        docx_obj: DocxDocument,
+        doc: DoclingDocument,
+    ) -> None:
+        """Process textbox content and add it to the document structure."""
+        level = self._get_level()
+        # Create a textbox group to contain all text from the textbox
+        textbox_group = doc.add_group(
+            label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
+        )
+        # Set this as the current parent to ensure textbox content
+        # is properly nested in document structure
+        original_parent = self.parents[level]
+        self.parents[level] = textbox_group
+        # Collect and organize paragraphs
+        container_paragraphs = self._collect_textbox_paragraphs(textbox_elements)
+        # Process all paragraphs
+        all_paragraphs = []
+        # Sort paragraphs within each container, then process containers
+        for container_id, paragraphs in container_paragraphs.items():
+            # Sort by vertical position within each container
+            sorted_container_paragraphs = sorted(
+                paragraphs,
+                key=lambda x: (
+                    x[1] is None,
+                    x[1] if x[1] is not None else float("inf"),
+                ),
+            )
+            # Add the sorted paragraphs to our processing list
+            all_paragraphs.extend(sorted_container_paragraphs)
+        # Process all the paragraphs
+        for p, _ in all_paragraphs:
+            self._handle_text_elements(p, docx_obj, doc, is_from_textbox=True)
+        # Restore original parent
+        self.parents[level] = original_parent
+        return
     def _handle_equations_in_text(self, element, text):
         only_texts = []
         only_equations = []
@@ -423,10 +669,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         element: BaseOxmlElement,
         docx_obj: DocxDocument,
         doc: DoclingDocument,
+        is_from_textbox: bool = False,
     ) -> None:
         paragraph = Paragraph(element, docx_obj)
+        # Skip if from a textbox and this exact paragraph content was already processed
+        # Skip if from a textbox and this exact paragraph content was already processed
         raw_text = paragraph.text
+        if is_from_textbox and raw_text:
+            # Create a simple hash of content to detect duplicates
+            content_hash = f"{len(raw_text)}:{raw_text[:50]}"
+            if content_hash in self.processed_paragraph_content:
+                _log.debug(f"Skipping duplicate paragraph content: {content_hash}")
+                return
+            self.processed_paragraph_content.append(content_hash)
         text, equations = self._handle_equations_in_text(element=element, text=raw_text)
         if text is None:

{docling-2.32.0 → docling-2.34.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -175,13 +175,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
                 if len(group) == 1:
                     return group[0]
-                merged_text = "".join(cell.text for cell in group)
                 merged_bbox = BoundingBox(
                     l=min(cell.rect.to_bounding_box().l for cell in group),
                     t=min(cell.rect.to_bounding_box().t for cell in group),
                     r=max(cell.rect.to_bounding_box().r for cell in group),
                     b=max(cell.rect.to_bounding_box().b for cell in group),
                 )
+                assert self._ppage is not None
+                self.text_page = self._ppage.get_textpage()
+                bbox = merged_bbox.to_bottom_left_origin(page_size.height)
+                merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
                 return TextCell(
                     index=group[0].index,
                     text=merged_text,

{docling-2.32.0 → docling-2.34.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -1,6 +1,9 @@
+import math
+from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
+import numpy as np
 from docling_core.types.doc import (
     BoundingBox,
     DocItemLabel,
@@ -16,7 +19,7 @@ from docling_core.types.io import (
     DocumentStream,
 )
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field, computed_field
 if TYPE_CHECKING:
     from docling.backend.pdf_backend import PdfPageBackend
@@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
     choices: List[OpenAiResponseChoice]
     created: int
     usage: OpenAiResponseUsage
+# Create a type alias for score values
+ScoreValue = float
+class QualityGrade(str, Enum):
+    POOR = "poor"
+    FAIR = "fair"
+    GOOD = "good"
+    EXCELLENT = "excellent"
+    UNSPECIFIED = "unspecified"
+class PageConfidenceScores(BaseModel):
+    parse_score: ScoreValue = np.nan
+    layout_score: ScoreValue = np.nan
+    table_score: ScoreValue = np.nan
+    ocr_score: ScoreValue = np.nan
+    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
+        if score < 0.5:
+            return QualityGrade.POOR
+        elif score < 0.8:
+            return QualityGrade.FAIR
+        elif score < 0.9:
+            return QualityGrade.GOOD
+        elif score >= 0.9:
+            return QualityGrade.EXCELLENT
+        return QualityGrade.UNSPECIFIED
+    @computed_field  # type: ignore
+    @property
+    def mean_grade(self) -> QualityGrade:
+        return self._score_to_grade(self.mean_score)
+    @computed_field  # type: ignore
+    @property
+    def low_grade(self) -> QualityGrade:
+        return self._score_to_grade(self.low_score)
+    @computed_field  # type: ignore
+    @property
+    def mean_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [
+                    self.ocr_score,
+                    self.table_score,
+                    self.layout_score,
+                    self.parse_score,
+                ]
+            )
+        )
+    @computed_field  # type: ignore
+    @property
+    def low_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanquantile(
+                [
+                    self.ocr_score,
+                    self.table_score,
+                    self.layout_score,
+                    self.parse_score,
+                ],
+                q=0.05,
+            )
+        )
+class ConfidenceReport(PageConfidenceScores):
+    pages: Dict[int, PageConfidenceScores] = Field(
+        default_factory=lambda: defaultdict(PageConfidenceScores)
+    )
+    @computed_field  # type: ignore
+    @property
+    def mean_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [c.mean_score for c in self.pages.values()],
+            )
+        )
+    @computed_field  # type: ignore
+    @property
+    def low_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [c.low_score for c in self.pages.values()],
+            )
+        )

{docling-2.32.0 → docling-2.34.0}/docling/datamodel/document.py RENAMED Viewed

@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
 )
 from docling_core.utils.file import resolve_source_to_stream
 from docling_core.utils.legacy import docling_document_to_legacy
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from typing_extensions import deprecated
 from docling.backend.abstract_backend import (
@@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
 )
 from docling.datamodel.base_models import (
     AssembledUnit,
+    ConfidenceReport,
     ConversionStatus,
     DocumentStream,
     ErrorItem,
@@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
     pages: List[Page] = []
     assembled: AssembledUnit = AssembledUnit()
     timings: Dict[str, ProfilingItem] = {}
+    confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
     document: DoclingDocument = _EMPTY_DOCLING_DOC
@@ -302,7 +304,7 @@ class _DocumentConversionInput(BaseModel):
                     if ("." in obj.name and not obj.name.startswith("."))
                     else ""
                 )
-                mime = _DocumentConversionInput._mime_from_extension(ext)
+                mime = _DocumentConversionInput._mime_from_extension(ext.lower())
             if mime is not None and mime.lower() == "application/zip":
                 objname = obj.name.lower()
                 if objname.endswith(".xlsx"):
@@ -376,6 +378,13 @@ class _DocumentConversionInput(BaseModel):
             mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
         elif ext in FormatToExtensions[InputFormat.PDF]:
             mime = FormatToMimeType[InputFormat.PDF][0]
+        elif ext in FormatToExtensions[InputFormat.DOCX]:
+            mime = FormatToMimeType[InputFormat.DOCX][0]
+        elif ext in FormatToExtensions[InputFormat.PPTX]:
+            mime = FormatToMimeType[InputFormat.PPTX][0]
+        elif ext in FormatToExtensions[InputFormat.XLSX]:
+            mime = FormatToMimeType[InputFormat.XLSX][0]
         return mime
     @staticmethod

{docling-2.32.0 → docling-2.34.0}/docling/models/layout_model.py RENAMED Viewed

@@ -5,6 +5,7 @@ from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
+import numpy as np
 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import Image
@@ -184,6 +185,14 @@ class LayoutModel(BasePageModel):
                     ).postprocess()
                     # processed_clusters, processed_cells = clusters, page.cells
+                    conv_res.confidence.pages[page.page_no].layout_score = float(
+                        np.mean([c.confidence for c in processed_clusters])
+                    )
+                    conv_res.confidence.pages[page.page_no].ocr_score = float(
+                        np.mean([c.confidence for c in processed_cells if c.from_ocr])
+                    )
                     page.cells = processed_cells
                     page.predictions.layout = LayoutPrediction(
                         clusters=processed_clusters

{docling-2.32.0 → docling-2.34.0}/docling/models/page_assemble_model.py RENAMED Viewed

@@ -3,6 +3,7 @@ import re
 from collections.abc import Iterable
 from typing import List
+import numpy as np
 from pydantic import BaseModel
 from docling.datamodel.base_models import (

{docling-2.32.0 → docling-2.34.0}/docling/models/page_preprocessing_model.py RENAMED Viewed

@@ -1,11 +1,13 @@
+import re
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
+import numpy as np
 from PIL import ImageDraw
 from pydantic import BaseModel
-from docling.datamodel.base_models import Page
+from docling.datamodel.base_models import Page, ScoreValue
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
@@ -21,6 +23,14 @@ class PagePreprocessingModel(BasePageModel):
     def __init__(self, options: PagePreprocessingOptions):
         self.options = options
+        # Pre-compiled regex patterns for efficiency
+        self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
+        self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
+        self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
+        self.SLASH_NUMBER_GARBAGE_RE = re.compile(
+            r"(?:/\w+\s*){2,}"
+        )  # Two or more "/token " sequences
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -60,6 +70,18 @@ class PagePreprocessingModel(BasePageModel):
         if self.options.create_parsed_page:
             page.parsed_page = page._backend.get_segmented_page()
+        # Rate the text quality from the PDF parser, and aggregate on page
+        text_scores = []
+        for c in page.cells:
+            score = self.rate_text_quality(c.text)
+            text_scores.append(score)
+        conv_res.confidence.pages[page.page_no].parse_score = float(
+            np.nanquantile(
+                text_scores, q=0.10
+            )  # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
+        )
         # DEBUG code:
         def draw_text_boxes(image, cells, show: bool = False):
             draw = ImageDraw.Draw(image)
@@ -88,3 +110,30 @@ class PagePreprocessingModel(BasePageModel):
             draw_text_boxes(page.get_image(scale=1.0), page.cells)
         return page
+    def rate_text_quality(self, text: str) -> float:
+        # Hard errors: if any of these patterns are found, return 0.0 immediately.
+        blacklist_chars = ["�"]
+        if (
+            any(text.find(c) >= 0 for c in blacklist_chars)
+            or self.GLYPH_RE.search(text)
+            or self.SLASH_G_RE.search(text)
+            or self.SLASH_NUMBER_GARBAGE_RE.match(
+                text
+            )  # Check if text is mostly slash-number pattern
+        ):
+            return 0.0
+        penalty = 0.0
+        # Apply a penalty only if the fragmented words pattern occurs at least three times.
+        frag_matches = self.FRAG_RE.findall(text)
+        if len(frag_matches) >= 3:
+            penalty += 0.1 * len(frag_matches)
+        # Additional heuristic: if the average token length is below 2, add a penalty.
+        # tokens = text.split()
+        # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
+        #    penalty += 0.2
+        return max(1.0 - penalty, 0.0)

docling 2.32.0__tar.gz → 2.34.0__tar.gz

docling 2.32.0__tar.gz → 2.34.0__tar.gz