PyPI - docling - Versions diffs - 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl - Mend

docling 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

docling/backend/xml/__init__.py +0 -0
docling/backend/xml/uspto_backend.py +1888 -0
docling/cli/main.py +8 -0
docling/datamodel/base_models.py +18 -4
docling/datamodel/document.py +77 -13
docling/datamodel/pipeline_options.py +68 -4
docling/datamodel/settings.py +1 -0
docling/document_converter.py +11 -2
docling/models/ds_glm_model.py +34 -4
docling/models/easyocr_model.py +37 -3
docling/models/layout_model.py +144 -280
docling/models/page_assemble_model.py +11 -1
docling/models/rapid_ocr_model.py +24 -45
docling/models/table_structure_model.py +49 -33
docling/pipeline/base_pipeline.py +3 -1
docling/pipeline/standard_pdf_pipeline.py +7 -3
docling/utils/accelerator_utils.py +42 -0
docling/utils/glm_utils.py +11 -3
docling/utils/layout_postprocessor.py +666 -0
{docling-2.11.0.dist-info → docling-2.13.0.dist-info}/METADATA +3 -3
{docling-2.11.0.dist-info → docling-2.13.0.dist-info}/RECORD +24 -21
docling/utils/layout_utils.py +0 -812
{docling-2.11.0.dist-info → docling-2.13.0.dist-info}/LICENSE +0 -0
{docling-2.11.0.dist-info → docling-2.13.0.dist-info}/WHEEL +0 -0
{docling-2.11.0.dist-info → docling-2.13.0.dist-info}/entry_points.txt +0 -0

docling/cli/main.py CHANGED Viewed

@@ -26,6 +26,8 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
     EasyOcrOptions,
     OcrEngine,
     OcrMacOptions,
@@ -257,6 +259,10 @@ def convert(
             help="The timeout for processing each document, in seconds.",
         ),
     ] = None,
+    num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
+    device: Annotated[
+        AcceleratorDevice, typer.Option(..., help="Accelerator device")
+    ] = AcceleratorDevice.AUTO,
 ):
     if verbose == 0:
         logging.basicConfig(level=logging.WARNING)
@@ -336,7 +342,9 @@ def convert(
         if ocr_lang_list is not None:
             ocr_options.lang = ocr_lang_list
+        accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
         pipeline_options = PdfPipelineOptions(
+            accelerator_options=accelerator_options,
             do_ocr=ocr,
             ocr_options=ocr_options,
             do_table_structure=True,

docling/datamodel/base_models.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from enum import Enum, auto
+from enum import Enum
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 from docling_core.types.doc import (
@@ -28,6 +28,8 @@ class ConversionStatus(str, Enum):
 class InputFormat(str, Enum):
+    """A document format supported by document backend parsers."""
     DOCX = "docx"
     PPTX = "pptx"
     HTML = "html"
@@ -36,6 +38,7 @@ class InputFormat(str, Enum):
     ASCIIDOC = "asciidoc"
     MD = "md"
     XLSX = "xlsx"
+    XML_USPTO = "xml_uspto"
 class OutputFormat(str, Enum):
@@ -55,6 +58,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.XLSX: ["xlsx"],
+    InputFormat.XML_USPTO: ["xml", "txt"],
 }
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -81,10 +85,13 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     InputFormat.XLSX: [
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
+    InputFormat.XML_USPTO: ["application/xml", "text/plain"],
 }
-MimeTypeToFormat = {
-    mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
+MimeTypeToFormat: dict[str, list[InputFormat]] = {
+    mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
+    for value in FormatToMimeType.values()
+    for mime in value
 }
@@ -122,6 +129,7 @@ class Cluster(BaseModel):
     bbox: BoundingBox
     confidence: float = 1.0
     cells: List[Cell] = []
+    children: List["Cluster"] = []  # Add child cluster support
 class BasePageElement(BaseModel):
@@ -136,6 +144,12 @@ class LayoutPrediction(BaseModel):
     clusters: List[Cluster] = []
+class ContainerElement(
+    BasePageElement
+):  # Used for Form and Key-Value-Regions, only for typing.
+    pass
 class Table(BasePageElement):
     otsl_seq: List[str]
     num_rows: int = 0
@@ -175,7 +189,7 @@ class PagePredictions(BaseModel):
     equations_prediction: Optional[EquationPrediction] = None
-PageElement = Union[TextElement, Table, FigureElement]
+PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
 class AssembledUnit(BaseModel):

docling/datamodel/document.py CHANGED Viewed

@@ -3,7 +3,17 @@ import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
+from typing import (
+    TYPE_CHECKING,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Set,
+    Type,
+    Union,
+)
 import filetype
 from docling_core.types.doc import (
@@ -63,7 +73,7 @@ _log = logging.getLogger(__name__)
 layout_label_to_ds_type = {
     DocItemLabel.TITLE: "title",
-    DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
+    DocItemLabel.DOCUMENT_INDEX: "table",
     DocItemLabel.SECTION_HEADER: "subtitle-level-1",
     DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
     DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
@@ -78,6 +88,8 @@ layout_label_to_ds_type = {
     DocItemLabel.PICTURE: "figure",
     DocItemLabel.TEXT: "paragraph",
     DocItemLabel.PARAGRAPH: "paragraph",
+    DocItemLabel.FORM: DocItemLabel.FORM.value,
+    DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
 }
 _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
@@ -235,7 +247,7 @@ class _DocumentConversionInput(BaseModel):
             if isinstance(obj, Path):
                 yield InputDocument(
                     path_or_stream=obj,
-                    format=format,
+                    format=format,  # type: ignore[arg-type]
                     filename=obj.name,
                     limits=self.limits,
                     backend=backend,
@@ -243,7 +255,7 @@ class _DocumentConversionInput(BaseModel):
             elif isinstance(obj, DocumentStream):
                 yield InputDocument(
                     path_or_stream=obj.stream,
-                    format=format,
+                    format=format,  # type: ignore[arg-type]
                     filename=obj.name,
                     limits=self.limits,
                     backend=backend,
@@ -251,15 +263,15 @@ class _DocumentConversionInput(BaseModel):
             else:
                 raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
-    def _guess_format(self, obj: Union[Path, DocumentStream]):
+    def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
         content = b""  # empty binary blob
-        format = None
+        formats: list[InputFormat] = []
         if isinstance(obj, Path):
             mime = filetype.guess_mime(str(obj))
             if mime is None:
                 ext = obj.suffix[1:]
-                mime = self._mime_from_extension(ext)
+                mime = _DocumentConversionInput._mime_from_extension(ext)
             if mime is None:  # must guess from
                 with obj.open("rb") as f:
                     content = f.read(1024)  # Read first 1KB
@@ -274,15 +286,53 @@ class _DocumentConversionInput(BaseModel):
                     if ("." in obj.name and not obj.name.startswith("."))
                     else ""
                 )
-                mime = self._mime_from_extension(ext)
+                mime = _DocumentConversionInput._mime_from_extension(ext)
-        mime = mime or self._detect_html_xhtml(content)
+        mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
         mime = mime or "text/plain"
+        formats = MimeTypeToFormat.get(mime, [])
+        if formats:
+            # TODO: remove application/xml case after adding another XML parse
+            if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
+                return formats[0]
+            else:  # ambiguity in formats
+                return _DocumentConversionInput._guess_from_content(
+                    content, mime, formats
+                )
+        else:
+            return None
+    @staticmethod
+    def _guess_from_content(
+        content: bytes, mime: str, formats: list[InputFormat]
+    ) -> Optional[InputFormat]:
+        """Guess the input format of a document by checking part of its content."""
+        input_format: Optional[InputFormat] = None
+        content_str = content.decode("utf-8")
+        if mime == "application/xml":
+            match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
+            if match_doctype:
+                xml_doctype = match_doctype.group()
+                if InputFormat.XML_USPTO in formats and any(
+                    item in xml_doctype
+                    for item in (
+                        "us-patent-application-v4",
+                        "us-patent-grant-v4",
+                        "us-grant-025",
+                        "patent-application-publication",
+                    )
+                ):
+                    input_format = InputFormat.XML_USPTO
+        elif mime == "text/plain":
+            if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
+                input_format = InputFormat.XML_USPTO
-        format = MimeTypeToFormat.get(mime)
-        return format
+        return input_format
-    def _mime_from_extension(self, ext):
+    @staticmethod
+    def _mime_from_extension(ext):
         mime = None
         if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
             mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
@@ -293,7 +343,19 @@ class _DocumentConversionInput(BaseModel):
         return mime
-    def _detect_html_xhtml(self, content):
+    @staticmethod
+    def _detect_html_xhtml(
+        content: bytes,
+    ) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
+        """Guess the mime type of an XHTML, HTML, or XML file from its content.
+        Args:
+            content: A short piece of a document from its beginning.
+        Returns:
+            The mime type of an XHTML, HTML, or XML file, or None if the content does
+              not match any of these formats.
+        """
         content_str = content.decode("ascii", errors="ignore").lower()
         # Remove XML comments
         content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
@@ -302,6 +364,8 @@ class _DocumentConversionInput(BaseModel):
         if re.match(r"<\?xml", content_str):
             if "xhtml" in content_str[:1000]:
                 return "application/xhtml+xml"
+            else:
+                return "application/xml"
         if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
             return "text/html"

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -1,8 +1,66 @@
+import logging
+import os
+import warnings
 from enum import Enum
 from pathlib import Path
-from typing import List, Literal, Optional, Union
+from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic_settings import (
+    BaseSettings,
+    PydanticBaseSettingsSource,
+    SettingsConfigDict,
+)
+from typing_extensions import deprecated
+_log = logging.getLogger(__name__)
+class AcceleratorDevice(str, Enum):
+    """Devices to run model inference"""
+    AUTO = "auto"
+    CPU = "cpu"
+    CUDA = "cuda"
+    MPS = "mps"
+class AcceleratorOptions(BaseSettings):
+    model_config = SettingsConfigDict(
+        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
+    )
+    num_threads: int = 4
+    device: AcceleratorDevice = AcceleratorDevice.AUTO
+    @model_validator(mode="before")
+    @classmethod
+    def check_alternative_envvars(cls, data: Any) -> Any:
+        r"""
+        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
+        The alternative envvar is used only if it is valid and the regular envvar is not set.
+        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
+        the same functionality. In case the alias envvar is set and the user tries to override the
+        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
+        as an extra input instead of simply overwriting the evvar value for that parameter.
+        """
+        if isinstance(data, dict):
+            input_num_threads = data.get("num_threads")
+            # Check if to set the num_threads from the alternative envvar
+            if input_num_threads is None:
+                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
+                omp_num_threads = os.getenv("OMP_NUM_THREADS")
+                if docling_num_threads is None and omp_num_threads is not None:
+                    try:
+                        data["num_threads"] = int(omp_num_threads)
+                    except ValueError:
+                        _log.error(
+                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
+                            omp_num_threads,
+                        )
+        return data
 class TableFormerMode(str, Enum):
@@ -78,9 +136,14 @@ class EasyOcrOptions(OcrOptions):
     kind: Literal["easyocr"] = "easyocr"
     lang: List[str] = ["fr", "de", "es", "en"]
-    use_gpu: bool = True  # same default as easyocr.Reader
+    use_gpu: Optional[bool] = None
+    confidence_threshold: float = 0.65
     model_storage_directory: Optional[str] = None
-    download_enabled: bool = True  # same default as easyocr.Reader
+    recog_network: Optional[str] = "standard"
+    download_enabled: bool = True
     model_config = ConfigDict(
         extra="forbid",
@@ -153,6 +216,7 @@ class PipelineOptions(BaseModel):
         True  # This default will be set to False on a future version of docling
     )
     document_timeout: Optional[float] = None
+    accelerator_options: AcceleratorOptions = AcceleratorOptions()
 class PdfPipelineOptions(PipelineOptions):

docling/datamodel/settings.py CHANGED Viewed

@@ -31,6 +31,7 @@ class DebugSettings(BaseModel):
     visualize_cells: bool = False
     visualize_ocr: bool = False
     visualize_layout: bool = False
+    visualize_raw_layout: bool = False
     visualize_tables: bool = False
     profile_pipeline_timings: bool = False

docling/document_converter.py CHANGED Viewed

@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
     DoclingComponentType,
@@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
-class PdfFormatOption(FormatOption):
+class PatentUsptoFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
+class ImageFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
     backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
-class ImageFormatOption(FormatOption):
+class PdfFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
     backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
@@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.HTML: FormatOption(
             pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
         ),
+        InputFormat.XML_USPTO: FormatOption(
+            pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),

docling/models/ds_glm_model.py CHANGED Viewed

@@ -22,9 +22,15 @@ from docling_core.types.legacy_doc.document import (
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from PIL import ImageDraw
-from pydantic import BaseModel, ConfigDict
-from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
+from pydantic import BaseModel, ConfigDict, TypeAdapter
+from docling.datamodel.base_models import (
+    Cluster,
+    ContainerElement,
+    FigureElement,
+    Table,
+    TextElement,
+)
 from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
 from docling.datamodel.settings import settings
 from docling.utils.glm_utils import to_docling_document
@@ -204,7 +210,31 @@ class GlmModel:
                             )
                         ],
                         obj_type=layout_label_to_ds_type.get(element.label),
-                        # data=[[]],
+                        payload={
+                            "children": TypeAdapter(List[Cluster]).dump_python(
+                                element.cluster.children
+                            )
+                        },  # hack to channel child clusters through GLM
+                    )
+                )
+            elif isinstance(element, ContainerElement):
+                main_text.append(
+                    BaseText(
+                        text="",
+                        payload={
+                            "children": TypeAdapter(List[Cluster]).dump_python(
+                                element.cluster.children
+                            )
+                        },  # hack to channel child clusters through GLM
+                        obj_type=layout_label_to_ds_type.get(element.label),
+                        name=element.label,
+                        prov=[
+                            Prov(
+                                bbox=target_bbox,
+                                page=element.page_no + 1,
+                                span=[0, 0],
+                            )
+                        ],
                     )
                 )

docling/models/easyocr_model.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import logging
+import warnings
 from typing import Iterable
 import numpy
@@ -7,16 +8,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import EasyOcrOptions
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    EasyOcrOptions,
+)
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
 class EasyOcrModel(BaseOcrModel):
-    def __init__(self, enabled: bool, options: EasyOcrOptions):
+    def __init__(
+        self,
+        enabled: bool,
+        options: EasyOcrOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
         super().__init__(enabled=enabled, options=options)
         self.options: EasyOcrOptions
@@ -31,11 +42,33 @@ class EasyOcrModel(BaseOcrModel):
                     "Alternatively, Docling has support for other OCR engines. See the documentation."
                 )
+            if self.options.use_gpu is None:
+                device = decide_device(accelerator_options.device)
+                # Enable easyocr GPU if running on CUDA, MPS
+                use_gpu = any(
+                    [
+                        device.startswith(x)
+                        for x in [
+                            AcceleratorDevice.CUDA.value,
+                            AcceleratorDevice.MPS.value,
+                        ]
+                    ]
+                )
+            else:
+                warnings.warn(
+                    "Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
+                    "When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
+                    "to run EasyOCR. Otherwise, EasyOCR runs in CPU."
+                )
+                use_gpu = self.options.use_gpu
             self.reader = easyocr.Reader(
                 lang_list=self.options.lang,
-                gpu=self.options.use_gpu,
+                gpu=use_gpu,
                 model_storage_directory=self.options.model_storage_directory,
+                recog_network=self.options.recog_network,
                 download_enabled=self.options.download_enabled,
+                verbose=False,
             )
     def __call__(
@@ -85,6 +118,7 @@ class EasyOcrModel(BaseOcrModel):
                                 ),
                             )
                             for ix, line in enumerate(result)
+                            if line[2] >= self.options.confidence_threshold
                         ]
                         all_ocr_cells.extend(cells)

docling 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl

docling 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl