docling 2.56.1__py3-none-any.whl → 2.58.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling might be problematic.
- docling/backend/abstract_backend.py +24 -3
- docling/backend/asciidoc_backend.py +3 -3
- docling/backend/docling_parse_v4_backend.py +15 -4
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/html_backend.py +130 -20
- docling/backend/md_backend.py +27 -5
- docling/backend/msexcel_backend.py +115 -27
- docling/backend/mspowerpoint_backend.py +2 -2
- docling/backend/msword_backend.py +104 -29
- docling/backend/pdf_backend.py +9 -2
- docling/backend/pypdfium2_backend.py +12 -3
- docling/cli/main.py +85 -30
- docling/datamodel/asr_model_specs.py +408 -6
- docling/datamodel/backend_options.py +82 -0
- docling/datamodel/base_models.py +17 -2
- docling/datamodel/document.py +81 -48
- docling/datamodel/pipeline_options_asr_model.py +21 -1
- docling/document_converter.py +37 -45
- docling/document_extractor.py +12 -11
- docling/models/readingorder_model.py +6 -7
- docling/pipeline/asr_pipeline.py +139 -3
- docling/pipeline/vlm_pipeline.py +53 -33
- docling/utils/api_image_request.py +4 -4
- docling/utils/layout_postprocessor.py +23 -24
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/METADATA +4 -2
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/RECORD +30 -28
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/WHEEL +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/entry_points.txt +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/top_level.txt +0 -0
docling/datamodel/backend_options.py
ADDED
@@ -0,0 +1,82 @@
+from pathlib import PurePath
+from typing import Annotated, Literal, Optional, Union
+
+from pydantic import AnyUrl, BaseModel, Field, SecretStr
+
+
+class BaseBackendOptions(BaseModel):
+    """Common options for all declarative document backends."""
+
+    enable_remote_fetch: bool = Field(
+        False, description="Enable remote resource fetching."
+    )
+    enable_local_fetch: bool = Field(
+        False, description="Enable local resource fetching."
+    )
+
+
+class DeclarativeBackendOptions(BaseBackendOptions):
+    """Default backend options for a declarative document backend."""
+
+    kind: Literal["declarative"] = Field("declarative", exclude=True, repr=False)
+
+
+class HTMLBackendOptions(BaseBackendOptions):
+    """Options specific to the HTML backend.
+
+    This class can be extended to include options specific to HTML processing.
+    """
+
+    kind: Literal["html"] = Field("html", exclude=True, repr=False)
+    fetch_images: bool = Field(
+        False,
+        description=(
+            "Whether the backend should access remote or local resources to parse "
+            "images in an HTML document."
+        ),
+    )
+    source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
+        None,
+        description=(
+            "The URI that originates the HTML document. If provided, the backend "
+            "will use it to resolve relative paths in the HTML document."
+        ),
+    )
+
+
+class MarkdownBackendOptions(BaseBackendOptions):
+    """Options specific to the Markdown backend."""
+
+    kind: Literal["md"] = Field("md", exclude=True, repr=False)
+    fetch_images: bool = Field(
+        False,
+        description=(
+            "Whether the backend should access remote or local resources to parse "
+            "images in the markdown document."
+        ),
+    )
+    source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
+        None,
+        description=(
+            "The URI that originates the markdown document. If provided, the backend "
+            "will use it to resolve relative paths in the markdown document."
+        ),
+    )
+
+
+class PdfBackendOptions(BaseBackendOptions):
+    """Backend options for pdf document backends."""
+
+    kind: Literal["pdf"] = Field("pdf", exclude=True, repr=False)
+    password: Optional[SecretStr] = None
+
+
+BackendOptions = Annotated[
+    Union[
+        DeclarativeBackendOptions,
+        HTMLBackendOptions,
+        MarkdownBackendOptions,
+        PdfBackendOptions,
+    ],
+    Field(discriminator="kind"),
+]
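The new module introduces per-backend option models joined in a BackendOptions union discriminated on the excluded kind field. A minimal sketch of using these models directly, assuming nothing beyond the file added above (pydantic v2 API):

from docling.datamodel.backend_options import HTMLBackendOptions, PdfBackendOptions

# The discriminator "kind" is fixed per class and excluded from dumps and reprs.
html_opts = HTMLBackendOptions(fetch_images=True, enable_remote_fetch=True)
print(html_opts.kind)          # "html"
print(html_opts.model_dump())  # no "kind" key; only fetch flags and source_uri

# The PDF password is stored as a SecretStr, so it is masked when printed or logged.
pdf_opts = PdfBackendOptions(password="my-pdf-password")
print(pdf_opts.password)       # **********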
docling/datamodel/base_models.py
CHANGED
@@ -94,7 +94,7 @@ FormatToExtensions: dict[InputFormat, list[str]] = {
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.METS_GBS: ["tar.gz"],
     InputFormat.JSON_DOCLING: ["json"],
-    InputFormat.AUDIO: ["wav", "mp3"],
+    InputFormat.AUDIO: ["wav", "mp3", "m4a", "aac", "ogg", "flac", "mp4", "avi", "mov"],
     InputFormat.VTT: ["vtt"],
 }

@@ -128,7 +128,22 @@ FormatToMimeType: dict[InputFormat, list[str]] = {
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.METS_GBS: ["application/mets+xml"],
     InputFormat.JSON_DOCLING: ["application/json"],
-    InputFormat.AUDIO: [
+    InputFormat.AUDIO: [
+        "audio/x-wav",
+        "audio/mpeg",
+        "audio/wav",
+        "audio/mp3",
+        "audio/mp4",
+        "audio/m4a",
+        "audio/aac",
+        "audio/ogg",
+        "audio/flac",
+        "audio/x-flac",
+        "video/mp4",
+        "video/avi",
+        "video/x-msvideo",
+        "video/quicktime",
+    ],
     InputFormat.VTT: ["text/vtt"],
 }

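The audio entry now covers common compressed audio formats and video containers. These two maps are what docling consults when guessing an input format from a filename or MIME type; a small illustrative lookup (the helper below is not a docling function):

from typing import Optional

from docling.datamodel.base_models import FormatToExtensions, InputFormat


def guess_format_from_extension(filename: str) -> Optional[InputFormat]:
    # Illustrative only: map a file extension back to an InputFormat.
    ext = filename.rsplit(".", 1)[-1].lower()
    for fmt, extensions in FormatToExtensions.items():
        if ext in extensions:
            return fmt
    return None


assert guess_format_from_extension("meeting.m4a") == InputFormat.AUDIO  # accepted as of 2.58.0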
docling/datamodel/document.py
CHANGED
@@ -8,14 +8,12 @@ from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
     TYPE_CHECKING,
-
-    Dict,
-    List,
+    Annotated,
     Literal,
     Optional,
-    Set,
     Type,
     Union,
+    cast,
 )

 import filetype

@@ -54,8 +52,10 @@ from typing_extensions import deprecated

 from docling.backend.abstract_backend import (
     AbstractDocumentBackend,
+    DeclarativeDocumentBackend,
     PaginatedDocumentBackend,
 )
+from docling.datamodel.backend_options import BackendOptions
 from docling.datamodel.base_models import (
     AssembledUnit,
     ConfidenceReport,

@@ -74,6 +74,7 @@ from docling.utils.utils import create_file_hash

 if TYPE_CHECKING:
     from docling.datamodel.base_models import BaseFormatOption
+    from docling.document_converter import FormatOption

 _log = logging.getLogger(__name__)

@@ -102,29 +103,46 @@ _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")


 class InputDocument(BaseModel):
-
-
-
-
-
-
-
-
-
-
+    """A document as an input of a Docling conversion."""
+
+    file: Annotated[
+        PurePath, Field(description="A path representation the input document.")
+    ]
+    document_hash: Annotated[
+        str,
+        Field(description="A stable hash of the path or stream of the input document."),
+    ]
+    valid: bool = Field(True, description="Whether this is is a valid input document.")
+    backend_options: Optional[BackendOptions] = Field(
+        None, description="Custom options for backends."
+    )
+    limits: DocumentLimits = Field(
+        DocumentLimits(), description="Limits in the input document for the conversion."
+    )
+    format: Annotated[InputFormat, Field(description="The document format.")]
+
+    filesize: Optional[int] = Field(
+        None, description="Size of the input file, in bytes."
+    )
+    page_count: int = Field(0, description="Number of pages in the input document.")
+
+    _backend: AbstractDocumentBackend

     def __init__(
         self,
         path_or_stream: Union[BytesIO, Path],
         format: InputFormat,
         backend: Type[AbstractDocumentBackend],
+        backend_options: Optional[BackendOptions] = None,
         filename: Optional[str] = None,
         limits: Optional[DocumentLimits] = None,
-    ):
+    ) -> None:
         super().__init__(
-            file="",
+            file="",
+            document_hash="",
+            format=InputFormat.PDF,
+            backend_options=backend_options,
         )  # initialize with dummy values
-
         self.limits = limits or DocumentLimits()
         self.format = format

@@ -140,7 +158,8 @@ class InputDocument(BaseModel):

         elif isinstance(path_or_stream, BytesIO):
             assert filename is not None, (
-                "Can't construct InputDocument from stream without providing
+                "Can't construct InputDocument from stream without providing "
+                "filename arg."
             )
             self.file = PurePath(filename)
             self.filesize = path_or_stream.getbuffer().nbytes

@@ -175,7 +194,8 @@ class InputDocument(BaseModel):
         except RuntimeError as e:
             self.valid = False
             _log.exception(
-
+                "An unexpected error occurred while opening the document "
+                "f{self.file.name}",
                 exc_info=e,
             )
             # raise

@@ -185,7 +205,15 @@ class InputDocument(BaseModel):
         backend: Type[AbstractDocumentBackend],
         path_or_stream: Union[BytesIO, Path],
     ) -> None:
-        self.
+        if self.backend_options:
+            self._backend = backend(
+                self,
+                path_or_stream=path_or_stream,
+                options=self.backend_options,
+            )
+        else:
+            self._backend = backend(self, path_or_stream=path_or_stream)
+
         if not self._backend.is_valid():
             self.valid = False

@@ -199,11 +227,11 @@ class ConversionResult(BaseModel):
     input: InputDocument

     status: ConversionStatus = ConversionStatus.PENDING  # failure, success
-    errors:
+    errors: list[ErrorItem] = []  # structure to keep errors

-    pages:
+    pages: list[Page] = []
     assembled: AssembledUnit = AssembledUnit()
-    timings:
+    timings: dict[str, ProfilingItem] = {}
     confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)

     document: DoclingDocument = _EMPTY_DOCLING_DOC

@@ -222,7 +250,7 @@ class _DummyBackend(AbstractDocumentBackend):
         return False

     @classmethod
-    def supported_formats(cls) ->
+    def supported_formats(cls) -> set[InputFormat]:
         return set()

     @classmethod

@@ -235,7 +263,7 @@ class _DummyBackend(AbstractDocumentBackend):

 class _DocumentConversionInput(BaseModel):
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
-    headers: Optional[
+    headers: Optional[dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()

     def docs(

@@ -250,33 +278,36 @@ class _DocumentConversionInput(BaseModel):
             )
             format = self._guess_format(obj)
             backend: Type[AbstractDocumentBackend]
-
+            backend_options: Optional[BackendOptions] = None
+            if not format or format not in format_options:
                 _log.error(
-                    f"Input document {obj.name} with format {format} does not match
+                    f"Input document {obj.name} with format {format} does not match "
+                    f"any allowed format: ({format_options.keys()})"
                 )
                 backend = _DummyBackend
             else:
-
+                options = format_options[format]
+                backend = options.backend
+                if "backend_options" in options.model_fields_set:
+                    backend_options = cast("FormatOption", options).backend_options

+            path_or_stream: Union[BytesIO, Path]
             if isinstance(obj, Path):
-
-                path_or_stream=obj,
-                format=format,  # type: ignore[arg-type]
-                filename=obj.name,
-                limits=self.limits,
-                backend=backend,
-                )
+                path_or_stream = obj
             elif isinstance(obj, DocumentStream):
-
-                path_or_stream=obj.stream,
-                format=format,  # type: ignore[arg-type]
-                filename=obj.name,
-                limits=self.limits,
-                backend=backend,
-                )
+                path_or_stream = obj.stream
             else:
                 raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

+            yield InputDocument(
+                path_or_stream=path_or_stream,
+                format=format,  # type: ignore[arg-type]
+                filename=obj.name,
+                limits=self.limits,
+                backend=backend,
+                backend_options=backend_options,
+            )
+
     def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
         content = b""  # empty binary blob
         formats: list[InputFormat] = []

@@ -290,12 +321,13 @@ class _DocumentConversionInput(BaseModel):
             with obj.open("rb") as f:
                 content = f.read(1024)  # Read first 1KB
             if mime is not None and mime.lower() == "application/zip":
+                mime_root = "application/vnd.openxmlformats-officedocument"
                 if obj.suffixes[-1].lower() == ".xlsx":
-                    mime = "
+                    mime = mime_root + ".spreadsheetml.sheet"
                 elif obj.suffixes[-1].lower() == ".docx":
-                    mime = "
+                    mime = mime_root + ".wordprocessingml.document"
                 elif obj.suffixes[-1].lower() == ".pptx":
-                    mime = "
+                    mime = mime_root + ".presentationml.presentation"

         elif isinstance(obj, DocumentStream):
             content = obj.stream.read(8192)

@@ -310,12 +342,13 @@ class _DocumentConversionInput(BaseModel):
             mime = _DocumentConversionInput._mime_from_extension(ext.lower())
             if mime is not None and mime.lower() == "application/zip":
                 objname = obj.name.lower()
+                mime_root = "application/vnd.openxmlformats-officedocument"
                 if objname.endswith(".xlsx"):
-                    mime = "
+                    mime = mime_root + ".spreadsheetml.sheet"
                 elif objname.endswith(".docx"):
-                    mime = "
+                    mime = mime_root + ".wordprocessingml.document"
                 elif objname.endswith(".pptx"):
-                    mime = "
+                    mime = mime_root + ".presentationml.presentation"

         if mime is not None and mime.lower() == "application/gzip":
             if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
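InputDocument is normally built internally by the converter, but its constructor signature is fully visible above. A sketch of constructing one directly with the new backend_options argument; treat it as illustrative only, since it assumes the HTML backend in this release accepts the options keyword that _init_doc forwards:

from io import BytesIO

from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.backend_options import HTMLBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

in_doc = InputDocument(
    path_or_stream=BytesIO(b"<html><body><p>hello</p></body></html>"),
    format=InputFormat.HTML,
    backend=HTMLDocumentBackend,
    backend_options=HTMLBackendOptions(fetch_images=False),
    filename="hello.html",
)
print(in_doc.valid)          # True if the backend could open the stream
print(in_doc.document_hash)  # stable hash derived from the stream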
docling/datamodel/pipeline_options_asr_model.py
CHANGED
@@ -17,7 +17,7 @@ class BaseAsrOptions(BaseModel):


 class InferenceAsrFramework(str, Enum):
-
+    MLX = "mlx"
     # TRANSFORMERS = "transformers" # disabled for now
     WHISPER = "whisper"


@@ -55,3 +55,23 @@ class InlineAsrNativeWhisperOptions(InlineAsrOptions):
         AcceleratorDevice.CUDA,
     ]
     word_timestamps: bool = True
+
+
+class InlineAsrMlxWhisperOptions(InlineAsrOptions):
+    """
+    MLX Whisper options for Apple Silicon optimization.
+
+    Uses mlx-whisper library for efficient inference on Apple Silicon devices.
+    """
+
+    inference_framework: InferenceAsrFramework = InferenceAsrFramework.MLX
+
+    language: str = "en"
+    task: str = "transcribe"  # "transcribe" or "translate"
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.MPS,  # MLX is optimized for Apple Silicon
+    ]
+    word_timestamps: bool = True
+    no_speech_threshold: float = 0.6  # Threshold for detecting speech
+    logprob_threshold: float = -1.0  # Log probability threshold
+    compression_ratio_threshold: float = 2.4  # Compression ratio threshold
docling/document_converter.py
CHANGED
@@ -9,11 +9,14 @@ from datetime import datetime
 from functools import partial
 from io import BytesIO
 from pathlib import Path
-from typing import
+from typing import Optional, Type, Union

-from pydantic import
+from pydantic import ConfigDict, model_validator, validate_call
+from typing_extensions import Self

-from docling.backend.abstract_backend import
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+)
 from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.csv_backend import CsvDocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend

@@ -28,6 +31,12 @@ from docling.backend.noop_backend import NoOpBackend
 from docling.backend.webvtt_backend import WebVTTDocumentBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
+from docling.datamodel.backend_options import (
+    BackendOptions,
+    HTMLBackendOptions,
+    MarkdownBackendOptions,
+    PdfBackendOptions,
+)
 from docling.datamodel.base_models import (
     BaseFormatOption,
     ConversionStatus,

@@ -61,11 +70,13 @@ _PIPELINE_CACHE_LOCK = threading.Lock()

 class FormatOption(BaseFormatOption):
     pipeline_cls: Type[BasePipeline]
+    backend_options: Optional[BackendOptions] = None

     @model_validator(mode="after")
-    def set_optional_field_default(self) ->
+    def set_optional_field_default(self) -> Self:
         if self.pipeline_options is None:
             self.pipeline_options = self.pipeline_cls.get_default_options()
+
         return self


@@ -92,6 +103,7 @@ class PowerpointFormatOption(FormatOption):
 class MarkdownFormatOption(FormatOption):
     pipeline_cls: Type = SimplePipeline
     backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
+    backend_options: Optional[MarkdownBackendOptions] = None


 class AsciiDocFormatOption(FormatOption):

@@ -102,6 +114,7 @@ class AsciiDocFormatOption(FormatOption):
 class HTMLFormatOption(FormatOption):
     pipeline_cls: Type = SimplePipeline
     backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
+    backend_options: Optional[HTMLBackendOptions] = None


 class PatentUsptoFormatOption(FormatOption):

@@ -122,6 +135,7 @@ class ImageFormatOption(FormatOption):
 class PdfFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
     backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
+    backend_options: Optional[PdfBackendOptions] = None


 class AudioFormatOption(FormatOption):

@@ -131,46 +145,24 @@ class AudioFormatOption(FormatOption):

 def _get_default_option(format: InputFormat) -> FormatOption:
     format_to_default_options = {
-        InputFormat.CSV:
-
-        ),
-        InputFormat.
-
-        ),
-        InputFormat.
-
-        ),
-        InputFormat.PPTX: FormatOption(
-            pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
-        ),
-        InputFormat.MD: FormatOption(
-            pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
-        ),
-        InputFormat.ASCIIDOC: FormatOption(
-            pipeline_cls=SimplePipeline, backend=AsciiDocBackend
-        ),
-        InputFormat.HTML: FormatOption(
-            pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
-        ),
-        InputFormat.XML_USPTO: FormatOption(
-            pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
-        ),
-        InputFormat.XML_JATS: FormatOption(
-            pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
-        ),
+        InputFormat.CSV: CsvFormatOption(),
+        InputFormat.XLSX: ExcelFormatOption(),
+        InputFormat.DOCX: WordFormatOption(),
+        InputFormat.PPTX: PowerpointFormatOption(),
+        InputFormat.MD: MarkdownFormatOption(),
+        InputFormat.ASCIIDOC: AsciiDocFormatOption(),
+        InputFormat.HTML: HTMLFormatOption(),
+        InputFormat.XML_USPTO: PatentUsptoFormatOption(),
+        InputFormat.XML_JATS: XMLJatsFormatOption(),
         InputFormat.METS_GBS: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
         ),
-        InputFormat.IMAGE:
-
-        ),
-        InputFormat.PDF: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
-        ),
+        InputFormat.IMAGE: ImageFormatOption(),
+        InputFormat.PDF: PdfFormatOption(),
         InputFormat.JSON_DOCLING: FormatOption(
             pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
         ),
-        InputFormat.AUDIO:
+        InputFormat.AUDIO: AudioFormatOption(),
         InputFormat.VTT: FormatOption(
             pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
         ),

@@ -186,13 +178,13 @@ class DocumentConverter:

     def __init__(
         self,
-        allowed_formats: Optional[
-        format_options: Optional[
+        allowed_formats: Optional[list[InputFormat]] = None,
+        format_options: Optional[dict[InputFormat, FormatOption]] = None,
     ):
         self.allowed_formats = (
             allowed_formats if allowed_formats is not None else list(InputFormat)
         )
-        self.format_to_options:
+        self.format_to_options: dict[InputFormat, FormatOption] = {
             format: (
                 _get_default_option(format=format)
                 if (custom_option := (format_options or {}).get(format)) is None

@@ -200,8 +192,8 @@ class DocumentConverter:
             )
             for format in self.allowed_formats
         }
-        self.initialized_pipelines:
-
+        self.initialized_pipelines: dict[
+            tuple[Type[BasePipeline], str], BasePipeline
         ] = {}

     def _get_initialized_pipelines(

@@ -228,7 +220,7 @@ class DocumentConverter:
     def convert(
         self,
         source: Union[Path, str, DocumentStream],  # TODO review naming
-        headers: Optional[
+        headers: Optional[dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,

@@ -248,7 +240,7 @@ class DocumentConverter:
     def convert_all(
         self,
         source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
-        headers: Optional[
+        headers: Optional[dict[str, str]] = None,
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
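With the format options above, backend-specific settings can now be passed through the converter. A short sketch, assuming the PDF backend honors the new password option and the HTML backend the fetch_images flag (plausible given this release's backend changes, but not demonstrated in this diff):

from docling.datamodel.backend_options import HTMLBackendOptions, PdfBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    HTMLFormatOption,
    PdfFormatOption,
)

converter = DocumentConverter(
    format_options={
        # Open password-protected PDFs; the password is stored as a SecretStr.
        InputFormat.PDF: PdfFormatOption(
            backend_options=PdfBackendOptions(password="my-pdf-password")
        ),
        # Allow the HTML backend to fetch referenced images from remote URLs.
        InputFormat.HTML: HTMLFormatOption(
            backend_options=HTMLBackendOptions(
                fetch_images=True, enable_remote_fetch=True
            )
        ),
    }
)

result = converter.convert("protected.pdf")
print(result.status)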
docling/document_extractor.py
CHANGED
@@ -8,9 +8,10 @@ from collections.abc import Iterable, Iterator
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from pathlib import Path
-from typing import
+from typing import Optional, Type, Union

 from pydantic import ConfigDict, model_validator, validate_call
+from typing_extensions import Self

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

@@ -56,7 +57,7 @@ class ExtractionFormatOption(BaseFormatOption):
     pipeline_cls: Type[BaseExtractionPipeline]

     @model_validator(mode="after")
-    def set_optional_field_default(self) ->
+    def set_optional_field_default(self) -> Self:
         if self.pipeline_options is None:
             # `get_default_options` comes from BaseExtractionPipeline
             self.pipeline_options = self.pipeline_cls.get_default_options()  # type: ignore[assignment]

@@ -70,7 +71,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
     the VLM extractor. This duplication will be removed when we deduplicate
     the format registry between convert/extract.
     """
-    format_to_default_backend:
+    format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
         InputFormat.IMAGE: PyPdfiumDocumentBackend,
         InputFormat.PDF: PyPdfiumDocumentBackend,
     }

@@ -98,24 +99,24 @@ class DocumentExtractor:

     def __init__(
         self,
-        allowed_formats: Optional[
+        allowed_formats: Optional[list[InputFormat]] = None,
         extraction_format_options: Optional[
-
+            dict[InputFormat, ExtractionFormatOption]
         ] = None,
     ) -> None:
-        self.allowed_formats:
+        self.allowed_formats: list[InputFormat] = (
             allowed_formats if allowed_formats is not None else list(InputFormat)
         )
         # Build per-format options with defaults, then apply any user overrides
         overrides = extraction_format_options or {}
-        self.extraction_format_to_options:
+        self.extraction_format_to_options: dict[InputFormat, ExtractionFormatOption] = {
             fmt: overrides.get(fmt, _get_default_extraction_option(fmt))
             for fmt in self.allowed_formats
         }

         # Cache pipelines by (class, options-hash)
-        self._initialized_pipelines:
-
+        self._initialized_pipelines: dict[
+            tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
         ] = {}

    # ---------------------------- Public API ---------------------------------

@@ -125,7 +126,7 @@ class DocumentExtractor:
         self,
         source: Union[Path, str, DocumentStream],
         template: ExtractionTemplateType,
-        headers: Optional[
+        headers: Optional[dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,

@@ -147,7 +148,7 @@ class DocumentExtractor:
         self,
         source: Iterable[Union[Path, str, DocumentStream]],
         template: ExtractionTemplateType,
-        headers: Optional[
+        headers: Optional[dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
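The extractor's public surface (constructor plus extract/extract_all taking a template) is visible above. A minimal usage sketch, assuming a pydantic model class is an acceptable ExtractionTemplateType; the Invoice template and its fields are made up for illustration:

from pydantic import BaseModel

from docling.datamodel.base_models import InputFormat
from docling.document_extractor import DocumentExtractor


class Invoice(BaseModel):
    # Hypothetical extraction template; the fields are illustrative only.
    invoice_number: str
    total_amount: float


extractor = DocumentExtractor(allowed_formats=[InputFormat.PDF, InputFormat.IMAGE])
result = extractor.extract(source="invoice.pdf", template=Invoice)
print(result)  # inspect the extraction result for the populated template fields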
docling/models/readingorder_model.py
CHANGED
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import Dict, List

 from docling_core.types.doc import (
     DocItemLabel,

@@ -48,8 +47,8 @@ class ReadingOrderModel:

     def _assembled_to_readingorder_elements(
         self, conv_res: ConversionResult
-    ) ->
-        elements:
+    ) -> list[ReadingOrderPageElement]:
+        elements: list[ReadingOrderPageElement] = []
         page_no_to_pages = {p.page_no: p for p in conv_res.pages}

         for element in conv_res.assembled.elements:

@@ -123,10 +122,10 @@ class ReadingOrderModel:
     def _readingorder_elements_to_docling_doc(
         self,
         conv_res: ConversionResult,
-        ro_elements:
-        el_to_captions_mapping:
-        el_to_footnotes_mapping:
-        el_merges_mapping:
+        ro_elements: list[ReadingOrderPageElement],
+        el_to_captions_mapping: dict[int, list[int]],
+        el_to_footnotes_mapping: dict[int, list[int]],
+        el_merges_mapping: dict[int, list[int]],
     ) -> DoclingDocument:
         id_to_elem = {
             RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem