docling 2.57.0__py3-none-any.whl → 2.58.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release: this version of docling might be problematic.

@@ -10,13 +10,394 @@ from docling.datamodel.pipeline_options_asr_model import (
     # AsrResponseFormat,
     # ApiAsrOptions,
     InferenceAsrFramework,
+    InlineAsrMlxWhisperOptions,
    InlineAsrNativeWhisperOptions,
     TransformersModelType,
 )
 
 _log = logging.getLogger(__name__)
 
-WHISPER_TINY = InlineAsrNativeWhisperOptions(
+
+def _get_whisper_tiny_model():
+    """
+    Get the best Whisper Tiny model for the current hardware.
+
+    Automatically selects MLX Whisper Tiny for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Tiny.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-tiny-mlx",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="tiny",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_TINY = _get_whisper_tiny_model()
+
+
+def _get_whisper_small_model():
+    """
+    Get the best Whisper Small model for the current hardware.
+
+    Automatically selects MLX Whisper Small for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Small.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-small-mlx",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="small",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_SMALL = _get_whisper_small_model()
+
+
+def _get_whisper_medium_model():
+    """
+    Get the best Whisper Medium model for the current hardware.
+
+    Automatically selects MLX Whisper Medium for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Medium.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-medium-mlx-8bit",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="medium",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_MEDIUM = _get_whisper_medium_model()
+
+
+def _get_whisper_base_model():
+    """
+    Get the best Whisper Base model for the current hardware.
+
+    Automatically selects MLX Whisper Base for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Base.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-base-mlx",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="base",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_BASE = _get_whisper_base_model()
+
+
+def _get_whisper_large_model():
+    """
+    Get the best Whisper Large model for the current hardware.
+
+    Automatically selects MLX Whisper Large for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Large.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-large-mlx-8bit",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="large",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_LARGE = _get_whisper_large_model()
+
+
+def _get_whisper_turbo_model():
+    """
+    Get the best Whisper Turbo model for the current hardware.
+
+    Automatically selects MLX Whisper Turbo for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Turbo.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-turbo",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="turbo",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_TURBO = _get_whisper_turbo_model()
+
+# Explicit MLX Whisper model options for users who want to force MLX usage
+WHISPER_TINY_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-tiny-mlx",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_SMALL_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-small-mlx",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_MEDIUM_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-medium-mlx-8bit",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_BASE_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-base-mlx",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_LARGE_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-large-mlx-8bit",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_TURBO_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-turbo",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+# Explicit Native Whisper model options for users who want to force native usage
+WHISPER_TINY_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="tiny",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -27,7 +408,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_SMALL = InlineAsrNativeWhisperOptions(
+WHISPER_SMALL_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="small",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -38,7 +419,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
+WHISPER_MEDIUM_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="medium",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -49,7 +430,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_BASE = InlineAsrNativeWhisperOptions(
+WHISPER_BASE_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="base",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -60,7 +441,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_LARGE = InlineAsrNativeWhisperOptions(
+WHISPER_LARGE_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="large",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -71,7 +452,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_TURBO = InlineAsrNativeWhisperOptions(
+WHISPER_TURBO_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="turbo",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -82,11 +463,32 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
+# Note: The main WHISPER_* models (WHISPER_TURBO, WHISPER_BASE, etc.) automatically
+# select the best implementation (MLX on Apple Silicon, Native elsewhere).
+# Use the explicit _MLX or _NATIVE variants if you need to force a specific implementation.
+
 
 class AsrModelType(str, Enum):
+    # Auto-selecting models (choose best implementation for hardware)
     WHISPER_TINY = "whisper_tiny"
     WHISPER_SMALL = "whisper_small"
     WHISPER_MEDIUM = "whisper_medium"
     WHISPER_BASE = "whisper_base"
     WHISPER_LARGE = "whisper_large"
     WHISPER_TURBO = "whisper_turbo"
+
+    # Explicit MLX models (force MLX implementation)
+    WHISPER_TINY_MLX = "whisper_tiny_mlx"
+    WHISPER_SMALL_MLX = "whisper_small_mlx"
+    WHISPER_MEDIUM_MLX = "whisper_medium_mlx"
+    WHISPER_BASE_MLX = "whisper_base_mlx"
+    WHISPER_LARGE_MLX = "whisper_large_mlx"
+    WHISPER_TURBO_MLX = "whisper_turbo_mlx"
+
+    # Explicit Native models (force native implementation)
+    WHISPER_TINY_NATIVE = "whisper_tiny_native"
+    WHISPER_SMALL_NATIVE = "whisper_small_native"
+    WHISPER_MEDIUM_NATIVE = "whisper_medium_native"
+    WHISPER_BASE_NATIVE = "whisper_base_native"
+    WHISPER_LARGE_NATIVE = "whisper_large_native"
+    WHISPER_TURBO_NATIVE = "whisper_turbo_native"
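
The hunks above all belong to one module (its hunk headers import from docling.datamodel.pipeline_options_asr_model; the diff does not name the file). The practical effect: each WHISPER_* constant now resolves to the best implementation at import time, while the _MLX and _NATIVE variants pin one. A minimal sketch of the difference, assuming the module path docling.datamodel.asr_model_specs:

    # Sketch only: the constants and their fields come from the diff above;
    # the import path is an assumption, since the diff does not name the file.
    from docling.datamodel.asr_model_specs import (
        WHISPER_TURBO,         # auto: MLX on Apple Silicon + mlx-whisper, else native
        WHISPER_TURBO_MLX,     # pinned to MLX
        WHISPER_TURBO_NATIVE,  # pinned to native whisper
    )

    # Prints "InlineAsrMlxWhisperOptions" on an M-series Mac with mlx-whisper
    # installed, "InlineAsrNativeWhisperOptions" everywhere else.
    print(type(WHISPER_TURBO).__name__)
    print(WHISPER_TURBO_MLX.repo_id)     # "mlx-community/whisper-turbo"
    print(WHISPER_TURBO_NATIVE.repo_id)  # "turbo"

Note that the selection runs once at import time, so installing mlx-whisper mid-session does not change an already-imported constant.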
@@ -0,0 +1,82 @@
+from pathlib import PurePath
+from typing import Annotated, Literal, Optional, Union
+
+from pydantic import AnyUrl, BaseModel, Field, SecretStr
+
+
+class BaseBackendOptions(BaseModel):
+    """Common options for all declarative document backends."""
+
+    enable_remote_fetch: bool = Field(
+        False, description="Enable remote resource fetching."
+    )
+    enable_local_fetch: bool = Field(
+        False, description="Enable local resource fetching."
+    )
+
+
+class DeclarativeBackendOptions(BaseBackendOptions):
+    """Default backend options for a declarative document backend."""
+
+    kind: Literal["declarative"] = Field("declarative", exclude=True, repr=False)
+
+
+class HTMLBackendOptions(BaseBackendOptions):
+    """Options specific to the HTML backend.
+
+    This class can be extended to include options specific to HTML processing.
+    """
+
+    kind: Literal["html"] = Field("html", exclude=True, repr=False)
+    fetch_images: bool = Field(
+        False,
+        description=(
+            "Whether the backend should access remote or local resources to parse "
+            "images in an HTML document."
+        ),
+    )
+    source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
+        None,
+        description=(
+            "The URI that originates the HTML document. If provided, the backend "
+            "will use it to resolve relative paths in the HTML document."
+        ),
+    )
+
+
+class MarkdownBackendOptions(BaseBackendOptions):
+    """Options specific to the Markdown backend."""
+
+    kind: Literal["md"] = Field("md", exclude=True, repr=False)
+    fetch_images: bool = Field(
+        False,
+        description=(
+            "Whether the backend should access remote or local resources to parse "
+            "images in the markdown document."
+        ),
+    )
+    source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
+        None,
+        description=(
+            "The URI that originates the markdown document. If provided, the backend "
+            "will use it to resolve relative paths in the markdown document."
+        ),
+    )
+
+
+class PdfBackendOptions(BaseBackendOptions):
+    """Backend options for pdf document backends."""
+
+    kind: Literal["pdf"] = Field("pdf", exclude=True, repr=False)
+    password: Optional[SecretStr] = None
+
+
+BackendOptions = Annotated[
+    Union[
+        DeclarativeBackendOptions,
+        HTMLBackendOptions,
+        MarkdownBackendOptions,
+        PdfBackendOptions,
+    ],
+    Field(discriminator="kind"),
+]
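
The new file above is a self-contained pydantic v2 discriminated union: each subclass carries a kind literal that is used only for discrimination and excluded from serialization. A minimal sketch of how such a union validates, assuming the module lands at docling.datamodel.backend_options (the diff does not show the file path):

    # Sketch only: class and field names are from the diff above; the module
    # path is an assumption, since the diff does not name the new file.
    from pydantic import TypeAdapter

    from docling.datamodel.backend_options import (
        BackendOptions,
        HTMLBackendOptions,
    )

    # The "kind" key routes the payload to the matching subclass.
    adapter = TypeAdapter(BackendOptions)
    opts = adapter.validate_python(
        {"kind": "html", "fetch_images": True, "enable_remote_fetch": True}
    )
    assert isinstance(opts, HTMLBackendOptions)

    # "kind" is declared with exclude=True, so it disappears on dump.
    print(opts.model_dump())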
@@ -94,7 +94,7 @@ FormatToExtensions: dict[InputFormat, list[str]] = {
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.METS_GBS: ["tar.gz"],
     InputFormat.JSON_DOCLING: ["json"],
-    InputFormat.AUDIO: ["wav", "mp3"],
+    InputFormat.AUDIO: ["wav", "mp3", "m4a", "aac", "ogg", "flac", "mp4", "avi", "mov"],
     InputFormat.VTT: ["vtt"],
 }
 
@@ -128,7 +128,22 @@ FormatToMimeType: dict[InputFormat, list[str]] = {
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.METS_GBS: ["application/mets+xml"],
     InputFormat.JSON_DOCLING: ["application/json"],
-    InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
+    InputFormat.AUDIO: [
+        "audio/x-wav",
+        "audio/mpeg",
+        "audio/wav",
+        "audio/mp3",
+        "audio/mp4",
+        "audio/m4a",
+        "audio/aac",
+        "audio/ogg",
+        "audio/flac",
+        "audio/x-flac",
+        "video/mp4",
+        "video/avi",
+        "video/x-msvideo",
+        "video/quicktime",
+    ],
     InputFormat.VTT: ["text/vtt"],
 }
 
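
Together, these two hunks widen audio detection from wav/mp3 to common audio codecs and even video containers (mp4, avi, mov), both by extension and by MIME type. A quick check, assuming the tables still live in docling.datamodel.base_models as in earlier docling releases:

    # Sketch only: the table contents are from the diff above; the module
    # path (docling.datamodel.base_models) is assumed from prior releases.
    from docling.datamodel.base_models import (
        FormatToExtensions,
        FormatToMimeType,
        InputFormat,
    )

    assert "m4a" in FormatToExtensions[InputFormat.AUDIO]
    assert "mov" in FormatToExtensions[InputFormat.AUDIO]
    assert "video/quicktime" in FormatToMimeType[InputFormat.AUDIO]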