docling 2.57.0__py3-none-any.whl → 2.59.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling might be problematic.
- docling/backend/abstract_backend.py +24 -3
- docling/backend/asciidoc_backend.py +3 -3
- docling/backend/docling_parse_v4_backend.py +15 -4
- docling/backend/html_backend.py +130 -20
- docling/backend/md_backend.py +27 -5
- docling/backend/msexcel_backend.py +121 -29
- docling/backend/mspowerpoint_backend.py +2 -2
- docling/backend/msword_backend.py +18 -18
- docling/backend/pdf_backend.py +9 -2
- docling/backend/pypdfium2_backend.py +12 -3
- docling/cli/main.py +104 -38
- docling/datamodel/asr_model_specs.py +408 -6
- docling/datamodel/backend_options.py +82 -0
- docling/datamodel/base_models.py +19 -2
- docling/datamodel/document.py +81 -48
- docling/datamodel/pipeline_options_asr_model.py +21 -1
- docling/datamodel/pipeline_options_vlm_model.py +1 -0
- docling/document_converter.py +37 -45
- docling/document_extractor.py +12 -11
- docling/models/api_vlm_model.py +5 -3
- docling/models/picture_description_vlm_model.py +5 -1
- docling/models/readingorder_model.py +6 -7
- docling/models/vlm_models_inline/hf_transformers_model.py +13 -3
- docling/models/vlm_models_inline/mlx_model.py +9 -3
- docling/models/vlm_models_inline/nuextract_transformers_model.py +13 -3
- docling/models/vlm_models_inline/vllm_model.py +42 -8
- docling/pipeline/asr_pipeline.py +149 -6
- docling/utils/api_image_request.py +20 -9
- docling/utils/layout_postprocessor.py +23 -24
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/METADATA +11 -8
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/RECORD +35 -34
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/WHEEL +0 -0
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/entry_points.txt +0 -0
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/top_level.txt +0 -0
docling/models/api_vlm_model.py
CHANGED
@@ -73,7 +73,7 @@ class ApiVlmModel(BasePageModel):
                 # Skip non-GenerationStopper criteria (should have been caught in validation)

                 # Streaming path with early abort support
-                page_tags = api_image_request_streaming(
+                page_tags, num_tokens = api_image_request_streaming(
                     image=hi_res_image,
                     prompt=prompt,
                     url=self.vlm_options.url,
@@ -84,7 +84,7 @@ class ApiVlmModel(BasePageModel):
                 )
             else:
                 # Non-streaming fallback (existing behavior)
-                page_tags = api_image_request(
+                page_tags, num_tokens = api_image_request(
                     image=hi_res_image,
                     prompt=prompt,
                     url=self.vlm_options.url,
@@ -94,7 +94,9 @@ class ApiVlmModel(BasePageModel):
                 )

             page_tags = self.vlm_options.decode_response(page_tags)
-            page.predictions.vlm_response = VlmPrediction(
+            page.predictions.vlm_response = VlmPrediction(
+                text=page_tags, num_tokens=num_tokens
+            )
             return page

         with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
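Both request helpers now return a (generated_text, num_tokens) tuple instead of a bare string, and the count is carried into the page's VlmPrediction. A minimal sketch of the new calling convention, assuming an OpenAI-compatible server is reachable at a placeholder local URL (the image and prompt are illustrative, not taken from docling):

from PIL import Image

from docling.utils.api_image_request import api_image_request

page_image = Image.new("RGB", (640, 480), "white")  # stand-in for a rendered page
text, num_tokens = api_image_request(
    image=page_image,
    prompt="Convert this page to DocTags.",
    url="http://localhost:8000/v1/chat/completions",  # assumed local endpoint
    timeout=60,
)
print(f"received {num_tokens} tokens")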
docling/models/picture_description_vlm_model.py
CHANGED
@@ -1,3 +1,4 @@
+import sys
 import threading
 from collections.abc import Iterable
 from pathlib import Path
@@ -75,7 +76,10 @@ class PictureDescriptionVlmModel(
                 else "sdpa"
             ),
         )
-
+        if sys.version_info < (3, 14):
+            self.model = torch.compile(self.model)  # type: ignore
+        else:
+            self.model.eval()

         self.provenance = f"{self.options.repo_id}"

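The hunk above introduces the Python-version guard around torch.compile that recurs in the transformers-based models further down: compile where the dynamo backend is available, otherwise just switch the model to eval mode. A standalone sketch of the guard using a toy module rather than a docling model (the helper name prepare_model is made up for illustration):

import sys

import torch


def prepare_model(model: torch.nn.Module) -> torch.nn.Module:
    # torch.compile is skipped on Python 3.14, where its compile stack is not yet supported.
    if sys.version_info < (3, 14):
        return torch.compile(model)  # type: ignore[return-value]
    model.eval()
    return model


model = prepare_model(torch.nn.Linear(4, 2))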
docling/models/readingorder_model.py
CHANGED
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import Dict, List

 from docling_core.types.doc import (
     DocItemLabel,
@@ -48,8 +47,8 @@ class ReadingOrderModel:

     def _assembled_to_readingorder_elements(
         self, conv_res: ConversionResult
-    ) ->
-        elements:
+    ) -> list[ReadingOrderPageElement]:
+        elements: list[ReadingOrderPageElement] = []
         page_no_to_pages = {p.page_no: p for p in conv_res.pages}

         for element in conv_res.assembled.elements:
@@ -123,10 +122,10 @@ class ReadingOrderModel:
     def _readingorder_elements_to_docling_doc(
         self,
         conv_res: ConversionResult,
-        ro_elements:
-        el_to_captions_mapping:
-        el_to_footnotes_mapping:
-        el_merges_mapping:
+        ro_elements: list[ReadingOrderPageElement],
+        el_to_captions_mapping: dict[int, list[int]],
+        el_to_footnotes_mapping: dict[int, list[int]],
+        el_merges_mapping: dict[int, list[int]],
     ) -> DoclingDocument:
         id_to_elem = {
             RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
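The dropped typing imports above reflect a move to the built-in generics of PEP 585 (Python 3.9+): list[...] and dict[...] replace typing.List and typing.Dict with identical meaning. A generic illustration, unrelated to docling's own types:

# Before: from typing import Dict, List; def f(xs: List[int]) -> Dict[int, List[int]]: ...
def bucket_by_parity(xs: list[int]) -> dict[int, list[int]]:
    out: dict[int, list[int]] = {}
    for x in xs:
        out.setdefault(x % 2, []).append(x)
    return out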
docling/models/vlm_models_inline/hf_transformers_model.py
CHANGED
@@ -1,5 +1,6 @@
 import importlib.metadata
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -129,7 +130,10 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
             trust_remote_code=vlm_options.trust_remote_code,
             revision=vlm_options.revision,
         )
-
+        if sys.version_info < (3, 14):
+            self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+        else:
+            self.vlm_model.eval()

         # Load generation config
         self.generation_config = GenerationConfig.from_pretrained(
@@ -363,13 +367,19 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
         decoded_texts = [text.rstrip(pad_token) for text in decoded_texts]

         # -- Optional logging
+        num_tokens = None
         if generated_ids.shape[0] > 0:
+            num_tokens = int(generated_ids[0].shape[0])
             _log.debug(
-                f"Generated {
+                f"Generated {num_tokens} tokens in {generation_time:.2f}s "
                 f"for batch size {generated_ids.shape[0]}."
             )

         for text in decoded_texts:
             # Apply decode_response to the output text
             decoded_text = self.vlm_options.decode_response(text)
-            yield VlmPrediction(
+            yield VlmPrediction(
+                text=decoded_text,
+                generation_time=generation_time,
+                num_tokens=num_tokens,
+            )
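The token count logged here (and now attached to VlmPrediction) is taken from the length of the first generated sequence in the batch. A small sketch of that bookkeeping with a plain tensor; the shapes and vocabulary size are illustrative only:

import torch

# Stand-in for the output of model.generate(): 2 sequences of 7 token ids each.
generated_ids = torch.randint(0, 32_000, (2, 7))

num_tokens = None
if generated_ids.shape[0] > 0:
    # Length of the first sequence; with padding this is a per-batch approximation.
    num_tokens = int(generated_ids[0].shape[0])
print(num_tokens)  # 7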
docling/models/vlm_models_inline/mlx_model.py
CHANGED
@@ -50,9 +50,14 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
             from mlx_vlm.utils import load_config  # type: ignore
         except ImportError:
-
-
-
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
+                )
+            else:
+                raise ImportError(
+                    "mlx-vlm is not installed. It is not yet available on Python 3.14."
+                )

         repo_cache_folder = vlm_options.repo_id.replace("/", "--")

@@ -313,5 +318,6 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             text=decoded_output,
             generation_time=generation_time,
             generated_tokens=tokens,
+            num_tokens=len(tokens),
         )
         _log.debug("MLX model: Released global lock")
docling/models/vlm_models_inline/nuextract_transformers_model.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -153,7 +154,10 @@ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
             ),
             trust_remote_code=vlm_options.trust_remote_code,
         )
-
+        if sys.version_info < (3, 14):
+            self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+        else:
+            self.vlm_model.eval()

         # Load generation config
         self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
@@ -278,13 +282,19 @@ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
         )

         # Optional logging
+        num_tokens = None
         if generated_ids.shape[0] > 0:  # type: ignore
+            num_tokens = int(generated_ids[0].shape[0])
             _log.debug(
-                f"Generated {
+                f"Generated {num_tokens} tokens in {generation_time:.2f}s "
                 f"for batch size {generated_ids.shape[0]}."  # type: ignore
             )

         for text in decoded_texts:
             # Apply decode_response to the output text
             decoded_text = self.vlm_options.decode_response(text)
-            yield VlmPrediction(
+            yield VlmPrediction(
+                text=decoded_text,
+                generation_time=generation_time,
+                num_tokens=num_tokens,
+            )
docling/models/vlm_models_inline/vllm_model.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -8,7 +9,7 @@ import numpy as np
 from PIL.Image import Image

 from docling.datamodel.accelerator_options import AcceleratorOptions
-from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
@@ -87,7 +88,7 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         vlm_options: InlineVlmOptions,
     ):
         self.enabled = enabled
-        self.vlm_options = vlm_options
+        self.vlm_options: InlineVlmOptions = vlm_options

         self.llm = None
         self.sampling_params = None
@@ -100,7 +101,18 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             return

         from transformers import AutoProcessor
-
+
+        try:
+            from vllm import LLM, SamplingParams
+        except ImportError:
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "vllm is not installed. Please install it via `pip install vllm`."
+                )
+            else:
+                raise ImportError(
+                    "vllm is not installed. It is not yet available on Python 3.14."
+                )

         # Device selection
         self.device = decide_device(
@@ -222,7 +234,8 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             pages_with_images.append(page)

         if images:
-
+            with TimeRecorder(conv_res, "vlm_inference"):
+                predictions = list(self.process_images(images, user_prompts))
             for page, prediction in zip(pages_with_images, predictions):
                 page.predictions.vlm_response = prediction

@@ -288,13 +301,34 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         # Optional debug
         if outputs:
             try:
-
-                _log.debug(
+                num_tokens_within_batch = len(outputs[0].outputs[0].token_ids)
+                _log.debug(
+                    f"Generated {num_tokens_within_batch} tokens for batch in {generation_time:.2f}s."
+                )
             except Exception:
-
+                num_tokens_within_batch = 0

         # Emit predictions
         for output in outputs:
             text = output.outputs[0].text if output.outputs else ""
+            stop_reason = output.outputs[0].stop_reason if output.outputs else ""
+            generated_tokens = [
+                VlmPredictionToken(token=int(p)) for p in output.outputs[0].token_ids
+            ]
+            num_tokens = len(generated_tokens)
             decoded_text = self.vlm_options.decode_response(text)
-
+            if self.vlm_options.track_generated_tokens:
+                yield VlmPrediction(
+                    text=decoded_text,
+                    generation_time=generation_time,
+                    num_tokens=num_tokens,
+                    stop_reason=stop_reason,
+                    generated_tokens=generated_tokens,
+                )
+            else:
+                yield VlmPrediction(
+                    text=decoded_text,
+                    generation_time=generation_time,
+                    num_tokens=num_tokens,
+                    stop_reason=stop_reason,
+                )
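In the vLLM path, per-token ids are only attached to the prediction when the options' track_generated_tokens flag is set; the text, token count, and stop reason are always kept. A hedged sketch of inspecting such a prediction, constructed directly here as a stand-in for the object yielded by VllmVlmModel.process_images (field names follow the diff above, values are illustrative):

from docling.datamodel.base_models import VlmPrediction, VlmPredictionToken

prediction = VlmPrediction(
    text="<doctag>...</doctag>",
    generation_time=1.23,
    num_tokens=3,
    stop_reason="stop",
    generated_tokens=[VlmPredictionToken(token=t) for t in (101, 102, 103)],
)

print(prediction.num_tokens, prediction.stop_reason)
for tok in prediction.generated_tokens:
    print(tok.token)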
docling/pipeline/asr_pipeline.py
CHANGED
@@ -1,10 +1,11 @@
 import logging
 import os
 import re
+import sys
 import tempfile
 from io import BytesIO
 from pathlib import Path
-from typing import List, Optional, Union, cast
+from typing import TYPE_CHECKING, List, Optional, Union, cast

 from docling_core.types.doc import DoclingDocument, DocumentOrigin

@@ -32,6 +33,7 @@ from docling.datamodel.pipeline_options (
     AsrPipelineOptions,
 )
 from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrMlxWhisperOptions,
     InlineAsrNativeWhisperOptions,
     # AsrResponseFormat,
     InlineAsrOptions,
@@ -116,9 +118,15 @@ class _NativeWhisperModel:
         try:
             import whisper  # type: ignore
         except ImportError:
-
-
-
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
+                )
+            else:
+                raise ImportError(
+                    "whisper is not installed. Unfortunately its dependencies are not yet available for Python 3.14."
+                )
+
         self.asr_options = asr_options
         self.max_tokens = asr_options.max_new_tokens
         self.temperature = asr_options.temperature
@@ -228,22 +236,157 @@ class _NativeWhisperModel:
         return convo


+class _MlxWhisperModel:
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        asr_options: InlineAsrMlxWhisperOptions,
+    ):
+        """
+        Transcriber using MLX Whisper for Apple Silicon optimization.
+        """
+        self.enabled = enabled
+
+        _log.info(f"artifacts-path: {artifacts_path}")
+        _log.info(f"accelerator_options: {accelerator_options}")
+
+        if self.enabled:
+            try:
+                import mlx_whisper  # type: ignore
+            except ImportError:
+                raise ImportError(
+                    "mlx-whisper is not installed. Please install it via `pip install mlx-whisper` or do `uv sync --extra asr`."
+                )
+            self.asr_options = asr_options
+            self.mlx_whisper = mlx_whisper
+
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=asr_options.supported_devices,
+            )
+            _log.info(f"Available device for MLX Whisper: {self.device}")
+
+            self.model_name = asr_options.repo_id
+            _log.info(f"loading _MlxWhisperModel({self.model_name})")
+
+            # MLX Whisper models are loaded differently - they use HuggingFace repos
+            self.model_path = self.model_name
+
+            # Store MLX-specific options
+            self.language = asr_options.language
+            self.task = asr_options.task
+            self.word_timestamps = asr_options.word_timestamps
+            self.no_speech_threshold = asr_options.no_speech_threshold
+            self.logprob_threshold = asr_options.logprob_threshold
+            self.compression_ratio_threshold = asr_options.compression_ratio_threshold
+
+    def run(self, conv_res: ConversionResult) -> ConversionResult:
+        audio_path: Path = Path(conv_res.input.file).resolve()
+
+        try:
+            conversation = self.transcribe(audio_path)
+
+            # Ensure we have a proper DoclingDocument
+            origin = DocumentOrigin(
+                filename=conv_res.input.file.name or "audio.wav",
+                mimetype="audio/x-wav",
+                binary_hash=conv_res.input.document_hash,
+            )
+            conv_res.document = DoclingDocument(
+                name=conv_res.input.file.stem or "audio.wav", origin=origin
+            )
+
+            for citem in conversation:
+                conv_res.document.add_text(
+                    label=DocItemLabel.TEXT, text=citem.to_string()
+                )
+
+            conv_res.status = ConversionStatus.SUCCESS
+            return conv_res
+
+        except Exception as exc:
+            _log.error(f"MLX Audio transcription has an error: {exc}")
+
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
+        """
+        Transcribe audio using MLX Whisper.
+
+        Args:
+            fpath: Path to audio file
+
+        Returns:
+            List of conversation items with timestamps
+        """
+        result = self.mlx_whisper.transcribe(
+            str(fpath),
+            path_or_hf_repo=self.model_path,
+            language=self.language,
+            task=self.task,
+            word_timestamps=self.word_timestamps,
+            no_speech_threshold=self.no_speech_threshold,
+            logprob_threshold=self.logprob_threshold,
+            compression_ratio_threshold=self.compression_ratio_threshold,
+        )
+
+        convo: list[_ConversationItem] = []
+
+        # MLX Whisper returns segments similar to native Whisper
+        for segment in result.get("segments", []):
+            item = _ConversationItem(
+                start_time=segment.get("start"),
+                end_time=segment.get("end"),
+                text=segment.get("text", "").strip(),
+                words=[],
+            )
+
+            # Add word-level timestamps if available
+            if self.word_timestamps and "words" in segment:
+                item.words = []
+                for word_data in segment["words"]:
+                    item.words.append(
+                        _ConversationWord(
+                            start_time=word_data.get("start"),
+                            end_time=word_data.get("end"),
+                            text=word_data.get("word", ""),
+                        )
+                    )
+            convo.append(item)
+
+        return convo
+
+
 class AsrPipeline(BasePipeline):
     def __init__(self, pipeline_options: AsrPipelineOptions):
         super().__init__(pipeline_options)
         self.keep_backend = True

         self.pipeline_options: AsrPipelineOptions = pipeline_options
+        self._model: Union[_NativeWhisperModel, _MlxWhisperModel]

         if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
-
+            native_asr_options: InlineAsrNativeWhisperOptions = (
                 self.pipeline_options.asr_options
             )
             self._model = _NativeWhisperModel(
                 enabled=True,  # must be always enabled for this pipeline to make sense.
                 artifacts_path=self.artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
-                asr_options=
+                asr_options=native_asr_options,
+            )
+        elif isinstance(self.pipeline_options.asr_options, InlineAsrMlxWhisperOptions):
+            mlx_asr_options: InlineAsrMlxWhisperOptions = (
+                self.pipeline_options.asr_options
+            )
+            self._model = _MlxWhisperModel(
+                enabled=True,  # must be always enabled for this pipeline to make sense.
+                artifacts_path=self.artifacts_path,
+                accelerator_options=pipeline_options.accelerator_options,
+                asr_options=mlx_asr_options,
             )
         else:
             _log.error(f"No model support for {self.pipeline_options.asr_options}")
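The new _MlxWhisperModel is selected when the pipeline's asr_options is an InlineAsrMlxWhisperOptions instance. A hedged configuration sketch; the field names mirror the attributes read in the diff above, but the repo id is only an assumed example and the released package likely ships ready-made presets in docling.datamodel.asr_model_specs:

from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.datamodel.pipeline_options_asr_model import InlineAsrMlxWhisperOptions

asr_options = InlineAsrMlxWhisperOptions(
    repo_id="mlx-community/whisper-large-v3-turbo",  # assumed MLX checkpoint
    language="en",
    task="transcribe",
    word_timestamps=True,
)
pipeline_options = AsrPipelineOptions(asr_options=asr_options)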
docling/utils/api_image_request.py
CHANGED
@@ -2,7 +2,7 @@ import base64
 import json
 import logging
 from io import BytesIO
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple

 import requests
 from PIL import Image
@@ -19,9 +19,9 @@ def api_image_request(
     prompt: str,
     url: AnyUrl,
     timeout: float = 20,
-    headers: Optional[
+    headers: Optional[dict[str, str]] = None,
     **params,
-) -> str:
+) -> Tuple[str, Optional[int]]:
     img_io = BytesIO()
     image.save(img_io, "PNG")
     image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
@@ -60,7 +60,8 @@ def api_image_request(

     api_resp = OpenAiApiResponse.model_validate_json(r.text)
     generated_text = api_resp.choices[0].message.content.strip()
-
+    num_tokens = api_resp.usage.total_tokens
+    return generated_text, num_tokens


 def api_image_request_streaming(
@@ -69,10 +70,10 @@ def api_image_request_streaming(
     url: AnyUrl,
     *,
     timeout: float = 20,
-    headers: Optional[
-    generation_stoppers:
+    headers: Optional[dict[str, str]] = None,
+    generation_stoppers: list[GenerationStopper] = [],
     **params,
-) -> str:
+) -> Tuple[str, Optional[int]]:
     """
     Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).
     Parses SSE lines: 'data: {json}\\n\\n', terminated by 'data: [DONE]'.
@@ -150,6 +151,16 @@ def api_image_request_streaming(
             _log.debug("Unexpected SSE chunk shape: %s", e)
             piece = ""

+        # Try to extract token count
+        num_tokens = None
+        try:
+            if "usage" in obj:
+                usage = obj["usage"]
+                num_tokens = usage.get("total_tokens")
+        except Exception as e:
+            num_tokens = None
+            _log.debug("Usage key not included in response: %s", e)
+
         if piece:
             full_text.append(piece)
             for stopper in generation_stoppers:
@@ -162,6 +173,6 @@ def api_image_request_streaming(
     # closing the connection when we exit the 'with' block.
     # vLLM/OpenAI-compatible servers will detect the client disconnect
     # and abort the request server-side.
-    return "".join(full_text)
+    return "".join(full_text), num_tokens

-    return "".join(full_text)
+    return "".join(full_text), num_tokens
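The streaming helper only returns a token count when the server includes a usage object in one of the SSE chunks, and OpenAI-compatible servers generally emit that final usage chunk only when asked to. A hedged sketch, assuming the extra keyword arguments are forwarded into the request payload as in the non-streaming helper and that the server honors stream_options:

from PIL import Image

from docling.utils.api_image_request import api_image_request_streaming

page_image = Image.new("RGB", (640, 480), "white")  # stand-in for a rendered page
text, num_tokens = api_image_request_streaming(
    image=page_image,
    prompt="Convert this page to DocTags.",
    url="http://localhost:8000/v1/chat/completions",  # assumed local vLLM endpoint
    timeout=120,
    stream_options={"include_usage": True},  # ask the server to report usage
)
print(num_tokens)  # None if the server never sent a usage chunk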