docling 2.46.0__py3-none-any.whl → 2.47.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +111 -13
- docling/backend/msword_backend.py +126 -16
- docling/cli/main.py +14 -0
- docling/cli/models.py +56 -0
- docling/datamodel/base_models.py +1 -1
- docling/datamodel/pipeline_options.py +3 -0
- docling/datamodel/pipeline_options_vlm_model.py +5 -0
- docling/datamodel/vlm_model_specs.py +114 -1
- docling/models/base_model.py +95 -2
- docling/models/page_preprocessing_model.py +5 -1
- docling/models/picture_description_vlm_model.py +4 -2
- docling/models/vlm_models_inline/__init__.py +1 -0
- docling/models/vlm_models_inline/hf_transformers_model.py +179 -79
- docling/models/vlm_models_inline/mlx_model.py +179 -68
- docling/models/vlm_models_inline/vllm_model.py +235 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +1 -1
- docling/pipeline/vlm_pipeline.py +14 -1
- docling/utils/layout_postprocessor.py +51 -43
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/METADATA +2 -1
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/RECORD +24 -23
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/WHEEL +0 -0
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/entry_points.txt +0 -0
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/top_level.txt +0 -0
docling/models/vlm_models_inline/mlx_model.py CHANGED
@@ -1,8 +1,12 @@
 import logging
+import threading
 import time
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
+
+import numpy as np
+from PIL.Image import Image
 
 from docling.datamodel.accelerator_options import (
     AcceleratorOptions,
@@ -10,7 +14,7 @@ from docling.datamodel.accelerator_options import (
 from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
-from docling.models.base_model import BasePageModel
+from docling.models.base_model import BaseVlmPageModel
 from docling.models.utils.hf_model_download import (
     HuggingFaceModelDownloadMixin,
 )
@@ -18,8 +22,12 @@ from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
+# Global lock for MLX model calls - MLX models are not thread-safe
+# All MLX models share this lock to prevent concurrent MLX operations
+_MLX_GLOBAL_LOCK = threading.Lock()
+
 
-class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
+class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         enabled: bool,
@@ -63,87 +71,190 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
+        page_list = list(page_batch)
+        if not page_list:
+            return
+
+        valid_pages = []
+        invalid_pages = []
+
+        for page in page_list:
             assert page._backend is not None
             if not page._backend.is_valid():
+                invalid_pages.append(page)
             else:
-                    assert page.size is not None
+                valid_pages.append(page)
 
+        # Process valid pages in batch
+        if valid_pages:
+            with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
+                # Prepare images and prompts for batch processing
+                images = []
+                user_prompts = []
+                pages_with_images = []
+
+                for page in valid_pages:
+                    assert page.size is not None
                     hi_res_image = page.get_image(
                         scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
                     )
+
+                    # Only process pages with valid images
                    if hi_res_image is not None:
+                        images.append(hi_res_image)
 
+                        # Define prompt structure
+                        if callable(self.vlm_options.prompt):
+                            user_prompt = self.vlm_options.prompt(page.parsed_page)
+                        else:
+                            user_prompt = self.vlm_options.prompt
 
-                        hi_res_image = hi_res_image.convert("RGB")
+                        user_prompts.append(user_prompt)
+                        pages_with_images.append(page)
 
-                    )
+                # Use process_images for the actual inference
+                if images:  # Only if we have valid images
+                    predictions = list(self.process_images(images, user_prompts))
 
+                    # Attach results to pages
+                    for page, prediction in zip(pages_with_images, predictions):
+                        page.predictions.vlm_response = prediction
+
+        # Yield all pages (valid and invalid)
+        for page in invalid_pages:
+            yield page
+        for page in valid_pages:
+            yield page
+
+    def process_images(
+        self,
+        image_batch: Iterable[Union[Image, np.ndarray]],
+        prompt: Union[str, list[str]],
+    ) -> Iterable[VlmPrediction]:
+        """Process raw images without page metadata.
+
+        Args:
+            image_batch: Iterable of PIL Images or numpy arrays
+            prompt: Either:
+                - str: Single prompt used for all images
+                - list[str]: List of prompts (one per image, must match image count)
+
+        Raises:
+            ValueError: If prompt list length doesn't match image count.
+        """
+        # Convert image batch to list for length validation
+        image_list = list(image_batch)
+
+        if len(image_list) == 0:
+            return
+
+        # Handle prompt parameter
+        if isinstance(prompt, str):
+            # Single prompt for all images
+            user_prompts = [prompt] * len(image_list)
+        elif isinstance(prompt, list):
+            # List of prompts (one per image)
+            if len(prompt) != len(image_list):
+                raise ValueError(
+                    f"Number of prompts ({len(prompt)}) must match number of images ({len(image_list)})"
+                )
+            user_prompts = prompt
+        else:
+            raise ValueError(f"prompt must be str or list[str], got {type(prompt)}")
+
+        # MLX models are not thread-safe - use global lock to serialize access
+        with _MLX_GLOBAL_LOCK:
+            _log.debug("MLX model: Acquired global lock for thread safety")
+            for image, user_prompt in zip(image_list, user_prompts):
+                # Convert numpy array to PIL Image if needed
+                if isinstance(image, np.ndarray):
+                    if image.ndim == 3 and image.shape[2] in [3, 4]:
+                        # RGB or RGBA array
+                        from PIL import Image as PILImage
+
+                        image = PILImage.fromarray(image.astype(np.uint8))
+                    elif image.ndim == 2:
+                        # Grayscale array
+                        from PIL import Image as PILImage
+
+                        image = PILImage.fromarray(image.astype(np.uint8), mode="L")
+                    else:
+                        raise ValueError(
+                            f"Unsupported numpy array shape: {image.shape}"
+                        )
+
+                # Ensure image is in RGB mode (handles RGBA, L, etc.)
+                if image.mode != "RGB":
+                    image = image.convert("RGB")
+
+                # Use the MLX chat template approach like in the __call__ method
+                formatted_prompt = self.apply_chat_template(
+                    self.processor, self.config, user_prompt, num_images=1
+                )
+
+                # Stream generate with stop strings support
+                start_time = time.time()
+                _log.debug("start generating ...")
+
+                tokens: list[VlmPredictionToken] = []
+                output = ""
+
+                # Use stream_generate for proper stop string handling
+                for token in self.stream_generate(
+                    self.vlm_model,
+                    self.processor,
+                    formatted_prompt,
+                    [image],  # MLX stream_generate expects list of images
+                    max_tokens=self.max_tokens,
+                    verbose=False,
+                    temp=self.temperature,
+                ):
+                    # Collect token information
+                    if len(token.logprobs.shape) == 1:
+                        tokens.append(
+                            VlmPredictionToken(
+                                text=token.text,
+                                token=token.token,
+                                logprob=token.logprobs[token.token],
                             )
+                        )
+                    elif (
+                        len(token.logprobs.shape) == 2 and token.logprobs.shape[0] == 1
+                    ):
+                        tokens.append(
+                            VlmPredictionToken(
+                                text=token.text,
+                                token=token.token,
+                                logprob=token.logprobs[0, token.token],
                             )
+                        )
+                    else:
+                        _log.warning(
+                            f"incompatible shape for logprobs: {token.logprobs.shape}"
+                        )
 
+                    output += token.text
+
+                    # Check for any configured stop strings
+                    if self.vlm_options.stop_strings:
+                        if any(
+                            stop_str in output
+                            for stop_str in self.vlm_options.stop_strings
+                        ):
+                            _log.debug("Stopping generation due to stop string match")
                            break
 
-                    page_tags = output
+                generation_time = time.time() - start_time
 
-                    page_tags = self.vlm_options.decode_response(page_tags)
-                    page.predictions.vlm_response = VlmPrediction(
-                        text=page_tags,
-                        generation_time=generation_time,
-                        generated_tokens=tokens,
-                    )
+                _log.debug(
+                    f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time:.1f} tokens/sec)."
+                )
 
+                # Apply decode_response to the output before yielding
+                decoded_output = self.vlm_options.decode_response(output)
+                yield VlmPrediction(
+                    text=decoded_output,
+                    generation_time=generation_time,
+                    generated_tokens=tokens,
+                )
+            _log.debug("MLX model: Released global lock")
docling/models/vlm_models_inline/vllm_model.py ADDED
@@ -0,0 +1,235 @@
+import logging
+import time
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+from PIL.Image import Image
+
+from docling.datamodel.accelerator_options import (
+    AcceleratorOptions,
+)
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options_vlm_model import (
+    InlineVlmOptions,
+    TransformersPromptStyle,
+)
+from docling.models.base_model import BaseVlmPageModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        vlm_options: InlineVlmOptions,
+    ):
+        self.enabled = enabled
+
+        self.vlm_options = vlm_options
+
+        if self.enabled:
+            from transformers import AutoProcessor
+            from vllm import LLM, SamplingParams
+
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=vlm_options.supported_devices,
+            )
+            _log.debug(f"Available device for VLM: {self.device}")
+
+            self.max_new_tokens = vlm_options.max_new_tokens
+            self.temperature = vlm_options.temperature
+
+            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+
+            if artifacts_path is None:
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            # Initialize VLLM LLM
+            llm_kwargs: Dict[str, Any] = {
+                "model": str(artifacts_path),
+                "limit_mm_per_prompt": {"image": 1},
+                "trust_remote_code": vlm_options.trust_remote_code,
+                "model_impl": "transformers",
+                "gpu_memory_utilization": 0.3,  # hardcoded for now, leaves room for ~3 different models.
+            }
+
+            # Add device-specific configurations
+            if self.device == "cpu":
+                llm_kwargs["device"] = "cpu"
+
+            # Add quantization if specified
+            if vlm_options.quantized:
+                if vlm_options.load_in_8bit:
+                    llm_kwargs["quantization"] = "bitsandbytes"
+
+            self.llm = LLM(**llm_kwargs)
+
+            # Initialize processor for prompt formatting
+            self.processor = AutoProcessor.from_pretrained(
+                artifacts_path,
+                trust_remote_code=vlm_options.trust_remote_code,
+            )
+
+            # Set up sampling parameters
+            self.sampling_params = SamplingParams(
+                temperature=self.temperature,
+                max_tokens=self.max_new_tokens,
+                stop=vlm_options.stop_strings if vlm_options.stop_strings else None,
+                **vlm_options.extra_generation_config,
+            )
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        page_list = list(page_batch)
+        if not page_list:
+            return
+
+        valid_pages = []
+        invalid_pages = []
+
+        for page in page_list:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                invalid_pages.append(page)
+            else:
+                valid_pages.append(page)
+
+        # Process valid pages in batch
+        if valid_pages:
+            with TimeRecorder(conv_res, "vlm"):
+                # Prepare images and prompts for batch processing
+                images = []
+                user_prompts = []
+                pages_with_images = []
+
+                for page in valid_pages:
+                    assert page.size is not None
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
+
+                    # Only process pages with valid images
+                    if hi_res_image is not None:
+                        images.append(hi_res_image)
+
+                        # Define prompt structure
+                        if callable(self.vlm_options.prompt):
+                            user_prompt = self.vlm_options.prompt(page.parsed_page)
+                        else:
+                            user_prompt = self.vlm_options.prompt
+
+                        user_prompts.append(user_prompt)
+                        pages_with_images.append(page)
+
+                # Use process_images for the actual inference
+                if images:  # Only if we have valid images
+                    predictions = list(self.process_images(images, user_prompts))
+
+                    # Attach results to pages
+                    for page, prediction in zip(pages_with_images, predictions):
+                        page.predictions.vlm_response = prediction
+
+        # Yield all pages (valid and invalid)
+        for page in invalid_pages:
+            yield page
+        for page in valid_pages:
+            yield page
+
+    def process_images(
+        self,
+        image_batch: Iterable[Union[Image, np.ndarray]],
+        prompt: Union[str, list[str]],
+    ) -> Iterable[VlmPrediction]:
+        """Process raw images without page metadata in a single batched inference call.
+
+        Args:
+            image_batch: Iterable of PIL Images or numpy arrays
+            prompt: Either:
+                - str: Single prompt used for all images
+                - list[str]: List of prompts (one per image, must match image count)
+
+        Raises:
+            ValueError: If prompt list length doesn't match image count.
+        """
+        pil_images: list[Image] = []
+
+        for img in image_batch:
+            # Convert numpy array to PIL Image if needed
+            if isinstance(img, np.ndarray):
+                if img.ndim == 3 and img.shape[2] in [3, 4]:
+                    from PIL import Image as PILImage
+
+                    pil_img = PILImage.fromarray(img.astype(np.uint8))
+                elif img.ndim == 2:
+                    from PIL import Image as PILImage
+
+                    pil_img = PILImage.fromarray(img.astype(np.uint8), mode="L")
+                else:
+                    raise ValueError(f"Unsupported numpy array shape: {img.shape}")
+            else:
+                pil_img = img
+
+            # Ensure image is in RGB mode (handles RGBA, L, etc.)
+            if pil_img.mode != "RGB":
+                pil_img = pil_img.convert("RGB")
+
+            pil_images.append(pil_img)
+
+        if len(pil_images) == 0:
+            return
+
+        # Handle prompt parameter
+        if isinstance(prompt, str):
+            # Single prompt for all images
+            user_prompts = [prompt] * len(pil_images)
+        elif isinstance(prompt, list):
+            # List of prompts (one per image)
+            if len(prompt) != len(pil_images):
+                raise ValueError(
+                    f"Number of prompts ({len(prompt)}) must match number of images ({len(pil_images)})"
+                )
+            user_prompts = prompt
+        else:
+            raise ValueError(f"prompt must be str or list[str], got {type(prompt)}")
+
+        # Format prompts individually
+        prompts: list[str] = [
+            self.formulate_prompt(user_prompt) for user_prompt in user_prompts
+        ]
+
+        # Prepare VLLM inputs
+        llm_inputs = []
+        for prompt, image in zip(prompts, pil_images):
+            llm_inputs.append({"prompt": prompt, "multi_modal_data": {"image": image}})
+
+        start_time = time.time()
+        outputs = self.llm.generate(llm_inputs, sampling_params=self.sampling_params)  # type: ignore
+        generation_time = time.time() - start_time
+
+        # Logging tokens count for the first sample as a representative metric
+        if len(outputs) > 0:
+            num_tokens = len(outputs[0].outputs[0].token_ids)
+            _log.debug(
+                f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
+            )
+
+        for output in outputs:
+            # Apply decode_response to the output text
+            decoded_text = self.vlm_options.decode_response(output.outputs[0].text)
+            yield VlmPrediction(text=decoded_text, generation_time=generation_time)
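Unlike the MLX backend, which generates per image under a global lock, the vLLM backend formats one prompt per image and submits the whole batch in a single llm.generate call. A rough usage sketch, assuming vllm_model is a constructed VllmVlmModel and img_a/img_b are PIL images (all placeholders):

# One prompt per image; lengths must match, otherwise a ValueError is raised
# before any inference runs. A single string would be broadcast to both images.
prompts = ["Transcribe the table on this page.", "Convert this page to DocTags."]
preds = list(vllm_model.process_images([img_a, img_b], prompts))

for pred in preds:
    print(pred.text[:120])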
docling/pipeline/threaded_standard_pdf_pipeline.py CHANGED
@@ -194,7 +194,7 @@ class ThreadedPipelineStage:
             return
         self._running = True
         self._thread = threading.Thread(
-            target=self._run, name=f"Stage-{self.name}", daemon=
+            target=self._run, name=f"Stage-{self.name}", daemon=True
         )
         self._thread.start()
 
docling/pipeline/vlm_pipeline.py CHANGED
@@ -103,6 +103,17 @@ class VlmPipeline(PaginatedPipeline):
                     vlm_options=vlm_options,
                 ),
             ]
+        elif vlm_options.inference_framework == InferenceFramework.VLLM:
+            from docling.models.vlm_models_inline.vllm_model import VllmVlmModel
+
+            self.build_pipe = [
+                VllmVlmModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=vlm_options,
+                ),
+            ]
         else:
             raise ValueError(
                 f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
@@ -117,7 +128,9 @@ class VlmPipeline(PaginatedPipeline):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
             if page._backend is not None and page._backend.is_valid():
                 page.size = page._backend.get_size()
+
+                if self.force_backend_text:
+                    page.parsed_page = page._backend.get_segmented_page()
 
         return page
 
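With the new branch above, VlmPipeline picks the vLLM backend whenever the active VLM options declare InferenceFramework.VLLM. A hedged sketch of wiring this through the converter, assuming the usual VlmPipelineOptions/DocumentConverter API; MY_VLLM_SPEC stands in for one of the presets added in docling/datamodel/vlm_model_specs.py, whose names are not visible in this diff:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Placeholder: any InlineVlmOptions whose inference_framework is InferenceFramework.VLLM.
pipeline_options = VlmPipelineOptions(vlm_options=MY_VLLM_SPEC)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
result = converter.convert("report.pdf")  # hypothetical input file
print(result.document.export_to_markdown())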