docling 2.46.0__py3-none-any.whl → 2.47.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +111 -13
- docling/backend/msword_backend.py +126 -16
- docling/cli/main.py +14 -0
- docling/cli/models.py +56 -0
- docling/datamodel/base_models.py +1 -1
- docling/datamodel/pipeline_options.py +3 -0
- docling/datamodel/pipeline_options_vlm_model.py +5 -0
- docling/datamodel/vlm_model_specs.py +114 -1
- docling/models/base_model.py +95 -2
- docling/models/page_preprocessing_model.py +5 -1
- docling/models/picture_description_vlm_model.py +4 -2
- docling/models/vlm_models_inline/__init__.py +1 -0
- docling/models/vlm_models_inline/hf_transformers_model.py +179 -79
- docling/models/vlm_models_inline/mlx_model.py +179 -68
- docling/models/vlm_models_inline/vllm_model.py +235 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +1 -1
- docling/pipeline/vlm_pipeline.py +14 -1
- docling/utils/layout_postprocessor.py +51 -43
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/METADATA +2 -1
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/RECORD +24 -23
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/WHEEL +0 -0
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/entry_points.txt +0 -0
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.46.0.dist-info → docling-2.47.0.dist-info}/top_level.txt +0 -0
docling/datamodel/vlm_model_specs.py
CHANGED
@@ -12,6 +12,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     ResponseFormat,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 
 _log = logging.getLogger(__name__)
@@ -26,6 +27,7 @@ SMOLDOCLING_MLX = InlineVlmOptions(
     supported_devices=[AcceleratorDevice.MPS],
     scale=2.0,
     temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
 )
 
 SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
@@ -33,16 +35,74 @@ SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
     inference_framework=InferenceFramework.TRANSFORMERS,
-    transformers_model_type=TransformersModelType.
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
     supported_devices=[
         AcceleratorDevice.CPU,
         AcceleratorDevice.CUDA,
+    ],
+    torch_dtype="bfloat16",
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
+)
+
+SMOLDOCLING_VLLM = InlineVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
+)
+
+# SmolVLM-256M-Instruct
+SMOLVLM256_TRANSFORMERS = InlineVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
+    prompt="Transcribe this image to plain text.",
+    response_format=ResponseFormat.PLAINTEXT,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    torch_dtype="bfloat16",
+    scale=2.0,
+    temperature=0.0,
+)
+
+# SmolVLM2-2.2b-Instruct
+SMOLVLM256_MLX = InlineVlmOptions(
+    repo_id="moot20/SmolVLM-256M-Instruct-MLX",
+    prompt="Extract the text.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
         AcceleratorDevice.MPS,
     ],
     scale=2.0,
     temperature=0.0,
 )
 
+SMOLVLM256_VLLM = InlineVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
+    prompt="Transcribe this image to plain text.",
+    response_format=ResponseFormat.PLAINTEXT,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
+
 # GraniteVision
 GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
     repo_id="ibm-granite/granite-vision-3.2-2b",
@@ -59,6 +119,18 @@ GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
     temperature=0.0,
 )
 
+GRANITE_VISION_VLLM = InlineVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.2-2b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
 GRANITE_VISION_OLLAMA = ApiVlmOptions(
     url=AnyUrl("http://localhost:11434/v1/chat/completions"),
     params={"model": "granite3.2-vision:2b"},
@@ -116,6 +188,26 @@ QWEN25_VL_3B_MLX = InlineVlmOptions(
     temperature=0.0,
 )
 
+# GoT 2.0
+GOT2_TRANSFORMERS = InlineVlmOptions(
+    repo_id="stepfun-ai/GOT-OCR-2.0-hf",
+    prompt="",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_prompt_style=TransformersPromptStyle.NONE,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["<|im_end|>"],
+    extra_processor_kwargs={"format": True},
+)
+
+
 # Gemma-3
 GEMMA3_12B_MLX = InlineVlmOptions(
     repo_id="mlx-community/gemma-3-12b-it-bf16",
@@ -137,8 +229,29 @@ GEMMA3_27B_MLX = InlineVlmOptions(
     temperature=0.0,
 )
 
+# Dolphin
+
+DOLPHIN_TRANSFORMERS = InlineVlmOptions(
+    repo_id="ByteDance/Dolphin",
+    prompt="<s>Read text in the image. <Answer/>",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    transformers_prompt_style=TransformersPromptStyle.RAW,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
 
 class VlmModelType(str, Enum):
     SMOLDOCLING = "smoldocling"
+    SMOLDOCLING_VLLM = "smoldocling_vllm"
     GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_VLLM = "granite_vision_vllm"
     GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+    GOT_OCR_2 = "got_ocr_2"
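The new `*_VLLM` specs plug into the existing `VlmPipeline` selection mechanism. A minimal sketch of picking the vLLM-backed SmolDocling preset (assumes a CUDA machine with the vLLM dependencies installed; the converter wiring follows docling's documented `VlmPipeline` usage):

```python
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Select the new vLLM-backed SmolDocling preset added in this release.
pipeline_options = VlmPipelineOptions(vlm_options=vlm_model_specs.SMOLDOCLING_VLLM)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
doc = converter.convert("page.pdf").document
print(doc.export_to_markdown())
```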
docling/models/base_model.py
CHANGED
@@ -1,13 +1,24 @@
+import logging
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
-from typing import Generic, Optional, Protocol, Type
+from typing import Any, Generic, Optional, Protocol, Type, Union
 
+import numpy as np
 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
+from PIL.Image import Image
 from typing_extensions import TypeVar
 
-from docling.datamodel.base_models import
+from docling.datamodel.base_models import (
+    ItemAndImageEnrichmentElement,
+    Page,
+    VlmPrediction,
+)
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import BaseOptions
+from docling.datamodel.pipeline_options_vlm_model import (
+    InlineVlmOptions,
+    TransformersPromptStyle,
+)
 from docling.datamodel.settings import settings
 
 
@@ -26,6 +37,88 @@ class BasePageModel(ABC):
         pass
 
 
+class BaseVlmModel(ABC):
+    """Base class for Vision-Language Models that adds image processing capability."""
+
+    @abstractmethod
+    def process_images(
+        self,
+        image_batch: Iterable[Union[Image, np.ndarray]],
+        prompt: Union[str, list[str]],
+    ) -> Iterable[VlmPrediction]:
+        """Process raw images without page metadata.
+
+        Args:
+            image_batch: Iterable of PIL Images or numpy arrays
+            prompt: Either:
+                - str: Single prompt used for all images
+                - list[str]: List of prompts (one per image, must match image count)
+
+        Raises:
+            ValueError: If prompt list length doesn't match image count.
+        """
+
+
+class BaseVlmPageModel(BasePageModel, BaseVlmModel):
+    """Base implementation for VLM models that inherit from BasePageModel.
+
+    Provides a default __call__ implementation that extracts images from pages,
+    processes them using process_images, and attaches results back to pages.
+    """
+
+    # Type annotations for attributes that subclasses must initialize
+    vlm_options: InlineVlmOptions
+    processor: Any
+
+    @abstractmethod
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        """Extract images from pages, process them, and attach results back."""
+
+    def formulate_prompt(self, user_prompt: str) -> str:
+        """Formulate a prompt for the VLM."""
+        _log = logging.getLogger(__name__)
+
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+            _log.debug("Using specialized prompt for Phi-4")
+            # Note: This might need adjustment for VLLM vs transformers
+            user_prompt_prefix = "<|user|>"
+            assistant_prompt = "<|assistant|>"
+            prompt_suffix = "<|end|>"
+
+            prompt = f"{user_prompt_prefix}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
+            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
+
+            return prompt
+
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=True
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Unknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
+        )
+
+
 EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
 
 
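To make the new contract concrete, here is a hypothetical minimal subclass (EchoVlmModel is invented for illustration and is not part of the package) that satisfies the `BaseVlmModel.process_images` interface shown above:

```python
from collections.abc import Iterable
from typing import Union

import numpy as np
from PIL.Image import Image

from docling.datamodel.base_models import VlmPrediction
from docling.models.base_model import BaseVlmModel


class EchoVlmModel(BaseVlmModel):
    """Hypothetical stub demonstrating the process_images contract."""

    def process_images(
        self,
        image_batch: Iterable[Union[Image, np.ndarray]],
        prompt: Union[str, list[str]],
    ) -> Iterable[VlmPrediction]:
        images = list(image_batch)
        # A str prompt is broadcast to every image; a list must match 1:1.
        prompts = [prompt] * len(images) if isinstance(prompt, str) else prompt
        if len(prompts) != len(images):
            raise ValueError("prompt list length must match image count")
        for p in prompts:
            yield VlmPrediction(text=f"(would run a VLM with prompt: {p})")
```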
docling/models/page_preprocessing_model.py
CHANGED
@@ -17,6 +17,9 @@ from docling.utils.profiling import TimeRecorder
 
 class PagePreprocessingOptions(BaseModel):
     images_scale: Optional[float]
+    skip_cell_extraction: bool = (
+        False  # Skip text cell extraction for VLM-only processing
+    )
 
 
 class PagePreprocessingModel(BasePageModel):
@@ -41,7 +44,8 @@ class PagePreprocessingModel(BasePageModel):
         else:
             with TimeRecorder(conv_res, "page_parse"):
                 page = self._populate_page_images(page)
-                page = self._parse_page_cells(conv_res, page)
+                if not self.options.skip_cell_extraction:
+                    page = self._parse_page_cells(conv_res, page)
             yield page
 
     # Generate the page image and store it in the page object
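A sketch of the new flag in isolation (pipelines normally construct this model internally, so the direct instantiation below is illustrative only):

```python
from docling.models.page_preprocessing_model import (
    PagePreprocessingModel,
    PagePreprocessingOptions,
)

# Illustrative: with skip_cell_extraction=True the model still renders the
# page images but no longer parses programmatic text cells, which is what
# a VLM-only pipeline wants.
preprocessor = PagePreprocessingModel(
    options=PagePreprocessingOptions(images_scale=2.0, skip_cell_extraction=True)
)
```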
docling/models/picture_description_vlm_model.py
CHANGED
@@ -4,6 +4,7 @@ from pathlib import Path
 from typing import Optional, Type, Union
 
 from PIL import Image
+from transformers import AutoModelForImageTextToText
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.pipeline_options import (
@@ -63,7 +64,7 @@ class PictureDescriptionVlmModel(
         # Initialize processor and model
         with _model_init_lock:
             self.processor = AutoProcessor.from_pretrained(artifacts_path)
-            self.model =
+            self.model = AutoModelForImageTextToText.from_pretrained(
                 artifacts_path,
                 device_map=self.device,
                 torch_dtype=torch.bfloat16,
@@ -71,9 +72,10 @@ class PictureDescriptionVlmModel(
                     "flash_attention_2"
                     if self.device.startswith("cuda")
                     and accelerator_options.cuda_use_flash_attention2
-                    else "
+                    else "sdpa"
                 ),
             )
+            self.model = torch.compile(self.model)  # type: ignore
 
         self.provenance = f"{self.options.repo_id}"
 
docling/models/vlm_models_inline/__init__.py
CHANGED
@@ -0,0 +1 @@
+
docling/models/vlm_models_inline/hf_transformers_model.py
CHANGED
@@ -3,7 +3,11 @@ import logging
 import time
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, Union
+
+import numpy as np
+from PIL.Image import Image
+from transformers import StoppingCriteriaList, StopStringCriteria
 
 from docling.datamodel.accelerator_options import (
     AcceleratorOptions,
@@ -15,7 +19,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     TransformersModelType,
     TransformersPromptStyle,
 )
-from docling.models.base_model import
+from docling.models.base_model import BaseVlmPageModel
 from docling.models.utils.hf_model_download import (
     HuggingFaceModelDownloadMixin,
 )
@@ -25,7 +29,7 @@ from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
 
 
-class HuggingFaceTransformersVlmModel(
+class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         enabled: bool,
@@ -103,6 +107,8 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             artifacts_path,
             trust_remote_code=vlm_options.trust_remote_code,
         )
+        self.processor.tokenizer.padding_side = "left"
+
         self.vlm_model = model_cls.from_pretrained(
             artifacts_path,
             device_map=self.device,
@@ -111,10 +117,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 "flash_attention_2"
                 if self.device.startswith("cuda")
                 and accelerator_options.cuda_use_flash_attention2
-                else "
+                else "sdpa"
             ),
             trust_remote_code=vlm_options.trust_remote_code,
         )
+        self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
 
         # Load generation config
         self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
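Setting `padding_side = "left"` is what makes the later `generated_ids[:, input_len:]` slice valid for a whole batch. An illustrative sketch with a plain tokenizer (the repo choice is arbitrary):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
tok.padding_side = "left"

# With left padding, every prompt in the batch ends at the same index,
# so slicing off a common prompt length isolates only the new tokens.
batch = tok(["short prompt", "a much longer prompt here"], padding=True)
print([len(ids) for ids in batch["input_ids"]])  # equal lengths, pads on the left
```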
@@ -122,93 +129,186 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-        [old line 125 not rendered in the source diff]
+        page_list = list(page_batch)
+        if not page_list:
+            return
+
+        valid_pages = []
+        invalid_pages = []
+
+        for page in page_list:
             assert page._backend is not None
             if not page._backend.is_valid():
-        [old line 128 not rendered]
+                invalid_pages.append(page)
             else:
-        [old line 130 not rendered]
-                assert page.size is not None
+                valid_pages.append(page)
 
+        # Process valid pages in batch
+        if valid_pages:
+            with TimeRecorder(conv_res, "vlm"):
+                # Prepare images and prompts for batch processing
+                images = []
+                user_prompts = []
+                pages_with_images = []
+
+                for page in valid_pages:
+                    assert page.size is not None
                 hi_res_image = page.get_image(
                     scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
                 )
 
-                #
-        [old lines 138-140 not rendered]
-                inputs = self.processor(
-                    text=prompt, images=[hi_res_image], return_tensors="pt"
-                ).to(self.device)
-        [old line 144 not rendered]
-                start_time = time.time()
-                # Call model to generate:
-                generated_ids = self.vlm_model.generate(
-                    **inputs,
-                    max_new_tokens=self.max_new_tokens,
-                    use_cache=self.use_cache,
-                    temperature=self.temperature,
-                    generation_config=self.generation_config,
-                    **self.vlm_options.extra_generation_config,
-                )
+                    # Only process pages with valid images
+                    if hi_res_image is not None:
+                        images.append(hi_res_image)
 
-        [old lines 156-157 not rendered]
-                    generated_ids[:, inputs["input_ids"].shape[1] :],
-                    skip_special_tokens=False,
-                )[0]
+                        # Define prompt structure
+                        user_prompt = self.vlm_options.build_prompt(page.parsed_page)
 
-        [old lines 162-169 not rendered]
+                        user_prompts.append(user_prompt)
+                        pages_with_images.append(page)
+
+            # Use process_images for the actual inference
+            if images:  # Only if we have valid images
+                predictions = list(self.process_images(images, user_prompts))
+
+                # Attach results to pages
+                for page, prediction in zip(pages_with_images, predictions):
+                    page.predictions.vlm_response = prediction
+
+        # Yield all pages (valid and invalid)
+        for page in invalid_pages:
+            yield page
+        for page in valid_pages:
+            yield page
+
+    def process_images(
+        self,
+        image_batch: Iterable[Union[Image, np.ndarray]],
+        prompt: Union[str, list[str]],
+    ) -> Iterable[VlmPrediction]:
+        """
+        Batched inference for Hugging Face Image-Text-to-Text VLMs (e.g., SmolDocling / SmolVLM).
+        - Lets the processor handle all padding & batching for text+images.
+        - Trims generated sequences per row using attention_mask (no pad-id fallbacks).
+        - Keeps your formulate_prompt() exactly as-is.
+        """
+        import numpy as np
+        import torch
+        from PIL import Image as PILImage
+
+        # -- Normalize images to RGB PIL (SmolDocling & friends accept PIL/np via processor)
+        pil_images: list[Image] = []
+        for img in image_batch:
+            if isinstance(img, np.ndarray):
+                if img.ndim == 3 and img.shape[2] in (3, 4):
+                    pil_img = PILImage.fromarray(img.astype(np.uint8))
+                elif img.ndim == 2:
+                    pil_img = PILImage.fromarray(img.astype(np.uint8), mode="L")
+                else:
+                    raise ValueError(f"Unsupported numpy array shape: {img.shape}")
+            else:
+                pil_img = img
+            if pil_img.mode != "RGB":
+                pil_img = pil_img.convert("RGB")
+            pil_images.append(pil_img)
+
+        if not pil_images:
+            return
+
+        # -- Normalize prompts (1 per image)
+        if isinstance(prompt, str):
+            user_prompts = [prompt] * len(pil_images)
+        else:
+            if len(prompt) != len(pil_images):
+                raise ValueError(
+                    f"Number of prompts ({len(prompt)}) must match number of images ({len(pil_images)})"
+                )
+            user_prompts = prompt
+
+        # Use your prompt formatter verbatim
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.NONE:
+            inputs = self.processor(
+                pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                **self.vlm_options.extra_processor_kwargs,
+            )
+        else:
+            prompts: list[str] = [self.formulate_prompt(p) for p in user_prompts]
+
+            # -- Processor performs BOTH text+image preprocessing + batch padding (recommended)
+            inputs = self.processor(
+                text=prompts,
+                images=pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                **self.vlm_options.extra_processor_kwargs,
+            )
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+        # -- Optional stopping criteria
+        stopping_criteria = None
+        if self.vlm_options.stop_strings:
+            stopping_criteria = StoppingCriteriaList(
+                [
+                    StopStringCriteria(
+                        stop_strings=self.vlm_options.stop_strings,
+                        tokenizer=self.processor.tokenizer,
                     )
+                ]
+            )
 
-        [old lines 172-203 not rendered]
-                    ],
-                }
-            ]
-            prompt = self.processor.apply_chat_template(
-                messages, add_generation_prompt=False
+        # -- Generate (Image-Text-to-Text class expects these inputs from processor)
+        gen_kwargs = {
+            **inputs,
+            "max_new_tokens": self.max_new_tokens,
+            "use_cache": self.use_cache,
+            "generation_config": self.generation_config,
+            **self.vlm_options.extra_generation_config,
+        }
+        if self.temperature > 0:
+            gen_kwargs["do_sample"] = True
+            gen_kwargs["temperature"] = self.temperature
+        else:
+            gen_kwargs["do_sample"] = False
+
+        if stopping_criteria is not None:
+            gen_kwargs["stopping_criteria"] = stopping_criteria
+
+        start_time = time.time()
+        with torch.inference_mode():
+            generated_ids = self.vlm_model.generate(**gen_kwargs)
+        generation_time = time.time() - start_time
+
+        input_len = inputs["input_ids"].shape[1]  # common right-aligned prompt length
+        trimmed_sequences = generated_ids[:, input_len:]  # only newly generated tokens
+
+        # -- Decode with the processor/tokenizer (skip specials, keep DocTags as text)
+        decode_fn = getattr(self.processor, "batch_decode", None)
+        if decode_fn is None and getattr(self.processor, "tokenizer", None) is not None:
+            decode_fn = self.processor.tokenizer.batch_decode
+        if decode_fn is None:
+            raise RuntimeError(
+                "Neither processor.batch_decode nor tokenizer.batch_decode is available."
             )
-        return prompt
 
-        [old lines 212-213 not rendered]
+        decoded_texts: list[str] = decode_fn(
+            trimmed_sequences, skip_special_tokens=False
         )
+
+        # -- Clip off pad tokens from decoded texts
+        pad_token = self.processor.tokenizer.pad_token
+        if pad_token:
+            decoded_texts = [text.rstrip(pad_token) for text in decoded_texts]
+
+        # -- Optional logging
+        if generated_ids.shape[0] > 0:
+            _log.debug(
+                f"Generated {int(generated_ids[0].shape[0])} tokens in {generation_time:.2f}s "
+                f"for batch size {generated_ids.shape[0]}."
+            )
+
+        for text in decoded_texts:
+            # Apply decode_response to the output text
+            decoded_text = self.vlm_options.decode_response(text)
+            yield VlmPrediction(text=decoded_text, generation_time=generation_time)