docling 2.45.0__py3-none-any.whl → 2.47.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries, and is provided for informational purposes only.
- docling/backend/docling_parse_v4_backend.py +61 -27
- docling/backend/html_backend.py +119 -17
- docling/backend/msword_backend.py +126 -16
- docling/cli/main.py +14 -0
- docling/cli/models.py +56 -0
- docling/datamodel/base_models.py +1 -1
- docling/datamodel/pipeline_options.py +4 -3
- docling/datamodel/pipeline_options_vlm_model.py +5 -0
- docling/datamodel/vlm_model_specs.py +114 -1
- docling/models/base_model.py +95 -2
- docling/models/code_formula_model.py +87 -76
- docling/models/page_preprocessing_model.py +5 -1
- docling/models/picture_description_vlm_model.py +4 -2
- docling/models/tesseract_ocr_cli_model.py +4 -2
- docling/models/vlm_models_inline/__init__.py +1 -0
- docling/models/vlm_models_inline/hf_transformers_model.py +179 -79
- docling/models/vlm_models_inline/mlx_model.py +179 -68
- docling/models/vlm_models_inline/vllm_model.py +235 -0
- docling/pipeline/base_pipeline.py +7 -1
- docling/pipeline/threaded_standard_pdf_pipeline.py +7 -5
- docling/pipeline/vlm_pipeline.py +14 -1
- docling/utils/layout_postprocessor.py +51 -43
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/METADATA +3 -2
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/RECORD +28 -27
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/WHEEL +0 -0
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/entry_points.txt +0 -0
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/top_level.txt +0 -0
docling/models/code_formula_model.py
@@ -1,5 +1,4 @@
 import re
-from collections import Counter
 from collections.abc import Iterable
 from pathlib import Path
 from typing import List, Literal, Optional, Tuple, Union
@@ -13,10 +12,11 @@ from docling_core.types.doc import (
     TextItem,
 )
 from docling_core.types.doc.labels import CodeLanguageLabel
-from PIL import Image
+from PIL import Image
 from pydantic import BaseModel
+from transformers import AutoModelForImageTextToText, AutoProcessor

-from docling.datamodel.accelerator_options import AcceleratorOptions
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement
 from docling.models.base_model import BaseItemAndImageEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
@@ -65,9 +65,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         Processes the given batch of elements and enriches them with predictions.
     """

-    _model_repo_folder = "ds4sd--
+    _model_repo_folder = "ds4sd--CodeFormulaV2"
     elements_batch_size = 5
-    images_scale = 1.
+    images_scale = 1.67  # = 120 dpi, aligned with training data resolution
     expansion_factor = 0.18

     def __init__(
@@ -95,10 +95,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         self.options = options

         if self.enabled:
-            device = decide_device(
-
-
-                CodeFormulaPredictor,
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
             )

             if artifacts_path is None:
@@ -106,11 +105,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
             else:
                 artifacts_path = artifacts_path / self._model_repo_folder

-            self.
-                artifacts_path
-
-
+            self._processor = AutoProcessor.from_pretrained(
+                artifacts_path,
+            )
+            self._model_max_length = self._processor.tokenizer.model_max_length
+            self._model = AutoModelForImageTextToText.from_pretrained(
+                artifacts_path, device_map=self.device
             )
+            self._model.eval()

     @staticmethod
     def download_models(
@@ -119,8 +121,8 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         progress: bool = False,
     ) -> Path:
         return download_hf_model(
-            repo_id="ds4sd/
-            revision="
+            repo_id="ds4sd/CodeFormulaV2",
+            revision="main",
             local_dir=local_dir,
             force=force,
             progress=progress,
@@ -172,7 +174,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
            - The second element is the extracted language if a match is found;
              otherwise, `None`.
         """
-        pattern = r"^<_([^_>]+)_>\s(.*)"
+        pattern = r"^<_([^_>]+)_>\s*(.*)"
         match = re.match(pattern, input_string, flags=re.DOTALL)
         if match:
             language = str(match.group(1))  # the captured programming language
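The only functional change in this hunk is the prompt-prefix regex: `\s` becomes `\s*`, so a language tag followed immediately by content (with no whitespace after the tag) now also matches. A minimal sketch of the effect, using made-up input strings:

    import re

    pattern = r"^<_([^_>]+)_>\s*(.*)"
    # With the old `\s`, the second string below would not match, because at least
    # one whitespace character was required after the language tag.
    for s in ("<_python_> print(1)", "<_python_>print(1)"):
        m = re.match(pattern, s, flags=re.DOTALL)
        print(m.group(1), m.group(2))  # -> "python print(1)" in both cases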
@@ -203,81 +205,74 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         except ValueError:
             return CodeLanguageLabel.UNKNOWN

-    def
+    def _get_prompt(self, label: str) -> str:
         """
-
+        Constructs the prompt for the model based on the input label.

         Parameters
         ----------
-
-
+        label : str
+            The type of input, either 'code' or 'formula'.

         Returns
         -------
-
-
+        str
+            The constructed prompt including necessary tokens and query.
+
+        Raises
+        ------
+        NotImplementedError
+            If the label is not 'code' or 'formula'.
         """
-
-
+        if label == "code":
+            query = "<code>"
+        elif label == "formula":
+            query = "<formula>"
+        else:
+            raise NotImplementedError("Label must be either code or formula")

-
-
-
-
-
-
-            right = img_np[:, -1]  # shape (H,)
+        messages = [
+            {
+                "role": "user",
+                "content": [{"type": "image"}, {"type": "text", "text": query}],
+            },
+        ]

-
-
+        prompt = self._processor.apply_chat_template(
+            messages, add_generation_prompt=True
+        )

-
-            freq = Counter(edges.tolist())
-            most_common_value, _ = freq.most_common(1)[0]
-            return int(most_common_value)  # single channel color
+        return prompt

-
-        # Color image: shape (H, W, C)
-        top = img_np[0, :, :]  # shape (W, C)
-        bottom = img_np[-1, :, :]  # shape (W, C)
-        left = img_np[:, 0, :]  # shape (H, C)
-        right = img_np[:, -1, :]  # shape (H, C)
-
-        # Concatenate edges along first axis
-        edges = np.concatenate([top, bottom, left, right], axis=0)
-
-        # Convert each color to a tuple for counting
-        edges_as_tuples = [tuple(pixel) for pixel in edges]
-        freq = Counter(edges_as_tuples)
-        most_common_value, _ = freq.most_common(1)[0]
-        return most_common_value  # e.g. (R, G, B) or (R, G, B, A)
-
-    def _pad_with_most_frequent_edge_color(
-        self, img: Union[Image.Image, np.ndarray], padding: Tuple[int, int, int, int]
-    ):
+    def _post_process(self, texts: list[str]) -> list[str]:
         """
-
+        Processes a list of text strings by truncating at '<end_of_utterance>' and
+        removing a predefined set of unwanted substrings.

         Parameters
         ----------
-
-
-        padding : tuple
-            Padding (left, top, right, bottom) in pixels.
+        texts : list[str]
+            A list of strings to be post-processed.

         Returns
         -------
-
+        list[str]
+            A list of cleaned strings with specified substrings removed and truncated at
+            '<end_of_utterance>' if present.
         """
-
-            pil_img = Image.fromarray(img)
-        else:
-            pil_img = img
+        to_remove = ["</code>", "</formula>", "<loc_0><loc_0><loc_500><loc_500>"]

-
+        def clean_text(text: str) -> str:
+            idx = text.find("<end_of_utterance>")
+            if idx != -1:
+                text = text[:idx]

-
-
+            for token in to_remove:
+                if token in text:
+                    text = text.replace(token, "")
+            return text.lstrip()
+
+        return [clean_text(t) for t in texts]

     def __call__(
         self,
@@ -308,14 +303,30 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[TextItem] = []
         for el in element_batch:
-
-
-
-
-
-
+            elements.append(el.item)  # type: ignore[arg-type]
+            labels.append(el.item.label)  # type: ignore[attr-defined]
+            images.append(el.image)
+
+        prompts = [self._get_prompt(label) for label in labels]
+        inputs = self._processor(
+            text=prompts,
+            images=images,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(self.device)

-
+        gen_kwargs = dict(
+            max_new_tokens=self._model_max_length - inputs.input_ids.shape[1],
+            use_cache=True,
+            do_sample=False,
+        )
+
+        generated_ids = self._model.generate(**inputs, **gen_kwargs)
+
+        outputs = self._processor.batch_decode(
+            generated_ids[:, inputs.input_ids.shape[1] :], skip_special_tokens=False
+        )
+        outputs = self._post_process(outputs)

         for item, output in zip(elements, outputs):
             if isinstance(item, CodeItem):
docling/models/page_preprocessing_model.py
@@ -17,6 +17,9 @@ from docling.utils.profiling import TimeRecorder

 class PagePreprocessingOptions(BaseModel):
     images_scale: Optional[float]
+    skip_cell_extraction: bool = (
+        False  # Skip text cell extraction for VLM-only processing
+    )


 class PagePreprocessingModel(BasePageModel):
@@ -41,7 +44,8 @@ class PagePreprocessingModel(BasePageModel):
             else:
                 with TimeRecorder(conv_res, "page_parse"):
                     page = self._populate_page_images(page)
-
+                    if not self.options.skip_cell_extraction:
+                        page = self._parse_page_cells(conv_res, page)
                 yield page

     # Generate the page image and store it in the page object
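The new skip_cell_extraction flag lets a VLM-only pipeline skip the programmatic text-cell parsing while still producing page images. A minimal sketch of how the option could be wired up (the PagePreprocessingModel constructor signature is assumed, it is not shown in this diff):

    from docling.models.page_preprocessing_model import (
        PagePreprocessingModel,
        PagePreprocessingOptions,
    )

    # Cells are not needed when a VLM re-reads the rendered page image anyway.
    options = PagePreprocessingOptions(images_scale=2.0, skip_cell_extraction=True)
    preprocessing = PagePreprocessingModel(options=options)  # assumed signature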
docling/models/picture_description_vlm_model.py
@@ -4,6 +4,7 @@ from pathlib import Path
 from typing import Optional, Type, Union

 from PIL import Image
+from transformers import AutoModelForImageTextToText

 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.pipeline_options import (
@@ -63,7 +64,7 @@ class PictureDescriptionVlmModel(
         # Initialize processor and model
         with _model_init_lock:
             self.processor = AutoProcessor.from_pretrained(artifacts_path)
-            self.model =
+            self.model = AutoModelForImageTextToText.from_pretrained(
                 artifacts_path,
                 device_map=self.device,
                 torch_dtype=torch.bfloat16,
@@ -71,9 +72,10 @@ class PictureDescriptionVlmModel(
                     "flash_attention_2"
                     if self.device.startswith("cuda")
                     and accelerator_options.cuda_use_flash_attention2
-                    else "
+                    else "sdpa"
                 ),
             )
+            self.model = torch.compile(self.model)  # type: ignore

         self.provenance = f"{self.options.repo_id}"

docling/models/tesseract_ocr_cli_model.py
@@ -320,6 +320,8 @@ class TesseractOcrCliModel(BaseOcrModel):


 def _parse_orientation(df_osd: pd.DataFrame) -> int:
-
-
+    # For strictly optimal performance with invariant dataframe format:
+    mask = df_osd["key"].to_numpy() == "Orientation in degrees"
+    orientation_val = df_osd["value"].to_numpy()[mask][0]
+    orientation = parse_tesseract_orientation(orientation_val.strip())
     return orientation
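The rewritten _parse_orientation replaces a row lookup with a vectorized NumPy mask over the Tesseract OSD dataframe. A small illustration with an invented two-row OSD table (only the column names and the lookup key come from the diff):

    import pandas as pd

    df_osd = pd.DataFrame(
        {"key": ["Page number", "Orientation in degrees"], "value": ["0", " 90"]}
    )
    mask = df_osd["key"].to_numpy() == "Orientation in degrees"
    orientation_val = df_osd["value"].to_numpy()[mask][0]
    print(orientation_val.strip())  # "90", then normalized by parse_tesseract_orientation()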
docling/models/vlm_models_inline/__init__.py
@@ -0,0 +1 @@
+
docling/models/vlm_models_inline/hf_transformers_model.py
@@ -3,7 +3,11 @@ import logging
 import time
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, Union
+
+import numpy as np
+from PIL.Image import Image
+from transformers import StoppingCriteriaList, StopStringCriteria

 from docling.datamodel.accelerator_options import (
     AcceleratorOptions,
@@ -15,7 +19,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     TransformersModelType,
     TransformersPromptStyle,
 )
-from docling.models.base_model import
+from docling.models.base_model import BaseVlmPageModel
 from docling.models.utils.hf_model_download import (
     HuggingFaceModelDownloadMixin,
 )
@@ -25,7 +29,7 @@ from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)


-class HuggingFaceTransformersVlmModel(
+class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         enabled: bool,
@@ -103,6 +107,8 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             artifacts_path,
             trust_remote_code=vlm_options.trust_remote_code,
         )
+        self.processor.tokenizer.padding_side = "left"
+
         self.vlm_model = model_cls.from_pretrained(
             artifacts_path,
             device_map=self.device,
@@ -111,10 +117,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 "flash_attention_2"
                 if self.device.startswith("cuda")
                 and accelerator_options.cuda_use_flash_attention2
-                else "
+                else "sdpa"
             ),
             trust_remote_code=vlm_options.trust_remote_code,
         )
+        self.vlm_model = torch.compile(self.vlm_model)  # type: ignore

         # Load generation config
         self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
@@ -122,93 +129,186 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
+        page_list = list(page_batch)
+        if not page_list:
+            return
+
+        valid_pages = []
+        invalid_pages = []
+
+        for page in page_list:
             assert page._backend is not None
             if not page._backend.is_valid():
-
+                invalid_pages.append(page)
             else:
-
-                assert page.size is not None
+                valid_pages.append(page)

+        # Process valid pages in batch
+        if valid_pages:
+            with TimeRecorder(conv_res, "vlm"):
+                # Prepare images and prompts for batch processing
+                images = []
+                user_prompts = []
+                pages_with_images = []
+
+                for page in valid_pages:
+                    assert page.size is not None
                     hi_res_image = page.get_image(
                         scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
                     )

-                #
-
-
-
-                inputs = self.processor(
-                    text=prompt, images=[hi_res_image], return_tensors="pt"
-                ).to(self.device)
-
-                start_time = time.time()
-                # Call model to generate:
-                generated_ids = self.vlm_model.generate(
-                    **inputs,
-                    max_new_tokens=self.max_new_tokens,
-                    use_cache=self.use_cache,
-                    temperature=self.temperature,
-                    generation_config=self.generation_config,
-                    **self.vlm_options.extra_generation_config,
-                )
+                    # Only process pages with valid images
+                    if hi_res_image is not None:
+                        images.append(hi_res_image)

-
-
-                    generated_ids[:, inputs["input_ids"].shape[1] :],
-                    skip_special_tokens=False,
-                )[0]
+                    # Define prompt structure
+                    user_prompt = self.vlm_options.build_prompt(page.parsed_page)

-
-
-
-
-
-
-
-
+                    user_prompts.append(user_prompt)
+                    pages_with_images.append(page)
+
+                # Use process_images for the actual inference
+                if images:  # Only if we have valid images
+                    predictions = list(self.process_images(images, user_prompts))
+
+                    # Attach results to pages
+                    for page, prediction in zip(pages_with_images, predictions):
+                        page.predictions.vlm_response = prediction
+
+        # Yield all pages (valid and invalid)
+        for page in invalid_pages:
+            yield page
+        for page in valid_pages:
+            yield page
+
+    def process_images(
+        self,
+        image_batch: Iterable[Union[Image, np.ndarray]],
+        prompt: Union[str, list[str]],
+    ) -> Iterable[VlmPrediction]:
+        """
+        Batched inference for Hugging Face Image-Text-to-Text VLMs (e.g., SmolDocling / SmolVLM).
+        - Lets the processor handle all padding & batching for text+images.
+        - Trims generated sequences per row using attention_mask (no pad-id fallbacks).
+        - Keeps your formulate_prompt() exactly as-is.
+        """
+        import numpy as np
+        import torch
+        from PIL import Image as PILImage
+
+        # -- Normalize images to RGB PIL (SmolDocling & friends accept PIL/np via processor)
+        pil_images: list[Image] = []
+        for img in image_batch:
+            if isinstance(img, np.ndarray):
+                if img.ndim == 3 and img.shape[2] in (3, 4):
+                    pil_img = PILImage.fromarray(img.astype(np.uint8))
+                elif img.ndim == 2:
+                    pil_img = PILImage.fromarray(img.astype(np.uint8), mode="L")
+                else:
+                    raise ValueError(f"Unsupported numpy array shape: {img.shape}")
+            else:
+                pil_img = img
+            if pil_img.mode != "RGB":
+                pil_img = pil_img.convert("RGB")
+            pil_images.append(pil_img)
+
+        if not pil_images:
+            return
+
+        # -- Normalize prompts (1 per image)
+        if isinstance(prompt, str):
+            user_prompts = [prompt] * len(pil_images)
+        else:
+            if len(prompt) != len(pil_images):
+                raise ValueError(
+                    f"Number of prompts ({len(prompt)}) must match number of images ({len(pil_images)})"
+                )
+            user_prompts = prompt
+
+        # Use your prompt formatter verbatim
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.NONE:
+            inputs = self.processor(
+                pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                **self.vlm_options.extra_processor_kwargs,
+            )
+        else:
+            prompts: list[str] = [self.formulate_prompt(p) for p in user_prompts]
+
+            # -- Processor performs BOTH text+image preprocessing + batch padding (recommended)
+            inputs = self.processor(
+                text=prompts,
+                images=pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                **self.vlm_options.extra_processor_kwargs,
+            )
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+        # -- Optional stopping criteria
+        stopping_criteria = None
+        if self.vlm_options.stop_strings:
+            stopping_criteria = StoppingCriteriaList(
+                [
+                    StopStringCriteria(
+                        stop_strings=self.vlm_options.stop_strings,
+                        tokenizer=self.processor.tokenizer,
                     )
+                ]
+            )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        ],
-                    }
-                ]
-                prompt = self.processor.apply_chat_template(
-                    messages, add_generation_prompt=False
+        # -- Generate (Image-Text-to-Text class expects these inputs from processor)
+        gen_kwargs = {
+            **inputs,
+            "max_new_tokens": self.max_new_tokens,
+            "use_cache": self.use_cache,
+            "generation_config": self.generation_config,
+            **self.vlm_options.extra_generation_config,
+        }
+        if self.temperature > 0:
+            gen_kwargs["do_sample"] = True
+            gen_kwargs["temperature"] = self.temperature
+        else:
+            gen_kwargs["do_sample"] = False
+
+        if stopping_criteria is not None:
+            gen_kwargs["stopping_criteria"] = stopping_criteria
+
+        start_time = time.time()
+        with torch.inference_mode():
+            generated_ids = self.vlm_model.generate(**gen_kwargs)
+        generation_time = time.time() - start_time
+
+        input_len = inputs["input_ids"].shape[1]  # common right-aligned prompt length
+        trimmed_sequences = generated_ids[:, input_len:]  # only newly generated tokens
+
+        # -- Decode with the processor/tokenizer (skip specials, keep DocTags as text)
+        decode_fn = getattr(self.processor, "batch_decode", None)
+        if decode_fn is None and getattr(self.processor, "tokenizer", None) is not None:
+            decode_fn = self.processor.tokenizer.batch_decode
+        if decode_fn is None:
+            raise RuntimeError(
+                "Neither processor.batch_decode nor tokenizer.batch_decode is available."
             )
-        return prompt

-
-
+        decoded_texts: list[str] = decode_fn(
+            trimmed_sequences, skip_special_tokens=False
         )
+
+        # -- Clip off pad tokens from decoded texts
+        pad_token = self.processor.tokenizer.pad_token
+        if pad_token:
+            decoded_texts = [text.rstrip(pad_token) for text in decoded_texts]
+
+        # -- Optional logging
+        if generated_ids.shape[0] > 0:
+            _log.debug(
+                f"Generated {int(generated_ids[0].shape[0])} tokens in {generation_time:.2f}s "
+                f"for batch size {generated_ids.shape[0]}."
+            )
+
+        for text in decoded_texts:
+            # Apply decode_response to the output text
+            decoded_text = self.vlm_options.decode_response(text)
+            yield VlmPrediction(text=decoded_text, generation_time=generation_time)