docling 2.53.0__py3-none-any.whl → 2.55.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +1 -1
- docling/backend/html_backend.py +254 -136
- docling/backend/md_backend.py +4 -1
- docling/backend/msword_backend.py +177 -76
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/jats_backend.py +111 -7
- docling/backend/xml/uspto_backend.py +1 -1
- docling/cli/main.py +5 -0
- docling/datamodel/base_models.py +23 -23
- docling/datamodel/document.py +2 -0
- docling/datamodel/pipeline_options_vlm_model.py +13 -2
- docling/datamodel/vlm_model_specs.py +9 -0
- docling/document_converter.py +4 -0
- docling/models/api_vlm_model.py +45 -16
- docling/models/base_model.py +2 -1
- docling/models/readingorder_model.py +1 -1
- docling/models/table_structure_model.py +3 -3
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +6 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +75 -14
- docling/models/vlm_models_inline/mlx_model.py +58 -1
- docling/models/vlm_models_inline/vllm_model.py +189 -124
- docling/utils/api_image_request.py +107 -1
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/METADATA +5 -5
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/RECORD +29 -27
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/WHEEL +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/entry_points.txt +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/top_level.txt +0 -0
docling/models/vlm_models_inline/vllm_model.py

@@ -7,9 +7,7 @@ from typing import Any, Dict, Optional, Union
 import numpy as np
 from PIL.Image import Image
 
-from docling.datamodel.accelerator_options import (
-    AcceleratorOptions,
-)
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
@@ -17,9 +15,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     TransformersPromptStyle,
 )
 from docling.models.base_model import BaseVlmPageModel
-from docling.models.utils.hf_model_download import (
-    HuggingFaceModelDownloadMixin,
-)
+from docling.models.utils.hf_model_download import HuggingFaceModelDownloadMixin
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
@@ -27,6 +23,62 @@ _log = logging.getLogger(__name__)
 
 
 class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
+    """
+    vLLM-backed vision-language model that accepts PIL images (or numpy arrays)
+    via vLLM's multi_modal_data, with prompt formatting handled by formulate_prompt().
+    """
+
+    # --------- Allowlist of vLLM args ---------
+    # SamplingParams (runtime generation controls)
+    _VLLM_SAMPLING_KEYS = {
+        # Core
+        "max_tokens",
+        "temperature",
+        "top_p",
+        "top_k",
+        # Penalties
+        "presence_penalty",
+        "frequency_penalty",
+        "repetition_penalty",
+        # Stops / outputs
+        "stop",
+        "stop_token_ids",
+        "skip_special_tokens",
+        "spaces_between_special_tokens",
+        # Search / length
+        "n",
+        "best_of",
+        "length_penalty",
+        "early_stopping",
+        # Misc
+        "logprobs",
+        "prompt_logprobs",
+        "min_p",
+        "seed",
+    }
+
+    # LLM(...) / EngineArgs (engine/load-time controls)
+    _VLLM_ENGINE_KEYS = {
+        # Model/tokenizer/impl
+        "tokenizer",
+        "tokenizer_mode",
+        "download_dir",
+        # Parallelism / memory / lengths
+        "tensor_parallel_size",
+        "pipeline_parallel_size",
+        "gpu_memory_utilization",
+        "max_model_len",
+        "max_num_batched_tokens",
+        "kv_cache_dtype",
+        "dtype",
+        # Quantization (coarse switch)
+        "quantization",
+        # Multimodal limits
+        "limit_mm_per_prompt",
+        # Execution toggles
+        "enforce_eager",
+    }
+
     def __init__(
         self,
         enabled: bool,
@@ -35,120 +87,147 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         vlm_options: InlineVlmOptions,
     ):
         self.enabled = enabled
-
         self.vlm_options = vlm_options
 
-
-
-
-
-
-
-            supported_devices=vlm_options.supported_devices,
-        )
-        _log.debug(f"Available device for VLM: {self.device}")
-
-        self.max_new_tokens = vlm_options.max_new_tokens
-        self.temperature = vlm_options.temperature
-
-        repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+        self.llm = None
+        self.sampling_params = None
+        self.processor = None  # used for CHAT templating in formulate_prompt()
+        self.device = "cpu"
+        self.max_new_tokens = vlm_options.max_new_tokens
+        self.temperature = vlm_options.temperature
 
-
-
-        elif (artifacts_path / repo_cache_folder).exists():
-            artifacts_path = artifacts_path / repo_cache_folder
-
-        # Initialize VLLM LLM
-        llm_kwargs: Dict[str, Any] = {
-            "model": str(artifacts_path),
-            "limit_mm_per_prompt": {"image": 1},
-            "trust_remote_code": vlm_options.trust_remote_code,
-            "model_impl": "transformers",
-            "gpu_memory_utilization": 0.3,  # hardcoded for now, leaves room for ~3 different models.
-        }
-
-        # Add device-specific configurations
-
-        if self.device == "cpu":
-            llm_kwargs["device"] = "cpu"
+        if not self.enabled:
+            return
 
-
-
-        if vlm_options.load_in_8bit:
-            llm_kwargs["quantization"] = "bitsandbytes"
+        from transformers import AutoProcessor
+        from vllm import LLM, SamplingParams
 
-
+        # Device selection
+        self.device = decide_device(
+            accelerator_options.device, supported_devices=vlm_options.supported_devices
+        )
+        _log.debug(f"Available device for VLM: {self.device}")
 
-
-
-
-
+        # Resolve artifacts path / cache folder
+        repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+        if artifacts_path is None:
+            artifacts_path = self.download_models(
+                self.vlm_options.repo_id, revision=self.vlm_options.revision
            )
-
-
-
-
-
-
-
+        elif (artifacts_path / repo_cache_folder).exists():
+            artifacts_path = artifacts_path / repo_cache_folder
+
+        # --------- Strict split & validation of extra_generation_config ---------
+        extra_cfg = self.vlm_options.extra_generation_config
+
+        load_cfg = {k: v for k, v in extra_cfg.items() if k in self._VLLM_ENGINE_KEYS}
+        gen_cfg = {k: v for k, v in extra_cfg.items() if k in self._VLLM_SAMPLING_KEYS}
+
+        unknown = sorted(
+            k
+            for k in extra_cfg.keys()
+            if k not in self._VLLM_ENGINE_KEYS and k not in self._VLLM_SAMPLING_KEYS
+        )
+        if unknown:
+            _log.warning(
+                "Ignoring unknown extra_generation_config keys for vLLM: %s", unknown
            )
 
+        # --------- Construct LLM kwargs (engine/load-time) ---------
+        llm_kwargs: Dict[str, Any] = {
+            "model": str(artifacts_path),
+            "model_impl": "transformers",
+            "limit_mm_per_prompt": {"image": 1},
+            "revision": self.vlm_options.revision,
+            "trust_remote_code": self.vlm_options.trust_remote_code,
+            **load_cfg,
+        }
+
+        if self.device == "cpu":
+            llm_kwargs.setdefault("enforce_eager", True)
+        else:
+            llm_kwargs.setdefault(
+                "gpu_memory_utilization", 0.3
+            )  # room for other models
+
+        # Quantization (kept as-is; coarse)
+        if self.vlm_options.quantized and self.vlm_options.load_in_8bit:
+            llm_kwargs.setdefault("quantization", "bitsandbytes")
+
+        # Initialize vLLM LLM
+        self.llm = LLM(**llm_kwargs)
+
+        # Initialize processor for prompt templating (needed for CHAT style)
+        self.processor = AutoProcessor.from_pretrained(
+            artifacts_path,
+            trust_remote_code=self.vlm_options.trust_remote_code,
+            revision=self.vlm_options.revision,
+        )
+
+        # --------- SamplingParams (runtime) ---------
+        self.sampling_params = SamplingParams(
+            temperature=self.temperature,
+            max_tokens=self.max_new_tokens,
+            stop=(self.vlm_options.stop_strings or None),
+            **gen_cfg,
+        )
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
+        # If disabled, pass-through
+        if not self.enabled:
+            for page in page_batch:
+                yield page
+            return
+
         page_list = list(page_batch)
         if not page_list:
             return
 
-
-
+        # Preserve original order
+        original_order = page_list[:]
 
+        # Separate valid/invalid
+        valid_pages: list[Page] = []
+        invalid_pages: list[Page] = []
         for page in page_list:
             assert page._backend is not None
-            if
-                invalid_pages.append(page)
-            else:
+            if page._backend.is_valid():
                 valid_pages.append(page)
+            else:
+                invalid_pages.append(page)
 
-        # Process valid pages in batch
         if valid_pages:
             with TimeRecorder(conv_res, "vlm"):
-
-
-
-                pages_with_images = []
+                images: list[Image] = []
+                user_prompts: list[str] = []
+                pages_with_images: list[Page] = []
 
                 for page in valid_pages:
                     assert page.size is not None
                     hi_res_image = page.get_image(
-                        scale=self.vlm_options.scale,
+                        scale=self.vlm_options.scale,
+                        max_size=self.vlm_options.max_size,
                     )
+                    if hi_res_image is None:
+                        continue
 
-
-                    if hi_res_image is not None:
-                        images.append(hi_res_image)
+                    images.append(hi_res_image)
 
-
-
-                        user_prompt = self.vlm_options.prompt(page.parsed_page)
-                    else:
-                        user_prompt = self.vlm_options.prompt
+                    # Define prompt structure
+                    user_prompt = self.vlm_options.build_prompt(page.parsed_page)
 
-
-
+                    user_prompts.append(user_prompt)
+                    pages_with_images.append(page)
 
-
-                if images:  # Only if we have valid images
+                if images:
                     predictions = list(self.process_images(images, user_prompts))
-
-                    # Attach results to pages
                     for page, prediction in zip(pages_with_images, predictions):
                         page.predictions.vlm_response = prediction
 
-        # Yield
-        for page in
-            yield page
-        for page in valid_pages:
+        # Yield in original order
+        for page in original_order:
             yield page
 
     def process_images(
@@ -156,50 +235,33 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         image_batch: Iterable[Union[Image, np.ndarray]],
         prompt: Union[str, list[str]],
     ) -> Iterable[VlmPrediction]:
-        """Process
-
-
-            image_batch: Iterable of PIL Images or numpy arrays
-            prompt: Either:
-                - str: Single prompt used for all images
-                - list[str]: List of prompts (one per image, must match image count)
+        """Process images in a single batched vLLM inference call."""
+        import numpy as np
+        from PIL import Image as PILImage
 
-
-            ValueError: If prompt list length doesn't match image count.
-        """
+        # -- Normalize images to RGB PIL
         pil_images: list[Image] = []
-
         for img in image_batch:
-            # Convert numpy array to PIL Image if needed
             if isinstance(img, np.ndarray):
-                if img.ndim == 3 and img.shape[2] in
-                    from PIL import Image as PILImage
-
+                if img.ndim == 3 and img.shape[2] in (3, 4):
                     pil_img = PILImage.fromarray(img.astype(np.uint8))
                 elif img.ndim == 2:
-                    from PIL import Image as PILImage
-
                     pil_img = PILImage.fromarray(img.astype(np.uint8), mode="L")
                 else:
                     raise ValueError(f"Unsupported numpy array shape: {img.shape}")
             else:
                 pil_img = img
-
-            # Ensure image is in RGB mode (handles RGBA, L, etc.)
             if pil_img.mode != "RGB":
                 pil_img = pil_img.convert("RGB")
-
             pil_images.append(pil_img)
 
-        if
+        if not pil_images:
             return
 
-        #
+        # Normalize prompts
         if isinstance(prompt, str):
-            # Single prompt for all images
             user_prompts = [prompt] * len(pil_images)
         elif isinstance(prompt, list):
-            # List of prompts (one per image)
             if len(prompt) != len(pil_images):
                 raise ValueError(
                     f"Number of prompts ({len(prompt)}) must match number of images ({len(pil_images)})"
@@ -208,28 +270,31 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         else:
             raise ValueError(f"prompt must be str or list[str], got {type(prompt)}")
 
-        # Format prompts
-        prompts: list[str] = [
-            self.formulate_prompt(user_prompt) for user_prompt in user_prompts
-        ]
+        # Format prompts
+        prompts: list[str] = [self.formulate_prompt(up) for up in user_prompts]
 
-        #
-        llm_inputs = [
-
-
+        # Build vLLM inputs
+        llm_inputs = [
+            {"prompt": p, "multi_modal_data": {"image": im}}
+            for p, im in zip(prompts, pil_images)
+        ]
 
+        # Generate
+        assert self.llm is not None and self.sampling_params is not None
         start_time = time.time()
         outputs = self.llm.generate(llm_inputs, sampling_params=self.sampling_params)  # type: ignore
         generation_time = time.time() - start_time
 
-        #
-        if
-
-
-                f"Generated {num_tokens} tokens in
-
+        # Optional debug
+        if outputs:
+            try:
+                num_tokens = len(outputs[0].outputs[0].token_ids)
+                _log.debug(f"Generated {num_tokens} tokens in {generation_time:.2f}s.")
+            except Exception:
+                pass
 
+        # Emit predictions
         for output in outputs:
-
-            decoded_text = self.vlm_options.decode_response(
+            text = output.outputs[0].text if output.outputs else ""
+            decoded_text = self.vlm_options.decode_response(text)
             yield VlmPrediction(text=decoded_text, generation_time=generation_time)
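The rewritten constructor above routes `vlm_options.extra_generation_config` through the two allowlists: engine keys go into the `vllm.LLM(...)` kwargs at load time, sampling keys go into the shared `vllm.SamplingParams(...)` used for every `generate()` call, and anything else is logged and dropped. A minimal sketch of what a caller could pass (the concrete values below are illustrative, not package defaults):

```python
# Illustrative only: keys are routed by _VLLM_ENGINE_KEYS / _VLLM_SAMPLING_KEYS above.
extra_generation_config = {
    # engine/load-time -> forwarded to vllm.LLM(...)
    "max_model_len": 8192,
    "gpu_memory_utilization": 0.5,
    "enforce_eager": True,
    # runtime -> forwarded to vllm.SamplingParams(...)
    "temperature": 0.0,
    "top_p": 0.9,
    "seed": 42,
    # unknown keys are ignored with a warning:
    # "Ignoring unknown extra_generation_config keys for vLLM: ['typo_key']"
    "typo_key": 1,
}
```

Engine keys take effect once when the model is loaded; sampling keys apply to every batched generation through the single `SamplingParams` instance built in `__init__`.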
docling/utils/api_image_request.py

@@ -1,13 +1,15 @@
 import base64
+import json
 import logging
 from io import BytesIO
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 import requests
 from PIL import Image
 from pydantic import AnyUrl
 
 from docling.datamodel.base_models import OpenAiApiResponse
+from docling.models.utils.generation_utils import GenerationStopper
 
 _log = logging.getLogger(__name__)
 
@@ -59,3 +61,107 @@ def api_image_request(
     api_resp = OpenAiApiResponse.model_validate_json(r.text)
     generated_text = api_resp.choices[0].message.content.strip()
     return generated_text
+
+
+def api_image_request_streaming(
+    image: Image.Image,
+    prompt: str,
+    url: AnyUrl,
+    *,
+    timeout: float = 20,
+    headers: Optional[Dict[str, str]] = None,
+    generation_stoppers: List[GenerationStopper] = [],
+    **params,
+) -> str:
+    """
+    Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).
+    Parses SSE lines: 'data: {json}\n\n', terminated by 'data: [DONE]'.
+    Accumulates text and calls stopper.should_stop(window) as chunks arrive.
+    If stopper triggers, the HTTP connection is closed to abort server-side generation.
+    """
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_b64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                },
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+
+    payload = {
+        "messages": messages,
+        "stream": True,  # <-- critical for SSE streaming
+        **params,
+    }
+
+    # Debug: Log the payload to verify temperature is included
+    _log.debug(f"API streaming request payload: {json.dumps(payload, indent=2)}")
+
+    # Some servers require Accept: text/event-stream for SSE.
+    # It's safe to set it; OpenAI-compatible servers tolerate it.
+    hdrs = {"Accept": "text/event-stream", **(headers or {})}
+
+    # Try to force temperature via header if server ignores payload parameter
+    if "temperature" in params:
+        hdrs["X-Temperature"] = str(params["temperature"])
+
+    # Stream the HTTP response
+    with requests.post(
+        str(url), headers=hdrs, json=payload, timeout=timeout, stream=True
+    ) as r:
+        if not r.ok:
+            _log.error(
+                f"Error calling the API {url} in streaming mode. Response was {r.text}"
+            )
+        r.raise_for_status()
+
+        full_text = []
+        for raw_line in r.iter_lines(decode_unicode=True):
+            if not raw_line:  # keep-alives / blank lines
+                continue
+            if not raw_line.startswith("data:"):
+                # Some proxies inject comments; ignore anything not starting with 'data:'
+                continue
+
+            data = raw_line[len("data:") :].strip()
+            if data == "[DONE]":
+                break
+
+            try:
+                obj = json.loads(data)
+            except json.JSONDecodeError:
+                _log.debug("Skipping non-JSON SSE chunk: %r", data[:200])
+                continue
+
+            # OpenAI-compatible delta format
+            # obj["choices"][0]["delta"]["content"] may be None or missing (e.g., tool calls)
+            try:
+                delta = obj["choices"][0].get("delta") or {}
+                piece = delta.get("content") or ""
+            except (KeyError, IndexError) as e:
+                _log.debug("Unexpected SSE chunk shape: %s", e)
+                piece = ""
+
+            if piece:
+                full_text.append(piece)
+                for stopper in generation_stoppers:
+                    # Respect stopper's lookback window. We use a simple string window which
+                    # works with the GenerationStopper interface.
+                    lookback = max(1, stopper.lookback_tokens())
+                    window = "".join(full_text)[-lookback:]
+                    if stopper.should_stop(window):
+                        # Break out of the loop cleanly. The context manager will handle
+                        # closing the connection when we exit the 'with' block.
+                        # vLLM/OpenAI-compatible servers will detect the client disconnect
+                        # and abort the request server-side.
+                        return "".join(full_text)
+
+    return "".join(full_text)
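The new streaming helper only relies on two methods of a stopper, `lookback_tokens()` and `should_stop(window)`, so a custom stopper can be as small as the sketch below. The class name and stop criterion are made up for illustration; the actual `GenerationStopper` base class lives in the new `docling/models/utils/generation_utils.py`, which is not shown in this diff.

```python
# Hypothetical stopper compatible with the calls api_image_request_streaming() makes.
class PhraseStopper:
    """Stops streaming once a given phrase appears in the recent output."""

    def __init__(self, phrase: str, lookback: int = 200):
        self._phrase = phrase
        self._lookback = lookback

    def lookback_tokens(self) -> int:
        # Size of the trailing text window the caller passes to should_stop().
        return self._lookback

    def should_stop(self, window: str) -> bool:
        return self._phrase in window


# Illustrative call; extra keyword arguments land in the request payload:
# text = api_image_request_streaming(
#     image, prompt, url,
#     generation_stoppers=[PhraseStopper("</doctag>")],
#     temperature=0.0, max_tokens=4096,
# )
```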
docling-2.55.0.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.53.0
+Version: 2.55.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
-Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.
+Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.2
 Requires-Dist: docling-parse<5.0.0,>=4.4.0
 Requires-Dist: docling-ibm-models<4,>=3.9.1
 Requires-Dist: filetype<2.0.0,>=1.2.0
@@ -37,7 +37,7 @@ Requires-Dist: requests<3.0.0,>=2.32.2
 Requires-Dist: easyocr<2.0,>=1.7
 Requires-Dist: certifi>=2024.7.4
 Requires-Dist: rtree<2.0.0,>=1.3.0
-Requires-Dist: typer<0.
+Requires-Dist: typer<0.20.0,>=0.12.5
 Requires-Dist: python-docx<2.0.0,>=1.1.2
 Requires-Dist: python-pptx<2.0.0,>=1.0.2
 Requires-Dist: beautifulsoup4<5.0.0,>=4.12.3
@@ -101,7 +101,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 
 ## Features
 
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
 * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -117,13 +117,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 📤 Structured [information extraction][extraction] \[🧪 beta\]
 * 📑 New layout model (**Heron**) by default, for faster PDF parsing
 * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
+* 💬 Parsing of Web Video Text Tracks (WebVTT) files
 
 ### Coming soon
 
 * 📝 Metadata extraction, including title, authors, references & language
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
-* 📝 Parsing of Web Video Text Tracks (WebVTT) files
 
 ## Installation
 