docling 2.45.0__py3-none-any.whl → 2.47.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/cli/main.py CHANGED
```diff
@@ -60,10 +60,12 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
+    GOT2_TRANSFORMERS,
     GRANITE_VISION_OLLAMA,
     GRANITE_VISION_TRANSFORMERS,
     SMOLDOCLING_MLX,
     SMOLDOCLING_TRANSFORMERS,
+    SMOLDOCLING_VLLM,
     VlmModelType,
 )
 from docling.document_converter import (
@@ -477,6 +479,13 @@ def convert( # noqa: C901
             "--logo", callback=logo_callback, is_eager=True, help="Docling logo"
         ),
     ] = None,
+    page_batch_size: Annotated[
+        int,
+        typer.Option(
+            "--page-batch-size",
+            help=f"Number of pages processed in one batch. Default: {settings.perf.page_batch_size}",
+        ),
+    ] = settings.perf.page_batch_size,
 ):
     log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
 
@@ -491,6 +500,7 @@ def convert( # noqa: C901
     settings.debug.visualize_layout = debug_visualize_layout
     settings.debug.visualize_tables = debug_visualize_tables
     settings.debug.visualize_ocr = debug_visualize_ocr
+    settings.perf.page_batch_size = page_batch_size
 
     if from_formats is None:
         from_formats = list(InputFormat)
@@ -631,6 +641,8 @@ def convert( # noqa: C901
             pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
         elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
             pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
+        elif vlm_model == VlmModelType.GOT_OCR_2:
+            pipeline_options.vlm_options = GOT2_TRANSFORMERS
         elif vlm_model == VlmModelType.SMOLDOCLING:
             pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
             if sys.platform == "darwin":
@@ -643,6 +655,8 @@ def convert( # noqa: C901
                         "To run SmolDocling faster, please install mlx-vlm:\n"
                         "pip install mlx-vlm"
                     )
+        elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
+            pipeline_options.vlm_options = SMOLDOCLING_VLLM
 
         pdf_format_option = PdfFormatOption(
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
```
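
The `main.py` changes add a `--page-batch-size` option that is written into `settings.perf.page_batch_size`, and wire the new `got_ocr_2` and `smoldocling_vllm` choices into the VLM pipeline selection. A minimal sketch of the equivalent Python API usage, assuming the `VlmPipelineOptions` / `VlmPipeline` import paths of current docling releases and a placeholder `sample.pdf`:

```python
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Equivalent of the new --page-batch-size CLI option: pages handled per batch.
settings.perf.page_batch_size = 8

# Use the newly exposed GOT-OCR 2.0 preset for the VLM pipeline.
pipeline_options = VlmPipelineOptions(vlm_options=vlm_model_specs.GOT2_TRANSFORMERS)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
result = converter.convert("sample.pdf")  # placeholder input file
print(result.document.export_to_markdown())
```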
docling/cli/models.py CHANGED
```diff
@@ -9,6 +9,7 @@ from rich.console import Console
 from rich.logging import RichHandler
 
 from docling.datamodel.settings import settings
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.model_downloader import download_models
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -128,6 +129,61 @@ def download(
         )
 
 
+@app.command("download-hf-repo")
+def download_hf_repo(
+    models: Annotated[
+        list[str],
+        typer.Argument(
+            help="Specific models to download from HuggingFace identified by their repo id. For example: ds4sd/docling-models .",
+        ),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            ...,
+            "-o",
+            "--output-dir",
+            help="The directory where to download the models.",
+        ),
+    ] = (settings.cache_dir / "models"),
+    force: Annotated[
+        bool, typer.Option(..., help="If true, the download will be forced.")
+    ] = False,
+    quiet: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "-q",
+            "--quiet",
+            help="No extra output is generated, the CLI prints only the directory with the cached models.",
+        ),
+    ] = False,
+):
+    if not quiet:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[blue]%(message)s[/blue]",
+            datefmt="[%X]",
+            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
+        )
+
+    for item in models:
+        typer.secho(f"\nDownloading {item} model from HuggingFace...")
+        download_hf_model(
+            repo_id=item,
+            # would be better to reuse "repo_cache_folder" property: https://github.com/docling-project/docling/blob/main/docling/datamodel/pipeline_options_vlm_model.py#L76
+            # but creating options objects seams like an overkill
+            local_dir=output_dir / item.replace("/", "--"),
+            force=force,
+            progress=(not quiet),
+        )
+
+    if quiet:
+        typer.echo(output_dir)
+    else:
+        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
+
+
 click_app = typer.main.get_command(app)
 
 if __name__ == "__main__":
```
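
The new `download-hf-repo` subcommand is a thin wrapper around `download_hf_model` for fetching arbitrary Hugging Face repos into the local model cache. A small sketch of the same call made directly from Python, mirroring the arguments and folder naming shown above (the repo id is the example from the CLI help text):

```python
from docling.datamodel.settings import settings
from docling.models.utils.hf_model_download import download_hf_model

repo_id = "ds4sd/docling-models"            # example repo id from the help text
output_dir = settings.cache_dir / "models"  # same default as -o/--output-dir

download_hf_model(
    repo_id=repo_id,
    # same cache layout as the CLI: one folder per repo, "/" replaced by "--"
    local_dir=output_dir / repo_id.replace("/", "--"),
    force=False,    # set True to re-download even if already present
    progress=True,  # equivalent to not passing -q/--quiet
)
```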
```diff
@@ -1,7 +1,7 @@
 import math
 from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 import numpy as np
 from docling_core.types.doc import (
@@ -282,6 +282,9 @@ class LayoutOptions(BaseModel):
     keep_empty_clusters: bool = (
         False  # Whether to keep clusters that contain no text cells
     )
+    skip_cell_assignment: bool = (
+        False  # Skip cell-to-cluster assignment for VLM-only processing
+    )
     model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
 
 
@@ -323,9 +326,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         ),
     )
 
-    generate_parsed_pages: Literal[True] = (
-        True  # Always True since parsed_page is now mandatory
-    )
+    generate_parsed_pages: bool = False
 
 
 class ProcessingPipeline(str, Enum):
```
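
Two behavioral notes on the hunks above (apparently `docling/datamodel/pipeline_options.py`): `generate_parsed_pages` is no longer pinned to `True` and now defaults to `False`, so code that relied on parsed pages being populated has to opt in explicitly, and `LayoutOptions` gains a `skip_cell_assignment` switch for VLM-only processing. A minimal sketch, assuming `PdfPipelineOptions` exposes a `layout_options` field as in recent releases:

```python
from docling.datamodel.pipeline_options import LayoutOptions, PdfPipelineOptions

# Opt back in to parsed pages now that the default flipped from True to False.
pipeline_options = PdfPipelineOptions(generate_parsed_pages=True)

# New layout switch: skip cell-to-cluster assignment for VLM-only processing.
pipeline_options.layout_options = LayoutOptions(skip_cell_assignment=True)
```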
```diff
@@ -26,11 +26,14 @@ class ResponseFormat(str, Enum):
     DOCTAGS = "doctags"
     MARKDOWN = "markdown"
     HTML = "html"
+    OTSL = "otsl"
+    PLAINTEXT = "plaintext"
 
 
 class InferenceFramework(str, Enum):
     MLX = "mlx"
     TRANSFORMERS = "transformers"
+    VLLM = "vllm"
 
 
 class TransformersModelType(str, Enum):
@@ -43,6 +46,7 @@ class TransformersModelType(str, Enum):
 class TransformersPromptStyle(str, Enum):
     CHAT = "chat"
     RAW = "raw"
+    NONE = "none"
 
 
 class InlineVlmOptions(BaseVlmOptions):
@@ -68,6 +72,7 @@ class InlineVlmOptions(BaseVlmOptions):
 
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
+    extra_processor_kwargs: Dict[str, Any] = {}
 
     use_kv_cache: bool = True
     max_new_tokens: int = 4096
```
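
These hunks (apparently `docling/datamodel/pipeline_options_vlm_model.py`) add `OTSL` and `PLAINTEXT` response formats, a `VLLM` inference framework, a `NONE` prompt style, and an `extra_processor_kwargs` passthrough on `InlineVlmOptions`. A hedged sketch of a custom model spec exercising the new options; the repo id is a placeholder and the `AcceleratorDevice` import path is assumed from the rest of the codebase:

```python
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersModelType,
    TransformersPromptStyle,
)

# Hypothetical spec: plain-text output, no chat templating, and extra kwargs
# forwarded to the Hugging Face processor via the new extra_processor_kwargs field.
MY_PLAINTEXT_OCR = InlineVlmOptions(
    repo_id="my-org/my-ocr-model",  # placeholder repo id
    prompt="",
    response_format=ResponseFormat.PLAINTEXT,
    inference_framework=InferenceFramework.TRANSFORMERS,  # InferenceFramework.VLLM is now also available
    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
    transformers_prompt_style=TransformersPromptStyle.NONE,
    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
    scale=2.0,
    temperature=0.0,
    extra_processor_kwargs={"format": True},
)
```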
```diff
@@ -12,6 +12,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     ResponseFormat,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 
 _log = logging.getLogger(__name__)
@@ -26,6 +27,7 @@ SMOLDOCLING_MLX = InlineVlmOptions(
     supported_devices=[AcceleratorDevice.MPS],
     scale=2.0,
     temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
 )
 
 SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
@@ -33,16 +35,74 @@ SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
     inference_framework=InferenceFramework.TRANSFORMERS,
-    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
     supported_devices=[
         AcceleratorDevice.CPU,
         AcceleratorDevice.CUDA,
+    ],
+    torch_dtype="bfloat16",
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
+)
+
+SMOLDOCLING_VLLM = InlineVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
+)
+
+# SmolVLM-256M-Instruct
+SMOLVLM256_TRANSFORMERS = InlineVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
+    prompt="Transcribe this image to plain text.",
+    response_format=ResponseFormat.PLAINTEXT,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    torch_dtype="bfloat16",
+    scale=2.0,
+    temperature=0.0,
+)
+
+# SmolVLM2-2.2b-Instruct
+SMOLVLM256_MLX = InlineVlmOptions(
+    repo_id="moot20/SmolVLM-256M-Instruct-MLX",
+    prompt="Extract the text.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
         AcceleratorDevice.MPS,
     ],
     scale=2.0,
     temperature=0.0,
 )
 
+SMOLVLM256_VLLM = InlineVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
+    prompt="Transcribe this image to plain text.",
+    response_format=ResponseFormat.PLAINTEXT,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
+
 # GraniteVision
 GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
     repo_id="ibm-granite/granite-vision-3.2-2b",
@@ -59,6 +119,18 @@ GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
     temperature=0.0,
 )
 
+GRANITE_VISION_VLLM = InlineVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.2-2b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
 GRANITE_VISION_OLLAMA = ApiVlmOptions(
     url=AnyUrl("http://localhost:11434/v1/chat/completions"),
     params={"model": "granite3.2-vision:2b"},
@@ -116,6 +188,26 @@ QWEN25_VL_3B_MLX = InlineVlmOptions(
     temperature=0.0,
 )
 
+# GoT 2.0
+GOT2_TRANSFORMERS = InlineVlmOptions(
+    repo_id="stepfun-ai/GOT-OCR-2.0-hf",
+    prompt="",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_prompt_style=TransformersPromptStyle.NONE,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["<|im_end|>"],
+    extra_processor_kwargs={"format": True},
+)
+
+
 # Gemma-3
 GEMMA3_12B_MLX = InlineVlmOptions(
     repo_id="mlx-community/gemma-3-12b-it-bf16",
@@ -137,8 +229,29 @@ GEMMA3_27B_MLX = InlineVlmOptions(
     temperature=0.0,
 )
 
+# Dolphin
+
+DOLPHIN_TRANSFORMERS = InlineVlmOptions(
+    repo_id="ByteDance/Dolphin",
+    prompt="<s>Read text in the image. <Answer/>",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    transformers_prompt_style=TransformersPromptStyle.RAW,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
 
 class VlmModelType(str, Enum):
     SMOLDOCLING = "smoldocling"
+    SMOLDOCLING_VLLM = "smoldocling_vllm"
     GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_VLLM = "granite_vision_vllm"
     GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+    GOT_OCR_2 = "got_ocr_2"
```
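
The spec additions above (apparently `docling/datamodel/vlm_model_specs.py`) register vLLM variants of SmolDocling, SmolVLM-256M and Granite Vision, plus GOT-OCR 2.0 and Dolphin presets, and expose three new `VlmModelType` values. A brief sketch of picking one of the vLLM presets for the VLM pipeline; both declare CUDA-only support, so a GPU and the `vllm` package are assumed:

```python
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.vlm_model_specs import GRANITE_VISION_VLLM, SMOLDOCLING_VLLM

# DocTags output from the vLLM-backed SmolDocling preset.
pipeline_options = VlmPipelineOptions(vlm_options=SMOLDOCLING_VLLM)

# Alternatively, Markdown output from Granite Vision served through vLLM:
# pipeline_options = VlmPipelineOptions(vlm_options=GRANITE_VISION_VLLM)
```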
```diff
@@ -1,13 +1,24 @@
+import logging
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
-from typing import Generic, Optional, Protocol, Type
+from typing import Any, Generic, Optional, Protocol, Type, Union
 
+import numpy as np
 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
+from PIL.Image import Image
 from typing_extensions import TypeVar
 
-from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
+from docling.datamodel.base_models import (
+    ItemAndImageEnrichmentElement,
+    Page,
+    VlmPrediction,
+)
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import BaseOptions
+from docling.datamodel.pipeline_options_vlm_model import (
+    InlineVlmOptions,
+    TransformersPromptStyle,
+)
 from docling.datamodel.settings import settings
 
 
@@ -26,6 +37,88 @@ class BasePageModel(ABC):
         pass
 
 
+class BaseVlmModel(ABC):
+    """Base class for Vision-Language Models that adds image processing capability."""
+
+    @abstractmethod
+    def process_images(
+        self,
+        image_batch: Iterable[Union[Image, np.ndarray]],
+        prompt: Union[str, list[str]],
+    ) -> Iterable[VlmPrediction]:
+        """Process raw images without page metadata.
+
+        Args:
+            image_batch: Iterable of PIL Images or numpy arrays
+            prompt: Either:
+                - str: Single prompt used for all images
+                - list[str]: List of prompts (one per image, must match image count)
+
+        Raises:
+            ValueError: If prompt list length doesn't match image count.
+        """
+
+
+class BaseVlmPageModel(BasePageModel, BaseVlmModel):
+    """Base implementation for VLM models that inherit from BasePageModel.
+
+    Provides a default __call__ implementation that extracts images from pages,
+    processes them using process_images, and attaches results back to pages.
+    """
+
+    # Type annotations for attributes that subclasses must initialize
+    vlm_options: InlineVlmOptions
+    processor: Any
+
+    @abstractmethod
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        """Extract images from pages, process them, and attach results back."""
+
+    def formulate_prompt(self, user_prompt: str) -> str:
+        """Formulate a prompt for the VLM."""
+        _log = logging.getLogger(__name__)
+
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+            _log.debug("Using specialized prompt for Phi-4")
+            # Note: This might need adjustment for VLLM vs transformers
+            user_prompt_prefix = "<|user|>"
+            assistant_prompt = "<|assistant|>"
+            prompt_suffix = "<|end|>"
+
+            prompt = f"{user_prompt_prefix}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
+            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
+
+            return prompt
+
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=True
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Unknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
+        )
+
+
 EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
 
 
```
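
The new `BaseVlmModel` / `BaseVlmPageModel` abstractions separate raw image inference (`process_images`) from page-level orchestration (`__call__`) and centralize prompt construction in `formulate_prompt`. A rough sketch of what a subclass might look like, assuming the module lives at `docling.models.base_model` and that `VlmPrediction`, `Page.get_image`, and `page.predictions.vlm_response` behave as in the existing page models; the echo-style "inference" is a placeholder, not a real backend:

```python
from collections.abc import Iterable
from typing import Union

import numpy as np
from PIL.Image import Image

from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.models.base_model import BaseVlmPageModel  # assumed module path


class EchoVlmModel(BaseVlmPageModel):
    """Toy subclass: emits a fixed string per image instead of running a model."""

    def __init__(self, vlm_options):
        self.vlm_options = vlm_options
        self.processor = None  # a real subclass would hold the HF processor here

    def process_images(
        self,
        image_batch: Iterable[Union[Image, np.ndarray]],
        prompt: Union[str, list[str]],
    ) -> Iterable[VlmPrediction]:
        images = list(image_batch)
        # A single string prompt applies to every image; a list must match the batch.
        prompts = [prompt] * len(images) if isinstance(prompt, str) else list(prompt)
        if len(prompts) != len(images):
            raise ValueError("prompt list length must match image count")
        for p in prompts:
            yield VlmPrediction(text=f"[echo] {p}")

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            image = page.get_image(scale=self.vlm_options.scale)
            if image is not None:
                prediction = next(
                    iter(self.process_images([image], self.vlm_options.prompt))
                )
                page.predictions.vlm_response = prediction
            yield page
```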