docling 2.35.0__py3-none-any.whl → 2.36.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
- docling/backend/xml/jats_backend.py +0 -0
- docling/cli/main.py +12 -15
- docling/datamodel/accelerator_options.py +68 -0
- docling/datamodel/base_models.py +10 -8
- docling/datamodel/pipeline_options.py +29 -161
- docling/datamodel/pipeline_options_vlm_model.py +81 -0
- docling/datamodel/vlm_model_specs.py +144 -0
- docling/document_converter.py +5 -0
- docling/models/api_vlm_model.py +1 -1
- docling/models/base_ocr_model.py +2 -1
- docling/models/code_formula_model.py +6 -11
- docling/models/document_picture_classifier.py +6 -11
- docling/models/easyocr_model.py +1 -2
- docling/models/layout_model.py +6 -11
- docling/models/ocr_mac_model.py +1 -1
- docling/models/picture_description_api_model.py +1 -1
- docling/models/picture_description_base_model.py +1 -1
- docling/models/picture_description_vlm_model.py +7 -22
- docling/models/rapid_ocr_model.py +1 -2
- docling/models/table_structure_model.py +6 -12
- docling/models/tesseract_ocr_cli_model.py +1 -1
- docling/models/tesseract_ocr_model.py +1 -1
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/hf_model_download.py +40 -0
- docling/models/vlm_models_inline/__init__.py +0 -0
- docling/models/vlm_models_inline/hf_transformers_model.py +194 -0
- docling/models/{hf_mlx_model.py → vlm_models_inline/mlx_model.py} +56 -44
- docling/pipeline/vlm_pipeline.py +228 -61
- docling/utils/accelerator_utils.py +17 -2
- docling/utils/model_downloader.py +13 -12
- {docling-2.35.0.dist-info → docling-2.36.1.dist-info}/METADATA +53 -55
- {docling-2.35.0.dist-info → docling-2.36.1.dist-info}/RECORD +46 -39
- {docling-2.35.0.dist-info → docling-2.36.1.dist-info}/WHEEL +2 -1
- docling-2.36.1.dist-info/entry_points.txt +6 -0
- docling-2.36.1.dist-info/top_level.txt +1 -0
- docling/models/hf_vlm_model.py +0 -182
- docling-2.35.0.dist-info/entry_points.txt +0 -7
- {docling-2.35.0.dist-info → docling-2.36.1.dist-info/licenses}/LICENSE +0 -0
docling/cli/main.py
CHANGED
@@ -28,6 +28,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -36,8 +37,6 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
-    AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
@@ -45,14 +44,16 @@ from docling.datamodel.pipeline_options import (
     PdfPipeline,
     PdfPipelineOptions,
     TableFormerMode,
-    VlmModelType,
     VlmPipelineOptions,
-    granite_vision_vlm_conversion_options,
-    granite_vision_vlm_ollama_conversion_options,
-    smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
+from docling.datamodel.vlm_model_specs import (
+    GRANITE_VISION_OLLAMA,
+    GRANITE_VISION_TRANSFORMERS,
+    SMOLDOCLING_MLX,
+    SMOLDOCLING_TRANSFORMERS,
+    VlmModelType,
+)
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -579,20 +580,16 @@ def convert(  # noqa: C901
         )

         if vlm_model == VlmModelType.GRANITE_VISION:
-            pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+            pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
         elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-            pipeline_options.vlm_options = (
-                granite_vision_vlm_ollama_conversion_options
-            )
+            pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
         elif vlm_model == VlmModelType.SMOLDOCLING:
-            pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+            pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
             if sys.platform == "darwin":
                 try:
                     import mlx_vlm

-                    pipeline_options.vlm_options = (
-                        smoldocling_vlm_mlx_conversion_options
-                    )
+                    pipeline_options.vlm_options = SMOLDOCLING_MLX
                 except ImportError:
                     _log.warning(
                         "To run SmolDocling faster, please install mlx-vlm:\n"
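
Note: the CLI now assigns the uppercase spec constants from docling.datamodel.vlm_model_specs instead of the old lowercase option objects. A minimal sketch of the equivalent programmatic setup, assuming the 2.36.x API shown in this diff ("page.pdf" is a placeholder input):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import VlmPipelineOptions
    from docling.datamodel.vlm_model_specs import SMOLDOCLING_TRANSFORMERS
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    # Mirror what `docling --pipeline vlm --vlm-model smoldocling` now does internally.
    pipeline_options = VlmPipelineOptions(vlm_options=SMOLDOCLING_TRANSFORMERS)
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
            )
        }
    )
    result = converter.convert("page.pdf")
    print(result.document.export_to_markdown())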
docling/datamodel/accelerator_options.py
ADDED
@@ -0,0 +1,68 @@
+import logging
+import os
+import re
+from enum import Enum
+from typing import Any, Union
+
+from pydantic import field_validator, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+_log = logging.getLogger(__name__)
+
+
+class AcceleratorDevice(str, Enum):
+    """Devices to run model inference"""
+
+    AUTO = "auto"
+    CPU = "cpu"
+    CUDA = "cuda"
+    MPS = "mps"
+
+
+class AcceleratorOptions(BaseSettings):
+    model_config = SettingsConfigDict(
+        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
+    )
+
+    num_threads: int = 4
+    device: Union[str, AcceleratorDevice] = "auto"
+    cuda_use_flash_attention2: bool = False
+
+    @field_validator("device")
+    def validate_device(cls, value):
+        # "auto", "cpu", "cuda", "mps", or "cuda:N"
+        if value in {d.value for d in AcceleratorDevice} or re.match(
+            r"^cuda(:\d+)?$", value
+        ):
+            return value
+        raise ValueError(
+            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
+        )
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_alternative_envvars(cls, data: Any) -> Any:
+        r"""
+        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
+        The alternative envvar is used only if it is valid and the regular envvar is not set.
+
+        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
+        the same functionality. In case the alias envvar is set and the user tries to override the
+        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
+        as an extra input instead of simply overwriting the envvar value for that parameter.
+        """
+        if isinstance(data, dict):
+            input_num_threads = data.get("num_threads")
+            # Check if to set the num_threads from the alternative envvar
+            if input_num_threads is None:
+                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
+                omp_num_threads = os.getenv("OMP_NUM_THREADS")
+                if docling_num_threads is None and omp_num_threads is not None:
+                    try:
+                        data["num_threads"] = int(omp_num_threads)
+                    except ValueError:
+                        _log.error(
+                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
+                            omp_num_threads,
+                        )
+        return data
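
Note: AcceleratorOptions is a pydantic-settings model, so it can be driven entirely by environment variables. A sketch of the precedence implemented above, assuming neither variable is set beforehand (values are illustrative):

    import os

    from docling.datamodel.accelerator_options import AcceleratorOptions

    os.environ["OMP_NUM_THREADS"] = "8"            # the "alternative" envvar
    assert AcceleratorOptions().num_threads == 8   # used while DOCLING_NUM_THREADS is unset

    os.environ["DOCLING_NUM_THREADS"] = "2"        # the regular envvar wins once present
    assert AcceleratorOptions().num_threads == 2

    AcceleratorOptions(device="cuda:1")            # accepted by validate_device
    # AcceleratorOptions(device="gpu")             # would raise ValueError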
docling/datamodel/base_models.py
CHANGED
@@ -13,11 +13,11 @@ from docling_core.types.doc import (
     TableCell,
 )
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-
-# DO NOT REMOVE; explicitly exposed from this location
 from docling_core.types.io import (
     DocumentStream,
 )
+
+# DO NOT REMOVE; explicitly exposed from this location
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict, Field, computed_field

@@ -131,12 +131,6 @@ class ErrorItem(BaseModel):
     error_message: str


-# class Cell(BaseModel):
-#     id: int
-#     text: str
-#     bbox: BoundingBox
-
-
 class Cluster(BaseModel):
     id: int
     label: DocItemLabel
@@ -158,8 +152,16 @@ class LayoutPrediction(BaseModel):
     clusters: List[Cluster] = []


+class VlmPredictionToken(BaseModel):
+    text: str = ""
+    token: int = -1
+    logprob: float = -1
+
+
 class VlmPrediction(BaseModel):
     text: str = ""
+    generated_tokens: list[VlmPredictionToken] = []
+    generation_time: float = -1


 class ContainerElement(
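
Note: VlmPrediction now carries optional per-token metadata and a generation time (both default to -1 sentinels when unset). An illustrative sketch with made-up values:

    from docling.datamodel.base_models import VlmPrediction, VlmPredictionToken

    pred = VlmPrediction(
        text="<doctag>...</doctag>",
        generated_tokens=[
            VlmPredictionToken(text="<doctag>", token=101, logprob=-0.12),
        ],
        generation_time=1.7,  # wall-clock time; stays -1 when not measured
    )
    print(len(pred.generated_tokens), pred.generation_time)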
docling/datamodel/pipeline_options.py
CHANGED
@@ -1,6 +1,4 @@
 import logging
-import os
-import re
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -10,71 +8,26 @@ from pydantic import (
     BaseModel,
     ConfigDict,
     Field,
-    field_validator,
-    model_validator,
 )
-from pydantic_settings import BaseSettings, SettingsConfigDict
 from typing_extensions import deprecated

-_log = logging.getLogger(__name__)
-
-
-class AcceleratorDevice(str, Enum):
-    """Devices to run model inference"""
-
-    AUTO = "auto"
-    CPU = "cpu"
-    CUDA = "cuda"
-    MPS = "mps"
-
-
-class AcceleratorOptions(BaseSettings):
-    model_config = SettingsConfigDict(
-        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
-    )
+# Import the following for backwards compatibility
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options_vlm_model import (
+    ApiVlmOptions,
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+)
+from docling.datamodel.vlm_model_specs import (
+    GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
+    GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
+    SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
+    SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
+    VlmModelType,
+)

-    num_threads: int = 4
-    device: Union[str, AcceleratorDevice] = "auto"
-    cuda_use_flash_attention2: bool = False
-
-    @field_validator("device")
-    def validate_device(cls, value):
-        # "auto", "cpu", "cuda", "mps", or "cuda:N"
-        if value in {d.value for d in AcceleratorDevice} or re.match(
-            r"^cuda(:\d+)?$", value
-        ):
-            return value
-        raise ValueError(
-            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
-        )
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_alternative_envvars(cls, data: Any) -> Any:
-        r"""
-        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
-        The alternative envvar is used only if it is valid and the regular envvar is not set.
-
-        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
-        the same functionality. In case the alias envvar is set and the user tries to override the
-        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
-        as an extra input instead of simply overwriting the envvar value for that parameter.
-        """
-        if isinstance(data, dict):
-            input_num_threads = data.get("num_threads")
-            # Check if to set the num_threads from the alternative envvar
-            if input_num_threads is None:
-                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
-                omp_num_threads = os.getenv("OMP_NUM_THREADS")
-                if docling_num_threads is None and omp_num_threads is not None:
-                    try:
-                        data["num_threads"] = int(omp_num_threads)
-                    except ValueError:
-                        _log.error(
-                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
-                            omp_num_threads,
-                        )
-        return data
+_log = logging.getLogger(__name__)


 class BaseOptions(BaseModel):
@@ -121,24 +74,22 @@ class RapidOcrOptions(OcrOptions):
     lang: List[str] = [
         "english",
         "chinese",
-    ]  # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
-    # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
+    ]
+    # However, language as a parameter is not supported by rapidocr yet
+    # and hence changing this options doesn't affect anything.
+
+    # For more details on supported languages by RapidOCR visit
+    # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
+
+    # For more details on the following options visit
+    # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/

-    # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
     text_score: float = 0.5  # same default as rapidocr

     use_det: Optional[bool] = None  # same default as rapidocr
     use_cls: Optional[bool] = None  # same default as rapidocr
     use_rec: Optional[bool] = None  # same default as rapidocr

-    # class Device(Enum):
-    #     CPU = "CPU"
-    #     CUDA = "CUDA"
-    #     DIRECTML = "DIRECTML"
-    #     AUTO = "AUTO"
-
-    # device: Device = Device.AUTO  # Default value is AUTO
-
     print_verbose: bool = False  # same default as rapidocr

     det_model_path: Optional[str] = None  # same default as rapidocr
@@ -244,101 +195,18 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
     return self.repo_id.replace("/", "--")


+# SmolVLM
 smolvlm_picture_description = PictureDescriptionVlmOptions(
     repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
 )
-
+
+# GraniteVision
 granite_picture_description = PictureDescriptionVlmOptions(
     repo_id="ibm-granite/granite-vision-3.1-2b-preview",
     prompt="What is shown in this image?",
 )


-class BaseVlmOptions(BaseModel):
-    kind: str
-    prompt: str
-
-
-class ResponseFormat(str, Enum):
-    DOCTAGS = "doctags"
-    MARKDOWN = "markdown"
-
-
-class InferenceFramework(str, Enum):
-    MLX = "mlx"
-    TRANSFORMERS = "transformers"
-    OPENAI = "openai"
-
-
-class HuggingFaceVlmOptions(BaseVlmOptions):
-    kind: Literal["hf_model_options"] = "hf_model_options"
-
-    repo_id: str
-    load_in_8bit: bool = True
-    llm_int8_threshold: float = 6.0
-    quantized: bool = False
-
-    inference_framework: InferenceFramework
-    response_format: ResponseFormat
-
-    @property
-    def repo_cache_folder(self) -> str:
-        return self.repo_id.replace("/", "--")
-
-
-class ApiVlmOptions(BaseVlmOptions):
-    kind: Literal["api_model_options"] = "api_model_options"
-
-    url: AnyUrl = AnyUrl(
-        "http://localhost:11434/v1/chat/completions"
-    )  # Default to ollama
-    headers: Dict[str, str] = {}
-    params: Dict[str, Any] = {}
-    scale: float = 2.0
-    timeout: float = 60
-    concurrency: int = 1
-    response_format: ResponseFormat
-
-
-smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
-    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
-    prompt="Convert this page to docling.",
-    response_format=ResponseFormat.DOCTAGS,
-    inference_framework=InferenceFramework.MLX,
-)
-
-
-smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="ds4sd/SmolDocling-256M-preview",
-    prompt="Convert this page to docling.",
-    response_format=ResponseFormat.DOCTAGS,
-    inference_framework=InferenceFramework.TRANSFORMERS,
-)
-
-granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
-    # prompt="OCR the full page to markdown.",
-    prompt="OCR this image.",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS,
-)
-
-granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
-    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
-    params={"model": "granite3.2-vision:2b"},
-    prompt="OCR the full page to markdown.",
-    scale=1.0,
-    timeout=120,
-    response_format=ResponseFormat.MARKDOWN,
-)
-
-
-class VlmModelType(str, Enum):
-    SMOLDOCLING = "smoldocling"
-    GRANITE_VISION = "granite_vision"
-    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
-
-
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
@@ -387,7 +255,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
         smoldocling_vlm_conversion_options
     )
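
Note: the module keeps the old names importable by aliasing the relocated objects, so existing code does not break. A minimal sketch showing that both paths resolve to the same object:

    from docling.datamodel.pipeline_options import (  # backwards-compatible path
        granite_vision_vlm_conversion_options,
    )
    from docling.datamodel.vlm_model_specs import (  # new canonical location
        GRANITE_VISION_TRANSFORMERS,
    )

    assert granite_vision_vlm_conversion_options is GRANITE_VISION_TRANSFORMERS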
docling/datamodel/pipeline_options_vlm_model.py
ADDED
@@ -0,0 +1,81 @@
+from enum import Enum
+from typing import Any, Dict, List, Literal
+
+from pydantic import AnyUrl, BaseModel
+from typing_extensions import deprecated
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+
+
+class BaseVlmOptions(BaseModel):
+    kind: str
+    prompt: str
+
+
+class ResponseFormat(str, Enum):
+    DOCTAGS = "doctags"
+    MARKDOWN = "markdown"
+    HTML = "html"
+
+
+class InferenceFramework(str, Enum):
+    MLX = "mlx"
+    TRANSFORMERS = "transformers"
+
+
+class TransformersModelType(str, Enum):
+    AUTOMODEL = "automodel"
+    AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
+    AUTOMODEL_CAUSALLM = "automodel-causallm"
+
+
+class InlineVlmOptions(BaseVlmOptions):
+    kind: Literal["inline_model_options"] = "inline_model_options"
+
+    repo_id: str
+    trust_remote_code: bool = False
+    load_in_8bit: bool = True
+    llm_int8_threshold: float = 6.0
+    quantized: bool = False
+
+    inference_framework: InferenceFramework
+    transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    response_format: ResponseFormat
+
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    scale: float = 2.0
+
+    temperature: float = 0.0
+    stop_strings: List[str] = []
+    extra_generation_config: Dict[str, Any] = {}
+
+    use_kv_cache: bool = True
+    max_new_tokens: int = 4096
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+@deprecated("Use InlineVlmOptions instead.")
+class HuggingFaceVlmOptions(InlineVlmOptions):
+    pass
+
+
+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"
+
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    scale: float = 2.0
+    timeout: float = 60
+    concurrency: int = 1
+    response_format: ResponseFormat
docling/datamodel/vlm_model_specs.py
ADDED
@@ -0,0 +1,144 @@
+import logging
+from enum import Enum
+
+from pydantic import (
+    AnyUrl,
+)
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_vlm_model import (
+    ApiVlmOptions,
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+    TransformersModelType,
+)
+
+_log = logging.getLogger(__name__)
+
+
+# SmolDocling
+SMOLDOCLING_MLX = InlineVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
+# GraniteVision
+GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.2-2b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
+GRANITE_VISION_OLLAMA = ApiVlmOptions(
+    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
+    params={"model": "granite3.2-vision:2b"},
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    scale=1.0,
+    timeout=120,
+    response_format=ResponseFormat.MARKDOWN,
+    temperature=0.0,
+)
+
+# Pixtral
+PIXTRAL_12B_TRANSFORMERS = InlineVlmOptions(
+    repo_id="mistral-community/pixtral-12b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
+    scale=2.0,
+    temperature=0.0,
+)
+
+PIXTRAL_12B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/pixtral-12b-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+# Phi4
+PHI4_TRANSFORMERS = InlineVlmOptions(
+    repo_id="microsoft/Phi-4-multimodal-instruct",
+    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
+    trust_remote_code=True,
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_CAUSALLM,
+    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
+    scale=2.0,
+    temperature=0.0,
+    extra_generation_config=dict(num_logits_to_keep=0),
+)
+
+# Qwen
+QWEN25_VL_3B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+# Gemma-3
+GEMMA3_12B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/gemma-3-12b-it-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+GEMMA3_27B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/gemma-3-27b-it-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+
+class VlmModelType(str, Enum):
+    SMOLDOCLING = "smoldocling"
+    GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
docling/document_converter.py
CHANGED
@@ -186,6 +186,11 @@ class DocumentConverter:
         Tuple[Type[BasePipeline], str], BasePipeline
     ] = {}

+    def _get_initialized_pipelines(
+        self,
+    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
+        return self.initialized_pipelines
+
     def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
         """Generate a hash of pipeline options to use as part of the cache key."""
         options_str = str(pipeline_options.model_dump())
docling/models/api_vlm_model.py
CHANGED
@@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import ApiVlmOptions
+from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
 from docling.exceptions import OperationNotAllowed
 from docling.models.base_model import BasePageModel
 from docling.utils.api_image_request import api_image_request
docling/models/base_ocr_model.py
CHANGED
@@ -11,9 +11,10 @@ from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label

+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
+from docling.datamodel.pipeline_options import OcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BaseModelWithOptions, BasePageModel