docling 2.37.0__py3-none-any.whl → 2.38.1__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
docling/backend/noop_backend.py ADDED
@@ -0,0 +1,51 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class NoOpBackend(AbstractDocumentBackend):
+    """
+    A no-op backend that only validates input existence.
+    Used e.g. for audio files where actual processing is handled by the ASR pipeline.
+    """
+
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
+        _log.debug(f"NoOpBackend initialized for: {path_or_stream}")
+
+        # Validate input
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                # Check if stream has content
+                self.valid = len(self.path_or_stream.getvalue()) > 0
+                _log.debug(
+                    f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
+                )
+            elif isinstance(self.path_or_stream, Path):
+                # Check if file exists
+                self.valid = self.path_or_stream.exists()
+                _log.debug(f"File exists: {self.valid}")
+            else:
+                self.valid = False
+        except Exception as e:
+            _log.error(f"NoOpBackend validation failed: {e}")
+            self.valid = False
+
+    def is_valid(self) -> bool:
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return set(InputFormat)
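
The new backend deliberately skips parsing: it only marks the input valid so the pipeline (for audio, the ASR pipeline) can take over. Its capabilities can be checked without constructing a document; a minimal sketch using only the classmethods shown above:

from docling.backend.noop_backend import NoOpBackend
from docling.datamodel.base_models import InputFormat

# The no-op backend claims every input format and no pagination;
# actual decoding is deferred to the selected pipeline.
assert NoOpBackend.supports_pagination() is False
assert InputFormat.AUDIO in NoOpBackend.supported_formats()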
docling/cli/main.py CHANGED
@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_BASE,
+    WHISPER_LARGE,
+    WHISPER_MEDIUM,
+    WHISPER_SMALL,
+    WHISPER_TINY,
+    WHISPER_TURBO,
+    AsrModelType,
+)
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
     PdfBackend,
-    PdfPipeline,
     PdfPipelineOptions,
+    PipelineOptions,
+    ProcessingPipeline,
     TableFormerMode,
     VlmPipelineOptions,
 )
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
     SMOLDOCLING_TRANSFORMERS,
     VlmModelType,
 )
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.document_converter import (
+    AudioFormatOption,
+    DocumentConverter,
+    FormatOption,
+    PdfFormatOption,
+)
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +313,17 @@ def convert( # noqa: C901
         ),
     ] = ImageRefMode.EMBEDDED,
     pipeline: Annotated[
-        PdfPipeline,
+        ProcessingPipeline,
         typer.Option(..., help="Choose the pipeline to process PDF or image files."),
-    ] = PdfPipeline.STANDARD,
+    ] = ProcessingPipeline.STANDARD,
     vlm_model: Annotated[
         VlmModelType,
         typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
    ] = VlmModelType.SMOLDOCLING,
+    asr_model: Annotated[
+        AsrModelType,
+        typer.Option(..., help="Choose the ASR model to use with audio/video files."),
+    ] = AsrModelType.WHISPER_TINY,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -450,12 +471,14 @@ def convert( # noqa: C901
         ),
     ] = None,
 ):
+    log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
+
     if verbose == 0:
-        logging.basicConfig(level=logging.WARNING)
+        logging.basicConfig(level=logging.WARNING, format=log_format)
     elif verbose == 1:
-        logging.basicConfig(level=logging.INFO)
+        logging.basicConfig(level=logging.INFO, format=log_format)
     else:
-        logging.basicConfig(level=logging.DEBUG)
+        logging.basicConfig(level=logging.DEBUG, format=log_format)
 
     settings.debug.visualize_cells = debug_visualize_cells
     settings.debug.visualize_layout = debug_visualize_layout
@@ -530,9 +553,12 @@ def convert( # noqa: C901
         ocr_options.lang = ocr_lang_list
 
     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-    pipeline_options: PaginatedPipelineOptions
+    # pipeline_options: PaginatedPipelineOptions
+    pipeline_options: PipelineOptions
+
+    format_options: Dict[InputFormat, FormatOption] = {}
 
-    if pipeline == PdfPipeline.STANDARD:
+    if pipeline == ProcessingPipeline.STANDARD:
        pipeline_options = PdfPipelineOptions(
             allow_external_plugins=allow_external_plugins,
             enable_remote_services=enable_remote_services,
@@ -574,7 +600,13 @@ def convert( # noqa: C901
             pipeline_options=pipeline_options,
             backend=backend,  # pdf_backend
         )
-    elif pipeline == PdfPipeline.VLM:
+
+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+    elif pipeline == ProcessingPipeline.VLM:
         pipeline_options = VlmPipelineOptions(
             enable_remote_services=enable_remote_services,
         )
@@ -600,13 +632,48 @@ def convert( # noqa: C901
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
 
+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+    elif pipeline == ProcessingPipeline.ASR:
+        pipeline_options = AsrPipelineOptions(
+            # enable_remote_services=enable_remote_services,
+            # artifacts_path = artifacts_path
+        )
+
+        if asr_model == AsrModelType.WHISPER_TINY:
+            pipeline_options.asr_options = WHISPER_TINY
+        elif asr_model == AsrModelType.WHISPER_SMALL:
+            pipeline_options.asr_options = WHISPER_SMALL
+        elif asr_model == AsrModelType.WHISPER_MEDIUM:
+            pipeline_options.asr_options = WHISPER_MEDIUM
+        elif asr_model == AsrModelType.WHISPER_BASE:
+            pipeline_options.asr_options = WHISPER_BASE
+        elif asr_model == AsrModelType.WHISPER_LARGE:
+            pipeline_options.asr_options = WHISPER_LARGE
+        elif asr_model == AsrModelType.WHISPER_TURBO:
+            pipeline_options.asr_options = WHISPER_TURBO
+        else:
+            _log.error(f"{asr_model} is not known")
+            raise ValueError(f"{asr_model} is not known")
+
+        _log.info(f"pipeline_options: {pipeline_options}")
+
+        audio_format_option = AudioFormatOption(
+            pipeline_cls=AsrPipeline,
+            pipeline_options=pipeline_options,
+        )
+
+        format_options = {
+            InputFormat.AUDIO: audio_format_option,
+        }
+
     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
+        # audio_pipeline_options.artifacts_path = artifacts_path
 
-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: pdf_format_option,
-        InputFormat.IMAGE: pdf_format_option,
-    }
     doc_converter = DocumentConverter(
         allowed_formats=from_formats,
         format_options=format_options,
@@ -614,6 +681,7 @@ def convert( # noqa: C901
 
     start_time = time.time()
 
+    _log.info(f"paths: {input_doc_paths}")
     conv_results = doc_converter.convert_all(
         input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
     )
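
With this wiring, audio files can be routed through the ASR pipeline directly from the command line. A hypothetical invocation, assuming typer's default kebab-case flag names for the `pipeline` and `asr_model` parameters shown above:

docling --pipeline asr --asr-model whisper_turbo recording.mp3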
docling/datamodel/asr_model_specs.py ADDED
@@ -0,0 +1,92 @@
+import logging
+from enum import Enum
+
+from pydantic import (
+    AnyUrl,
+)
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_asr_model import (
+    # AsrResponseFormat,
+    # ApiAsrOptions,
+    InferenceAsrFramework,
+    InlineAsrNativeWhisperOptions,
+    TransformersModelType,
+)
+
+_log = logging.getLogger(__name__)
+
+WHISPER_TINY = InlineAsrNativeWhisperOptions(
+    repo_id="tiny",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_SMALL = InlineAsrNativeWhisperOptions(
+    repo_id="small",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
+    repo_id="medium",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_BASE = InlineAsrNativeWhisperOptions(
+    repo_id="base",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_LARGE = InlineAsrNativeWhisperOptions(
+    repo_id="large",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_TURBO = InlineAsrNativeWhisperOptions(
+    repo_id="turbo",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+
+class AsrModelType(str, Enum):
+    WHISPER_TINY = "whisper_tiny"
+    WHISPER_SMALL = "whisper_small"
+    WHISPER_MEDIUM = "whisper_medium"
+    WHISPER_BASE = "whisper_base"
+    WHISPER_LARGE = "whisper_large"
+    WHISPER_TURBO = "whisper_turbo"
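
Each spec is an ordinary pydantic model, so a per-call variant can be derived without mutating the shared default. A sketch assuming pydantic v2's `model_copy` and the `language` field defined on the whisper options below:

from docling.datamodel.asr_model_specs import WHISPER_TINY

# Derive a Spanish-language variant; the shared WHISPER_TINY spec stays untouched.
whisper_tiny_es = WHISPER_TINY.model_copy(update={"language": "es"})
print(whisper_tiny_es.repo_id)  # "tiny"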
docling/datamodel/base_models.py CHANGED
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
     JSON_DOCLING = "json_docling"
+    AUDIO = "audio"
 
 
 class OutputFormat(str, Enum):
@@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.JSON_DOCLING: ["json"],
+    InputFormat.AUDIO: ["wav", "mp3"],
 }
 
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.JSON_DOCLING: ["application/json"],
+    InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }
 
 MimeTypeToFormat: dict[str, list[InputFormat]] = {
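
These tables are what format detection consults (see the `MimeTypeToFormat.get` lookup in `_DocumentConversionInput` further below), so the new entries are enough to route `.wav`/`.mp3` inputs to `InputFormat.AUDIO`:

from docling.datamodel.base_models import FormatToExtensions, FormatToMimeType, InputFormat

assert "mp3" in FormatToExtensions[InputFormat.AUDIO]
assert "audio/wav" in FormatToMimeType[InputFormat.AUDIO]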
@@ -253,11 +256,18 @@ class Page(BaseModel):
         return []
 
     def get_image(
-        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
+        self,
+        scale: float = 1.0,
+        max_size: Optional[int] = None,
+        cropbox: Optional[BoundingBox] = None,
     ) -> Optional[Image]:
         if self._backend is None:
             return self._image_cache.get(scale, None)
 
+        if max_size:
+            assert self.size is not None
+            scale = min(scale, max_size / max(self.size.as_tuple()))
+
         if scale not in self._image_cache:
             if cropbox is None:
                 self._image_cache[scale] = self._backend.get_page_image(scale=scale)
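
The new `max_size` argument clamps the effective scale so the rendered image's longer edge never exceeds `max_size` pixels. A worked sketch of the arithmetic above (page dimensions are hypothetical):

# US-Letter page in points; requested scale 2.0, capped at 1024 px
scale, max_size = 2.0, 1024
page_size = (612.0, 792.0)
scale = min(scale, max_size / max(page_size))  # -> ~1.293
# 1.293 * 792 ≈ 1024, so the long edge lands exactly at the cap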
@@ -291,7 +301,7 @@ class OpenAiChatMessage(BaseModel):
 class OpenAiResponseChoice(BaseModel):
     index: int
     message: OpenAiChatMessage
-    finish_reason: str
+    finish_reason: Optional[str]
 
 
 class OpenAiResponseUsage(BaseModel):
docling/datamodel/document.py CHANGED
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
         backend: Type[AbstractDocumentBackend]
         if format not in format_options.keys():
             _log.error(
-                f"Input document {obj.name} does not match any allowed format."
+                f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
             )
             backend = _DummyBackend
         else:
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
+        _log.info(f"detected formats: {formats}")
+
         if formats:
             if len(formats) == 1 and mime not in ("text/plain"):
                 return formats[0]
docling/datamodel/pipeline_options.py CHANGED
@@ -11,8 +11,13 @@ from pydantic import (
 )
 from typing_extensions import deprecated
 
+from docling.datamodel import asr_model_specs
+
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrOptions,
+)
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
@@ -202,7 +207,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
 
 # GraniteVision
 granite_picture_description = PictureDescriptionVlmOptions(
-    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
+    repo_id="ibm-granite/granite-vision-3.2-2b-preview",
     prompt="What is shown in this image?",
 )
 
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )
 
 
+class AsrPipelineOptions(PipelineOptions):
+    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
+    artifacts_path: Optional[Union[Path, str]] = None
+
+
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""
 
@@ -297,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     )
 
 
-class PdfPipeline(str, Enum):
+class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
+    ASR = "asr"
docling/datamodel/pipeline_options_asr_model.py ADDED
@@ -0,0 +1,57 @@
+from enum import Enum
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from pydantic import AnyUrl, BaseModel
+from typing_extensions import deprecated
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_vlm_model import (
+    # InferenceFramework,
+    TransformersModelType,
+)
+
+
+class BaseAsrOptions(BaseModel):
+    kind: str
+    # prompt: str
+
+
+class InferenceAsrFramework(str, Enum):
+    # MLX = "mlx"  # disabled for now
+    # TRANSFORMERS = "transformers"  # disabled for now
+    WHISPER = "whisper"
+
+
+class InlineAsrOptions(BaseAsrOptions):
+    kind: Literal["inline_model_options"] = "inline_model_options"
+
+    repo_id: str
+
+    verbose: bool = False
+    timestamps: bool = True
+
+    temperature: float = 0.0
+    max_new_tokens: int = 256
+    max_time_chunk: float = 30.0
+
+    torch_dtype: Optional[str] = None
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+class InlineAsrNativeWhisperOptions(InlineAsrOptions):
+    inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER
+
+    language: str = "en"
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+    ]
+    word_timestamps: bool = True
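
`repo_cache_folder` flattens a Hugging Face-style repo id into a filesystem-safe directory name. The repo id below is hypothetical (the bundled specs use bare names like "tiny"):

from docling.datamodel.pipeline_options_asr_model import InlineAsrNativeWhisperOptions

opts = InlineAsrNativeWhisperOptions(repo_id="openai/whisper-tiny")
print(opts.repo_cache_folder)  # "openai--whisper-tiny"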
docling/datamodel/pipeline_options_vlm_model.py CHANGED
@@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 class BaseVlmOptions(BaseModel):
     kind: str
     prompt: str
+    scale: float = 2.0
+    max_size: Optional[int] = None
 
 
 class ResponseFormat(str, Enum):
@@ -49,8 +51,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
 
-    scale: float = 2.0
-
     temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
@@ -76,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
     ) # Default to ollama
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
-    scale: float = 2.0
     timeout: float = 60
     concurrency: int = 1
     response_format: ResponseFormat
docling/document_converter.py CHANGED
@@ -19,6 +19,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.noop_backend import NoOpBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
@@ -41,6 +42,7 @@ from docling.datamodel.settings import (
     settings,
 )
 from docling.exceptions import ConversionError
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -118,6 +120,11 @@ class PdfFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
 
 
+class AudioFormatOption(FormatOption):
+    pipeline_cls: Type = AsrPipeline
+    backend: Type[AbstractDocumentBackend] = NoOpBackend
+
+
 def _get_default_option(format: InputFormat) -> FormatOption:
     format_to_default_options = {
         InputFormat.CSV: FormatOption(
@@ -156,6 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.JSON_DOCLING: FormatOption(
             pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
         ),
+        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
     }
     if (options := format_to_default_options.get(format)) is not None:
         return options
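
`AudioFormatOption` already defaults to `AsrPipeline` plus `NoOpBackend`, so programmatic use mirrors the CLI wiring above. A minimal sketch (the input path is hypothetical):

from docling.datamodel.base_models import InputFormat
from docling.document_converter import AudioFormatOption, DocumentConverter

# Route AUDIO inputs through the ASR pipeline via the no-op backend.
converter = DocumentConverter(
    format_options={InputFormat.AUDIO: AudioFormatOption()}
)
result = converter.convert("recording.mp3")  # transcribed by the ASR pipeline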
@@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
         with TimeRecorder(conv_res, "vlm"):
             assert page.size is not None
 
-            hi_res_image = page.get_image(scale=self.vlm_options.scale)
+            hi_res_image = page.get_image(
+                scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+            )
             assert hi_res_image is not None
             if hi_res_image:
                 if hi_res_image.mode != "RGB":
@@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
             coord_origin=bbox.coord_origin,
         )
 
-        page_ix = element_prov.page_no - 1
+        page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
         cropped_image = conv_res.pages[page_ix].get_image(
             scale=self.images_scale, cropbox=expanded_bbox
         )
@@ -124,7 +124,7 @@ class ReadingOrderModel:
         page_no = page.page_no + 1
         size = page.size
 
-        assert size is not None
+        assert size is not None, "Page size is not initialized."
 
         out_doc.add_page(page_no=page_no, size=size)
 
@@ -123,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
         with TimeRecorder(conv_res, "vlm"):
             assert page.size is not None
 
-            hi_res_image = page.get_image(scale=self.vlm_options.scale)
+            hi_res_image = page.get_image(
+                scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+            )
 
             # Define prompt structure
             prompt = self.formulate_prompt()
@@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
         with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
             assert page.size is not None
 
-            hi_res_image = page.get_image(scale=self.vlm_options.scale)
+            hi_res_image = page.get_image(
+                scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+            )
             if hi_res_image is not None:
                 im_width, im_height = hi_res_image.size