PyPI - docling - Versions diffs - 2.35.0__py3-none-any.whl → 2.36.0__py3-none-any.whl - Mend

docling 2.35.0__py3-none-any.whl → 2.36.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

docling/backend/xml/jats_backend.py +0 -0
docling/cli/main.py +12 -15
docling/datamodel/accelerator_options.py +68 -0
docling/datamodel/base_models.py +10 -8
docling/datamodel/pipeline_options.py +29 -161
docling/datamodel/pipeline_options_vlm_model.py +81 -0
docling/datamodel/vlm_model_specs.py +144 -0
docling/document_converter.py +5 -0
docling/models/api_vlm_model.py +1 -1
docling/models/base_ocr_model.py +2 -1
docling/models/code_formula_model.py +6 -11
docling/models/document_picture_classifier.py +6 -11
docling/models/easyocr_model.py +1 -2
docling/models/layout_model.py +6 -11
docling/models/ocr_mac_model.py +1 -1
docling/models/picture_description_api_model.py +1 -1
docling/models/picture_description_base_model.py +1 -1
docling/models/picture_description_vlm_model.py +7 -22
docling/models/rapid_ocr_model.py +1 -2
docling/models/table_structure_model.py +6 -12
docling/models/tesseract_ocr_cli_model.py +1 -1
docling/models/tesseract_ocr_model.py +1 -1
docling/models/utils/__init__.py +0 -0
docling/models/utils/hf_model_download.py +40 -0
docling/models/vlm_models_inline/__init__.py +0 -0
docling/models/vlm_models_inline/hf_transformers_model.py +194 -0
docling/models/{hf_mlx_model.py → vlm_models_inline/mlx_model.py} +56 -44
docling/pipeline/vlm_pipeline.py +228 -61
docling/utils/accelerator_utils.py +17 -2
docling/utils/model_downloader.py +13 -12
{docling-2.35.0.dist-info → docling-2.36.0.dist-info}/METADATA +54 -55
{docling-2.35.0.dist-info → docling-2.36.0.dist-info}/RECORD +46 -39
{docling-2.35.0.dist-info → docling-2.36.0.dist-info}/WHEEL +2 -1
docling-2.36.0.dist-info/entry_points.txt +6 -0
docling-2.36.0.dist-info/top_level.txt +1 -0
docling/models/hf_vlm_model.py +0 -182
docling-2.35.0.dist-info/entry_points.txt +0 -7
{docling-2.35.0.dist-info → docling-2.36.0.dist-info/licenses}/LICENSE +0 -0

docling/models/{hf_mlx_model.py → vlm_models_inline/mlx_model.py} RENAMED Viewed

@@ -4,29 +4,34 @@ from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
-from docling.datamodel.base_models import Page, VlmPrediction
-from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
+from docling.datamodel.accelerator_options import (
     AcceleratorOptions,
-    HuggingFaceVlmOptions,
 )
+from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
 from docling.models.base_model import BasePageModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
-class HuggingFaceMlxModel(BasePageModel):
+class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
     def __init__(
         self,
         enabled: bool,
         artifacts_path: Optional[Path],
         accelerator_options: AcceleratorOptions,
-        vlm_options: HuggingFaceVlmOptions,
+        vlm_options: InlineVlmOptions,
     ):
         self.enabled = enabled
         self.vlm_options = vlm_options
+        self.max_tokens = vlm_options.max_new_tokens
+        self.temperature = vlm_options.temperature
         if self.enabled:
             try:
@@ -39,42 +44,24 @@ class HuggingFaceMlxModel(BasePageModel):
                 )
             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
             self.apply_chat_template = apply_chat_template
             self.stream_generate = stream_generate
             # PARAMETERS:
             if artifacts_path is None:
-                artifacts_path = self.download_models(self.vlm_options.repo_id)
+                artifacts_path = self.download_models(
+                    self.vlm_options.repo_id,
+                )
             elif (artifacts_path / repo_cache_folder).exists():
                 artifacts_path = artifacts_path / repo_cache_folder
-            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
+            self.param_question = vlm_options.prompt
             ## Load the model
             self.vlm_model, self.processor = load(artifacts_path)
             self.config = load_config(artifacts_path)
-    @staticmethod
-    def download_models(
-        repo_id: str,
-        local_dir: Optional[Path] = None,
-        force: bool = False,
-        progress: bool = False,
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
-            repo_id=repo_id,
-            force_download=force,
-            local_dir=local_dir,
-            # revision="v0.0.1",
-        )
-        return Path(download_path)
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -83,12 +70,10 @@ class HuggingFaceMlxModel(BasePageModel):
             if not page._backend.is_valid():
                 yield page
             else:
-                with TimeRecorder(conv_res, "vlm"):
+                with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                     assert page.size is not None
-                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
-                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
+                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
@@ -104,16 +89,45 @@ class HuggingFaceMlxModel(BasePageModel):
                     )
                     start_time = time.time()
+                    _log.debug("start generating ...")
                     # Call model to generate:
+                    tokens: list[VlmPredictionToken] = []
                     output = ""
                     for token in self.stream_generate(
                         self.vlm_model,
                         self.processor,
                         prompt,
                         [hi_res_image],
-                        max_tokens=4096,
+                        max_tokens=self.max_tokens,
                         verbose=False,
+                        temp=self.temperature,
                     ):
+                        if len(token.logprobs.shape) == 1:
+                            tokens.append(
+                                VlmPredictionToken(
+                                    text=token.text,
+                                    token=token.token,
+                                    logprob=token.logprobs[token.token],
+                                )
+                            )
+                        elif (
+                            len(token.logprobs.shape) == 2
+                            and token.logprobs.shape[0] == 1
+                        ):
+                            tokens.append(
+                                VlmPredictionToken(
+                                    text=token.text,
+                                    token=token.token,
+                                    logprob=token.logprobs[0, token.token],
+                                )
+                            )
+                        else:
+                            _log.warning(
+                                f"incompatible shape for logprobs: {token.logprobs.shape}"
+                            )
                         output += token.text
                         if "</doctag>" in token.text:
                             break
@@ -121,15 +135,13 @@ class HuggingFaceMlxModel(BasePageModel):
                     generation_time = time.time() - start_time
                     page_tags = output
-                    _log.debug(f"Generation time {generation_time:.2f} seconds.")
-                    # inference_time = time.time() - start_time
-                    # tokens_per_second = num_tokens / generation_time
-                    # print("")
-                    # print(f"Page Inference Time: {inference_time:.2f} seconds")
-                    # print(f"Total tokens on page: {num_tokens:.2f}")
-                    # print(f"Tokens/sec: {tokens_per_second:.2f}")
-                    # print("")
-                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+                    _log.debug(
+                        f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
+                    )
+                    page.predictions.vlm_response = VlmPrediction(
+                        text=page_tags,
+                        generation_time=generation_time,
+                        generated_tokens=tokens,
+                    )
                 yield page

docling/pipeline/vlm_pipeline.py CHANGED Viewed

@@ -1,29 +1,46 @@
 import logging
+import re
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
-from docling_core.types import DoclingDocument
-from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
+from docling_core.types.doc import (
+    BoundingBox,
+    DocItem,
+    DoclingDocument,
+    ImageRef,
+    PictureItem,
+    ProvenanceItem,
+    TextItem,
+)
+from docling_core.types.doc.base import (
+    BoundingBox,
+    Size,
+)
 from docling_core.types.doc.document import DocTagsDocument
 from PIL import Image as PILImage
 from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
+    VlmPipelineOptions,
+)
+from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
-    HuggingFaceVlmOptions,
     InferenceFramework,
+    InlineVlmOptions,
     ResponseFormat,
-    VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
 from docling.models.api_vlm_model import ApiVlmModel
-from docling.models.hf_mlx_model import HuggingFaceMlxModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
+from docling.models.vlm_models_inline.hf_transformers_model import (
+    HuggingFaceTransformersVlmModel,
+)
+from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -66,8 +83,8 @@ class VlmPipeline(PaginatedPipeline):
                     vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
                 ),
             ]
-        elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
-            vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
+        elif isinstance(self.pipeline_options.vlm_options, InlineVlmOptions):
+            vlm_options = cast(InlineVlmOptions, self.pipeline_options.vlm_options)
             if vlm_options.inference_framework == InferenceFramework.MLX:
                 self.build_pipe = [
                     HuggingFaceMlxModel(
@@ -77,15 +94,19 @@ class VlmPipeline(PaginatedPipeline):
                         vlm_options=vlm_options,
                     ),
                 ]
-            else:
+            elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS:
                 self.build_pipe = [
-                    HuggingFaceVlmModel(
+                    HuggingFaceTransformersVlmModel(
                         enabled=True,  # must be always enabled for this pipeline to make sense.
                         artifacts_path=artifacts_path,
                         accelerator_options=pipeline_options.accelerator_options,
                         vlm_options=vlm_options,
                     ),
                 ]
+            else:
+                raise ValueError(
+                    f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
+                )
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
@@ -116,49 +137,19 @@ class VlmPipeline(PaginatedPipeline):
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.DOCTAGS
             ):
-                doctags_list = []
-                image_list = []
-                for page in conv_res.pages:
-                    predicted_doctags = ""
-                    img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
-                    if page.predictions.vlm_response:
-                        predicted_doctags = page.predictions.vlm_response.text
-                    if page.image:
-                        img = page.image
-                    image_list.append(img)
-                    doctags_list.append(predicted_doctags)
-                doctags_list_c = cast(List[Union[Path, str]], doctags_list)
-                image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
-                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
-                    doctags_list_c, image_list_c
-                )
-                conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
-                # If forced backend text, replace model predicted text with backend one
-                if self.force_backend_text:
-                    scale = self.pipeline_options.images_scale
-                    for element, _level in conv_res.document.iterate_items():
-                        if not isinstance(element, TextItem) or len(element.prov) == 0:
-                            continue
-                        page_ix = element.prov[0].page_no - 1
-                        page = conv_res.pages[page_ix]
-                        if not page.size:
-                            continue
-                        crop_bbox = (
-                            element.prov[0]
-                            .bbox.scaled(scale=scale)
-                            .to_top_left_origin(page_height=page.size.height * scale)
-                        )
-                        txt = self.extract_text_from_backend(page, crop_bbox)
-                        element.text = txt
-                        element.orig = txt
+                conv_res.document = self._turn_dt_into_doc(conv_res)
             elif (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.MARKDOWN
             ):
                 conv_res.document = self._turn_md_into_doc(conv_res)
+            elif (
+                self.pipeline_options.vlm_options.response_format == ResponseFormat.HTML
+            ):
+                conv_res.document = self._turn_html_into_doc(conv_res)
             else:
                 raise RuntimeError(
                     f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
@@ -192,23 +183,199 @@ class VlmPipeline(PaginatedPipeline):
         return conv_res
-    def _turn_md_into_doc(self, conv_res):
-        predicted_text = ""
-        for pg_idx, page in enumerate(conv_res.pages):
+    def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
+        doctags_list = []
+        image_list = []
+        for page in conv_res.pages:
+            predicted_doctags = ""
+            img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
             if page.predictions.vlm_response:
-                predicted_text += page.predictions.vlm_response.text + "\n\n"
-        response_bytes = BytesIO(predicted_text.encode("utf8"))
-        out_doc = InputDocument(
-            path_or_stream=response_bytes,
-            filename=conv_res.input.file.name,
-            format=InputFormat.MD,
-            backend=MarkdownDocumentBackend,
+                predicted_doctags = page.predictions.vlm_response.text
+            if page.image:
+                img = page.image
+            image_list.append(img)
+            doctags_list.append(predicted_doctags)
+        doctags_list_c = cast(List[Union[Path, str]], doctags_list)
+        image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
+        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+            doctags_list_c, image_list_c
         )
-        backend = MarkdownDocumentBackend(
-            in_doc=out_doc,
-            path_or_stream=response_bytes,
+        conv_res.document = DoclingDocument.load_from_doctags(
+            doctag_document=doctags_doc
         )
-        return backend.convert()
+        # If forced backend text, replace model predicted text with backend one
+        if page.size:
+            if self.force_backend_text:
+                scale = self.pipeline_options.images_scale
+                for element, _level in conv_res.document.iterate_items():
+                    if not isinstance(element, TextItem) or len(element.prov) == 0:
+                        continue
+                    crop_bbox = (
+                        element.prov[0]
+                        .bbox.scaled(scale=scale)
+                        .to_top_left_origin(page_height=page.size.height * scale)
+                    )
+                    txt = self.extract_text_from_backend(page, crop_bbox)
+                    element.text = txt
+                    element.orig = txt
+        return conv_res.document
+    def _turn_md_into_doc(self, conv_res):
+        def _extract_markdown_code(text):
+            """
+            Extracts text from markdown code blocks (enclosed in triple backticks).
+            If no code blocks are found, returns the original text.
+            Args:
+                text (str): Input text that may contain markdown code blocks
+            Returns:
+                str: Extracted code if code blocks exist, otherwise original text
+            """
+            # Regex pattern to match content between triple backticks
+            # This handles multiline content and optional language specifier
+            pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
+            # Search with DOTALL flag to match across multiple lines
+            mtch = re.search(pattern, text, re.DOTALL)
+            if mtch:
+                # Return only the content of the first capturing group
+                return mtch.group(1)
+            else:
+                # No code blocks found, return original text
+                return text
+        for pg_idx, page in enumerate(conv_res.pages):
+            page_no = pg_idx + 1  # FIXME: might be incorrect
+            predicted_text = ""
+            if page.predictions.vlm_response:
+                predicted_text = page.predictions.vlm_response.text + "\n\n"
+            predicted_text = _extract_markdown_code(text=predicted_text)
+            response_bytes = BytesIO(predicted_text.encode("utf8"))
+            out_doc = InputDocument(
+                path_or_stream=response_bytes,
+                filename=conv_res.input.file.name,
+                format=InputFormat.MD,
+                backend=MarkdownDocumentBackend,
+            )
+            backend = MarkdownDocumentBackend(
+                in_doc=out_doc,
+                path_or_stream=response_bytes,
+            )
+            page_doc = backend.convert()
+            if page.image is not None:
+                pg_width = page.image.width
+                pg_height = page.image.height
+            else:
+                pg_width = 1
+                pg_height = 1
+            conv_res.document.add_page(
+                page_no=page_no,
+                size=Size(width=pg_width, height=pg_height),
+                image=ImageRef.from_pil(image=page.image, dpi=72)
+                if page.image
+                else None,
+            )
+            for item, level in page_doc.iterate_items():
+                item.prov = [
+                    ProvenanceItem(
+                        page_no=pg_idx + 1,
+                        bbox=BoundingBox(
+                            t=0.0, b=0.0, l=0.0, r=0.0
+                        ),  # FIXME: would be nice not to have to "fake" it
+                        charspan=[0, 0],
+                    )
+                ]
+                conv_res.document.append_child_item(child=item)
+        return conv_res.document
+    def _turn_html_into_doc(self, conv_res):
+        def _extract_html_code(text):
+            """
+            Extracts text from markdown code blocks (enclosed in triple backticks).
+            If no code blocks are found, returns the original text.
+            Args:
+                text (str): Input text that may contain markdown code blocks
+            Returns:
+                str: Extracted code if code blocks exist, otherwise original text
+            """
+            # Regex pattern to match content between triple backticks
+            # This handles multiline content and optional language specifier
+            pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
+            # Search with DOTALL flag to match across multiple lines
+            mtch = re.search(pattern, text, re.DOTALL)
+            if mtch:
+                # Return only the content of the first capturing group
+                return mtch.group(1)
+            else:
+                # No code blocks found, return original text
+                return text
+        for pg_idx, page in enumerate(conv_res.pages):
+            page_no = pg_idx + 1  # FIXME: might be incorrect
+            predicted_text = ""
+            if page.predictions.vlm_response:
+                predicted_text = page.predictions.vlm_response.text + "\n\n"
+            predicted_text = _extract_html_code(text=predicted_text)
+            response_bytes = BytesIO(predicted_text.encode("utf8"))
+            out_doc = InputDocument(
+                path_or_stream=response_bytes,
+                filename=conv_res.input.file.name,
+                format=InputFormat.MD,
+                backend=HTMLDocumentBackend,
+            )
+            backend = HTMLDocumentBackend(
+                in_doc=out_doc,
+                path_or_stream=response_bytes,
+            )
+            page_doc = backend.convert()
+            if page.image is not None:
+                pg_width = page.image.width
+                pg_height = page.image.height
+            else:
+                pg_width = 1
+                pg_height = 1
+            conv_res.document.add_page(
+                page_no=page_no,
+                size=Size(width=pg_width, height=pg_height),
+                image=ImageRef.from_pil(image=page.image, dpi=72)
+                if page.image
+                else None,
+            )
+            for item, level in page_doc.iterate_items():
+                item.prov = [
+                    ProvenanceItem(
+                        page_no=pg_idx + 1,
+                        bbox=BoundingBox(
+                            t=0.0, b=0.0, l=0.0, r=0.0
+                        ),  # FIXME: would be nice not to have to "fake" it
+                        charspan=[0, 0],
+                    )
+                ]
+                conv_res.document.append_child_item(child=item)
+        return conv_res.document
     @classmethod
     def get_default_options(cls) -> VlmPipelineOptions:

docling/utils/accelerator_utils.py CHANGED Viewed

@@ -1,13 +1,16 @@
 import logging
+from typing import List, Optional
 import torch
-from docling.datamodel.pipeline_options import AcceleratorDevice
+from docling.datamodel.accelerator_options import AcceleratorDevice
 _log = logging.getLogger(__name__)
-def decide_device(accelerator_device: str) -> str:
+def decide_device(
+    accelerator_device: str, supported_devices: Optional[List[AcceleratorDevice]] = None
+) -> str:
     r"""
     Resolve the device based on the acceleration options and the available devices in the system.
@@ -20,6 +23,18 @@ def decide_device(accelerator_device: str) -> str:
     has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
     has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    if supported_devices is not None:
+        if has_cuda and AcceleratorDevice.CUDA not in supported_devices:
+            _log.info(
+                f"Removing CUDA from available devices because it is not in {supported_devices=}"
+            )
+            has_cuda = False
+        if has_mps and AcceleratorDevice.MPS not in supported_devices:
+            _log.info(
+                f"Removing MPS from available devices because it is not in {supported_devices=}"
+            )
+            has_mps = False
     if accelerator_device == AcceleratorDevice.AUTO.value:  # Handle 'auto'
         if has_cuda:
             device = "cuda:0"

docling/utils/model_downloader.py CHANGED Viewed

@@ -4,18 +4,20 @@ from typing import Optional
 from docling.datamodel.pipeline_options import (
     granite_picture_description,
-    smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
     smolvlm_picture_description,
 )
 from docling.datamodel.settings import settings
+from docling.datamodel.vlm_model_specs import (
+    SMOLDOCLING_MLX,
+    SMOLDOCLING_TRANSFORMERS,
+)
 from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
 from docling.models.easyocr_model import EasyOcrModel
-from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.models.layout_model import LayoutModel
 from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.table_structure_model import TableStructureModel
+from docling.models.utils.hf_model_download import download_hf_model
 _log = logging.getLogger(__name__)
@@ -75,7 +77,7 @@ def download_models(
     if with_smolvlm:
         _log.info("Downloading SmolVlm model...")
-        PictureDescriptionVlmModel.download_models(
+        download_hf_model(
             repo_id=smolvlm_picture_description.repo_id,
             local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
             force=force,
@@ -84,26 +86,25 @@ def download_models(
     if with_smoldocling:
         _log.info("Downloading SmolDocling model...")
-        HuggingFaceVlmModel.download_models(
-            repo_id=smoldocling_vlm_conversion_options.repo_id,
-            local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
+        download_hf_model(
+            repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
+            local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
             force=force,
             progress=progress,
         )
     if with_smoldocling_mlx:
         _log.info("Downloading SmolDocling MLX model...")
-        HuggingFaceVlmModel.download_models(
-            repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
-            local_dir=output_dir
-            / smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
+        download_hf_model(
+            repo_id=SMOLDOCLING_MLX.repo_id,
+            local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
             force=force,
             progress=progress,
         )
     if with_granite_vision:
         _log.info("Downloading Granite Vision model...")
-        PictureDescriptionVlmModel.download_models(
+        download_hf_model(
             repo_id=granite_picture_description.repo_id,
             local_dir=output_dir / granite_picture_description.repo_cache_folder,
             force=force,

docling 2.35.0__py3-none-any.whl → 2.36.0__py3-none-any.whl

docling 2.35.0__py3-none-any.whl → 2.36.0__py3-none-any.whl