PyPI - docling - Versions diffs - 2.39.0__tar.gz → 2.41.0__tar.gz - Mend

docling 2.39.0__tar.gz → 2.41.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

{docling-2.39.0 → docling-2.41.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.39.0
+Version: 2.41.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -26,9 +26,9 @@ Requires-Python: <4.0,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
-Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
-Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
+Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
 Requires-Dist: docling-parse<5.0.0,>=4.0.0
+Requires-Dist: docling-ibm-models<4,>=3.6.0
 Requires-Dist: filetype<2.0.0,>=1.2.0
 Requires-Dist: pypdfium2<5.0.0,>=4.30.0
 Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
@@ -57,12 +57,12 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
 Provides-Extra: vlm
 Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
 Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
-Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
+Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
 Provides-Extra: rapidocr
 Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
 Provides-Extra: asr
-Requires-Dist: openai-whisper>=20240930; extra == "asr"
+Requires-Dist: openai-whisper>=20250625; extra == "asr"
 Dynamic: license-file
 <p align="center">

{docling-2.39.0 → docling-2.41.0}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

@@ -187,7 +187,17 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
     def unload(self):
         super().unload()
-        self.dp_doc.unload()
-        with pypdfium2_lock:
-            self._pdoc.close()
-        self._pdoc = None
+        # Unload docling-parse document first
+        if self.dp_doc is not None:
+            self.dp_doc.unload()
+            self.dp_doc = None
+        # Then close pypdfium2 document with proper locking
+        if self._pdoc is not None:
+            with pypdfium2_lock:
+                try:
+                    self._pdoc.close()
+                except Exception:
+                    # Ignore cleanup errors
+                    pass
+            self._pdoc = None

{docling-2.39.0 → docling-2.41.0}/docling/backend/msexcel_backend.py RENAMED Viewed

@@ -337,10 +337,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         # Collect the data within the bounds
         data = []
         visited_cells: set[tuple[int, int]] = set()
-        for ri in range(start_row, max_row + 1):
-            for rj in range(start_col, max_col + 1):
-                cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing
+        for ri, row in enumerate(
+            sheet.iter_rows(
+                min_row=start_row + 1,  # start_row is 0-based but iter_rows is 1-based
+                max_row=max_row + 1,
+                min_col=start_col + 1,
+                max_col=max_col + 1,
+                values_only=False,
+            ),
+            start_row,
+        ):
+            for rj, cell in enumerate(row, start_col):
                 # Check if the cell belongs to a merged range
                 row_span = 1
                 col_span = 1
@@ -397,10 +404,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         """
         max_row: int = start_row
-        while max_row < sheet.max_row - 1:
-            # Get the cell value or check if it is part of a merged cell
-            cell = sheet.cell(row=max_row + 2, column=start_col + 1)
+        for ri, (cell,) in enumerate(
+            sheet.iter_rows(
+                min_row=start_row + 2,
+                max_row=sheet.max_row,
+                min_col=start_col + 1,
+                max_col=start_col + 1,
+                values_only=False,
+            ),
+            start_row + 1,
+        ):
             # Check if the cell is part of a merged range
             merged_range = next(
                 (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
@@ -414,7 +427,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             if merged_range:
                 max_row = max(max_row, merged_range.max_row - 1)
             else:
-                max_row += 1
+                max_row = ri
         return max_row
@@ -433,10 +446,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         """
         max_col: int = start_col
-        while max_col < sheet.max_column - 1:
-            # Get the cell value or check if it is part of a merged cell
-            cell = sheet.cell(row=start_row + 1, column=max_col + 2)
+        for rj, (cell,) in enumerate(
+            sheet.iter_cols(
+                min_row=start_row + 1,
+                max_row=start_row + 1,
+                min_col=start_col + 2,
+                max_col=sheet.max_column,
+                values_only=False,
+            ),
+            start_col + 1,
+        ):
             # Check if the cell is part of a merged range
             merged_range = next(
                 (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
@@ -450,7 +469,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             if merged_range:
                 max_col = max(max_col, merged_range.max_col - 1)
             else:
-                max_col += 1
+                max_col = rj
         return max_col

{docling-2.39.0 → docling-2.41.0}/docling/datamodel/asr_model_specs.py RENAMED Viewed

@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )

{docling-2.39.0 → docling-2.41.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -12,6 +12,7 @@ from docling_core.types.doc import (
     Size,
     TableCell,
 )
+from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_core.types.io import (
     DocumentStream,
@@ -19,7 +20,14 @@ from docling_core.types.io import (
 # DO NOT REMOVE; explicitly exposed from this location
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict, Field, computed_field
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    FieldSerializationInfo,
+    computed_field,
+    field_serializer,
+)
 if TYPE_CHECKING:
     from docling.backend.pdf_backend import PdfPageBackend
@@ -142,6 +150,10 @@ class Cluster(BaseModel):
     cells: List[TextCell] = []
     children: List["Cluster"] = []  # Add child cluster support
+    @field_serializer("confidence")
+    def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
+        return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
 class BasePageElement(BaseModel):
     label: DocItemLabel
@@ -194,6 +206,16 @@ class FigureElement(BasePageElement):
     predicted_class: Optional[str] = None
     confidence: Optional[float] = None
+    @field_serializer("confidence")
+    def _serialize(
+        self, value: Optional[float], info: FieldSerializationInfo
+    ) -> Optional[float]:
+        return (
+            round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
+            if value is not None
+            else None
+        )
 class FigureClassificationPrediction(BaseModel):
     figure_count: int = 0

docling-2.41.0/docling/datamodel/layout_model_specs.py ADDED Viewed

@@ -0,0 +1,90 @@
+import logging
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+from pydantic import BaseModel
+from docling.datamodel.accelerator_options import AcceleratorDevice
+_log = logging.getLogger(__name__)
+class LayoutModelConfig(BaseModel):
+    name: str
+    repo_id: str
+    revision: str
+    model_path: str
+    supported_devices: list[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+    @property
+    def model_repo_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+# HuggingFace Layout Models
+# Default Docling Layout Model
+DOCLING_LAYOUT_V2 = LayoutModelConfig(
+    name="docling_layout_v2",
+    repo_id="ds4sd/docling-layout-old",
+    revision="main",
+    model_path="",
+)
+DOCLING_LAYOUT_HERON = LayoutModelConfig(
+    name="docling_layout_heron",
+    repo_id="ds4sd/docling-layout-heron",
+    revision="main",
+    model_path="",
+)
+DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
+    name="docling_layout_heron_101",
+    repo_id="ds4sd/docling-layout-heron-101",
+    revision="main",
+    model_path="",
+)
+DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
+    name="docling_layout_egret_medium",
+    repo_id="ds4sd/docling-layout-egret-medium",
+    revision="main",
+    model_path="",
+)
+DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
+    name="docling_layout_egret_large",
+    repo_id="ds4sd/docling-layout-egret-large",
+    revision="main",
+    model_path="",
+)
+DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
+    name="docling_layout_egret_xlarge",
+    repo_id="ds4sd/docling-layout-egret-xlarge",
+    revision="main",
+    model_path="",
+)
+# Example for a hypothetical alternative model
+# ALTERNATIVE_LAYOUT = LayoutModelConfig(
+#     name="alternative_layout",
+#     repo_id="someorg/alternative-layout",
+#     revision="main",
+#     model_path="model_artifacts/layout_alt",
+# )
+class LayoutModelType(str, Enum):
+    DOCLING_LAYOUT_V2 = "docling_layout_v2"
+    DOCLING_LAYOUT_HERON = "docling_layout_heron"
+    DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
+    DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
+    DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
+    DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
+    # ALTERNATIVE_LAYOUT = "alternative_layout"

{docling-2.39.0 → docling-2.41.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -15,6 +16,15 @@ from docling.datamodel import asr_model_specs
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.layout_model_specs import (
+    DOCLING_LAYOUT_EGRET_LARGE,
+    DOCLING_LAYOUT_EGRET_MEDIUM,
+    DOCLING_LAYOUT_EGRET_XLARGE,
+    DOCLING_LAYOUT_HERON,
+    DOCLING_LAYOUT_HERON_101,
+    DOCLING_LAYOUT_V2,
+    LayoutModelConfig,
+)
 from docling.datamodel.pipeline_options_asr_model import (
     InlineAsrOptions,
 )
@@ -265,6 +275,13 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )
+class LayoutOptions(BaseModel):
+    """Options for layout processing."""
+    create_orphan_clusters: bool = True  # Whether to create clusters for orphaned cells
+    model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
 class AsrPipelineOptions(PipelineOptions):
     asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
     artifacts_path: Optional[Union[Path, str]] = None
@@ -289,6 +306,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     picture_description_options: PictureDescriptionBaseOptions = (
         smolvlm_picture_description
     )
+    layout_options: LayoutOptions = LayoutOptions()
     images_scale: float = 1.0
     generate_page_images: bool = False

{docling-2.39.0 → docling-2.41.0}/docling/datamodel/pipeline_options_vlm_model.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: str
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
     scale: float = 2.0
     max_size: Optional[int] = None
+    temperature: float = 0.0
 class ResponseFormat(str, Enum):
@@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
+class TransformersPromptStyle(str, Enum):
+    CHAT = "chat"
+    RAW = "raw"
 class InlineVlmOptions(BaseVlmOptions):
@@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
     inference_framework: InferenceFramework
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
     response_format: ResponseFormat
     torch_dtype: Optional[str] = None
@@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
-    temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}

{docling-2.39.0 → docling-2.41.0}/docling/models/api_vlm_model.py RENAMED Viewed

@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
             self.timeout = self.vlm_options.timeout
             self.concurrency = self.vlm_options.concurrency
-            self.prompt_content = (
-                f"This is a page from a document.\n{self.vlm_options.prompt}"
-            )
             self.params = {
                 **self.vlm_options.params,
-                "temperature": 0,
+                "temperature": self.vlm_options.temperature,
             }
     def __call__(
@@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
+                    if callable(self.vlm_options.prompt):
+                        prompt = self.vlm_options.prompt(page.parsed_page)
+                    else:
+                        prompt = self.vlm_options.prompt
                     page_tags = api_image_request(
                         image=hi_res_image,
-                        prompt=self.prompt_content,
+                        prompt=prompt,
                         url=self.vlm_options.url,
                         timeout=self.timeout,
                         headers=self.vlm_options.headers,

{docling-2.39.0 → docling-2.41.0}/docling/models/base_ocr_model.py RENAMED Viewed

@@ -3,14 +3,13 @@ import logging
 from abc import abstractmethod
 from collections.abc import Iterable
 from pathlib import Path
-from typing import List, Optional, Type
+from typing import TYPE_CHECKING, List, Optional, Type
 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import TextCell
 from PIL import Image, ImageDraw
 from rtree import index
-from scipy.ndimage import binary_dilation, find_objects, label
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
@@ -31,11 +30,16 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
         options: OcrOptions,
         accelerator_options: AcceleratorOptions,
     ):
+        # Make sure any delay/error from import occurs on ocr model init and not first use
+        from scipy.ndimage import binary_dilation, find_objects, label
         self.enabled = enabled
         self.options = options
     # Computes the optimum amount and coordinates of rectangles to OCR on a given page
     def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
+        from scipy.ndimage import binary_dilation, find_objects, label
         BITMAP_COVERAGE_TRESHOLD = 0.75
         assert page.size is not None

{docling-2.39.0 → docling-2.41.0}/docling/models/document_picture_classifier.py RENAMED Viewed

@@ -14,7 +14,8 @@ from PIL import Image
 from pydantic import BaseModel
 from docling.datamodel.accelerator_options import AcceleratorOptions
-from docling.models.base_model import BaseEnrichmentModel
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement
+from docling.models.base_model import BaseItemAndImageEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
     kind: Literal["document_picture_classifier"] = "document_picture_classifier"
-class DocumentPictureClassifier(BaseEnrichmentModel):
+class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
     """
     A model for classifying pictures in documents.
@@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
     def __call__(
         self,
         doc: DoclingDocument,
-        element_batch: Iterable[NodeItem],
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
     ) -> Iterable[NodeItem]:
         """
         Processes a batch of elements and enriches them with classification predictions.
@@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         ----------
         doc : DoclingDocument
             The document containing the elements to be processed.
-        element_batch : Iterable[NodeItem]
+        element_batch : Iterable[ItemAndImageEnrichmentElement]
             A batch of pictures to classify.
         Returns
@@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         """
         if not self.enabled:
             for element in element_batch:
-                yield element
+                yield element.item
             return
         images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[PictureItem] = []
         for el in element_batch:
-            assert isinstance(el, PictureItem)
-            elements.append(el)
-            img = el.get_image(doc)
-            assert img is not None
-            images.append(img)
+            assert isinstance(el.item, PictureItem)
+            elements.append(el.item)
+            images.append(el.image)
         outputs = self.document_picture_classifier.predict(images)
-        for element, output in zip(elements, outputs):
-            element.annotations.append(
+        for item, output in zip(elements, outputs):
+            item.annotations.append(
                 PictureClassificationData(
                     provenance="DocumentPictureClassifier",
                     predicted_classes=[
@@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                 )
             )
-            yield element
+            yield item

{docling-2.39.0 → docling-2.41.0}/docling/models/layout_model.py RENAMED Viewed

@@ -7,12 +7,13 @@ from typing import Optional
 import numpy as np
 from docling_core.types.doc import DocItemLabel
-from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import Image
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
+from docling.datamodel.pipeline_options import LayoutOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
@@ -25,9 +26,6 @@ _log = logging.getLogger(__name__)
 class LayoutModel(BasePageModel):
-    _model_repo_folder = "ds4sd--docling-models"
-    _model_path = "model_artifacts/layout"
     TEXT_ELEM_LABELS = [
         DocItemLabel.TEXT,
         DocItemLabel.FOOTNOTE,
@@ -49,28 +47,38 @@ class LayoutModel(BasePageModel):
     CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
     def __init__(
-        self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
+        self,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        options: LayoutOptions,
     ):
+        from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
+        self.options = options
         device = decide_device(accelerator_options.device)
+        layout_model_config = options.model_spec
+        model_repo_folder = layout_model_config.model_repo_folder
+        model_path = layout_model_config.model_path
         if artifacts_path is None:
-            artifacts_path = self.download_models() / self._model_path
+            artifacts_path = (
+                self.download_models(layout_model_config=layout_model_config)
+                / model_path
+            )
         else:
-            # will become the default in the future
-            if (artifacts_path / self._model_repo_folder).exists():
-                artifacts_path = (
-                    artifacts_path / self._model_repo_folder / self._model_path
-                )
-            elif (artifacts_path / self._model_path).exists():
+            if (artifacts_path / model_repo_folder).exists():
+                artifacts_path = artifacts_path / model_repo_folder / model_path
+            elif (artifacts_path / model_path).exists():
                 warnings.warn(
                     "The usage of artifacts_path containing directly "
-                    f"{self._model_path} is deprecated. Please point "
+                    f"{model_path} is deprecated. Please point "
                     "the artifacts_path to the parent containing "
-                    f"the {self._model_repo_folder} folder.",
+                    f"the {model_repo_folder} folder.",
                     DeprecationWarning,
                     stacklevel=3,
                 )
-                artifacts_path = artifacts_path / self._model_path
+                artifacts_path = artifacts_path / model_path
         self.layout_predictor = LayoutPredictor(
             artifact_path=str(artifacts_path),
@@ -83,10 +91,11 @@ class LayoutModel(BasePageModel):
         local_dir: Optional[Path] = None,
         force: bool = False,
         progress: bool = False,
+        layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2,
     ) -> Path:
         return download_hf_model(
-            repo_id="ds4sd/docling-models",
-            revision="v2.2.0",
+            repo_id=layout_model_config.repo_id,
+            revision=layout_model_config.revision,
             local_dir=local_dir,
             force=force,
             progress=progress,
@@ -176,7 +185,7 @@ class LayoutModel(BasePageModel):
                     # Apply postprocessing
                     processed_clusters, processed_cells = LayoutPostprocessor(
-                        page, clusters
+                        page, clusters, self.options
                     ).postprocess()
                     # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally

docling 2.39.0__tar.gz → 2.41.0__tar.gz

docling 2.39.0__tar.gz → 2.41.0__tar.gz