docling 2.35.0__py3-none-any.whl → 2.36.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their public registry.
- docling/backend/xml/jats_backend.py +0 -0
- docling/cli/main.py +12 -15
- docling/datamodel/accelerator_options.py +68 -0
- docling/datamodel/base_models.py +10 -8
- docling/datamodel/pipeline_options.py +29 -161
- docling/datamodel/pipeline_options_vlm_model.py +81 -0
- docling/datamodel/vlm_model_specs.py +144 -0
- docling/document_converter.py +5 -0
- docling/models/api_vlm_model.py +1 -1
- docling/models/base_ocr_model.py +2 -1
- docling/models/code_formula_model.py +6 -11
- docling/models/document_picture_classifier.py +6 -11
- docling/models/easyocr_model.py +1 -2
- docling/models/layout_model.py +6 -11
- docling/models/ocr_mac_model.py +1 -1
- docling/models/picture_description_api_model.py +1 -1
- docling/models/picture_description_base_model.py +1 -1
- docling/models/picture_description_vlm_model.py +7 -22
- docling/models/rapid_ocr_model.py +1 -2
- docling/models/table_structure_model.py +6 -12
- docling/models/tesseract_ocr_cli_model.py +1 -1
- docling/models/tesseract_ocr_model.py +1 -1
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/hf_model_download.py +40 -0
- docling/models/vlm_models_inline/__init__.py +0 -0
- docling/models/vlm_models_inline/hf_transformers_model.py +194 -0
- docling/models/{hf_mlx_model.py → vlm_models_inline/mlx_model.py} +56 -44
- docling/pipeline/vlm_pipeline.py +228 -61
- docling/utils/accelerator_utils.py +17 -2
- docling/utils/model_downloader.py +13 -12
- {docling-2.35.0.dist-info → docling-2.36.0.dist-info}/METADATA +54 -55
- {docling-2.35.0.dist-info → docling-2.36.0.dist-info}/RECORD +46 -39
- {docling-2.35.0.dist-info → docling-2.36.0.dist-info}/WHEEL +2 -1
- docling-2.36.0.dist-info/entry_points.txt +6 -0
- docling-2.36.0.dist-info/top_level.txt +1 -0
- docling/models/hf_vlm_model.py +0 -182
- docling-2.35.0.dist-info/entry_points.txt +0 -7
- {docling-2.35.0.dist-info → docling-2.36.0.dist-info/licenses}/LICENSE +0 -0
docling/models/code_formula_model.py
CHANGED
@@ -16,9 +16,10 @@ from docling_core.types.doc.labels import CodeLanguageLabel
 from PIL import Image, ImageOps
 from pydantic import BaseModel
 
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement
-from docling.datamodel.pipeline_options import AcceleratorOptions
 from docling.models.base_model import BaseItemAndImageEnrichmentModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 
 
@@ -117,20 +118,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         force: bool = False,
         progress: bool = False,
     ) -> Path:
-
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/CodeFormula",
-            force_download=force,
-            local_dir=local_dir,
             revision="v1.0.2",
+            local_dir=local_dir,
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
         """
         Determines if a given element in a document can be processed by the model.
docling/models/document_picture_classifier.py
CHANGED
@@ -13,8 +13,9 @@ from docling_core.types.doc import (
 from PIL import Image
 from pydantic import BaseModel
 
-from docling.datamodel.pipeline_options import AcceleratorOptions
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.models.base_model import BaseEnrichmentModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 
 
@@ -105,20 +106,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
     def download_models(
         local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
     ) -> Path:
-
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/DocumentFigureClassifier",
-            force_download=force,
-            local_dir=local_dir,
             revision="v1.0.1",
+            local_dir=local_dir,
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
         """
         Determines if the given element can be processed by the classifier.
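For downstream users the refactor above is behavior-preserving: `download_models()` keeps its signature and still returns the local weights path, it just delegates to the shared helper introduced later in this diff. A minimal sketch of prefetching the classifier weights (the target directory below is illustrative, not taken from the diff):

```python
from pathlib import Path

from docling.models.document_picture_classifier import DocumentPictureClassifier

# Prefetch the ds4sd/DocumentFigureClassifier weights; returns the local snapshot path.
weights_dir = DocumentPictureClassifier.download_models(
    local_dir=Path("./models/DocumentFigureClassifier"),  # illustrative location
    force=False,
    progress=True,
)
print(weights_dir)
```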
docling/models/easyocr_model.py
CHANGED
@@ -9,11 +9,10 @@ import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
 
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
-    AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
 )
docling/models/layout_model.py
CHANGED
@@ -10,11 +10,12 @@ from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import Image
 
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import AcceleratorOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.layout_postprocessor import LayoutPostprocessor
 from docling.utils.profiling import TimeRecorder
@@ -83,20 +84,14 @@ class LayoutModel(BasePageModel):
         force: bool = False,
         progress: bool = False,
     ) -> Path:
-
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/docling-models",
-            force_download=force,
+            revision="v2.2.0",
             local_dir=local_dir,
-            revision="v2.2.0",
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def draw_clusters_and_cells_side_by_side(
         self, conv_res, page, clusters, mode_prefix: str, show: bool = False
     ):
docling/models/ocr_mac_model.py
CHANGED
@@ -8,10 +8,10 @@ from typing import Optional, Type
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
 
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorOptions,
     OcrMacOptions,
     OcrOptions,
 )
docling/models/picture_description_api_model.py
CHANGED
@@ -5,8 +5,8 @@ from typing import Optional, Type, Union
 
 from PIL import Image
 
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.pipeline_options import (
-    AcceleratorOptions,
     PictureDescriptionApiOptions,
     PictureDescriptionBaseOptions,
 )
docling/models/picture_description_base_model.py
CHANGED
@@ -13,8 +13,8 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
 )
 from PIL import Image
 
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.pipeline_options import (
-    AcceleratorOptions,
     PictureDescriptionBaseOptions,
 )
 from docling.models.base_model import (
docling/models/picture_description_vlm_model.py
CHANGED
@@ -4,16 +4,21 @@ from typing import Optional, Type, Union
 
 from PIL import Image
 
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.pipeline_options import (
-    AcceleratorOptions,
     PictureDescriptionBaseOptions,
     PictureDescriptionVlmOptions,
 )
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
 from docling.utils.accelerator_utils import decide_device
 
 
-class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
+class PictureDescriptionVlmModel(
+    PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
+):
     @classmethod
     def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
         return PictureDescriptionVlmOptions
@@ -66,26 +71,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
 
         self.provenance = f"{self.options.repo_id}"
 
-    @staticmethod
-    def download_models(
-        repo_id: str,
-        local_dir: Optional[Path] = None,
-        force: bool = False,
-        progress: bool = False,
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
-            repo_id=repo_id,
-            force_download=force,
-            local_dir=local_dir,
-        )
-
-        return Path(download_path)
-
     def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
         from transformers import GenerationConfig
 
docling/models/rapid_ocr_model.py
CHANGED
@@ -7,11 +7,10 @@ import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
 
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
-    AcceleratorOptions,
     OcrOptions,
     RapidOcrOptions,
 )
docling/models/table_structure_model.py
CHANGED
@@ -13,16 +13,16 @@ from docling_core.types.doc.page import (
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
 
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
-    AcceleratorOptions,
     TableFormerMode,
     TableStructureOptions,
 )
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
@@ -90,20 +90,14 @@ class TableStructureModel(BasePageModel):
     def download_models(
         local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
     ) -> Path:
-
-        from huggingface_hub.utils import disable_progress_bars
-
-        if not progress:
-            disable_progress_bars()
-        download_path = snapshot_download(
+        return download_hf_model(
             repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
             revision="v2.2.0",
+            local_dir=local_dir,
+            force=force,
+            progress=progress,
         )
 
-        return Path(download_path)
-
     def draw_table_and_cells(
         self,
         conv_res: ConversionResult,
docling/models/tesseract_ocr_cli_model.py
CHANGED
@@ -13,10 +13,10 @@ import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import TextCell
 
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorOptions,
     OcrOptions,
     TesseractCliOcrOptions,
 )
docling/models/tesseract_ocr_model.py
CHANGED
@@ -7,10 +7,10 @@ from typing import Iterable, Optional, Type
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import TextCell
 
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorOptions,
     OcrOptions,
     TesseractOcrOptions,
 )
docling/models/utils/__init__.py
File without changes
docling/models/utils/hf_model_download.py
@@ -0,0 +1,40 @@
+import logging
+from pathlib import Path
+from typing import Optional
+
+_log = logging.getLogger(__name__)
+
+
+def download_hf_model(
+    repo_id: str,
+    local_dir: Optional[Path] = None,
+    force: bool = False,
+    progress: bool = False,
+    revision: Optional[str] = None,
+) -> Path:
+    from huggingface_hub import snapshot_download
+    from huggingface_hub.utils import disable_progress_bars
+
+    if not progress:
+        disable_progress_bars()
+    download_path = snapshot_download(
+        repo_id=repo_id,
+        force_download=force,
+        local_dir=local_dir,
+        revision=revision,
+    )
+
+    return Path(download_path)
+
+
+class HuggingFaceModelDownloadMixin:
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        return download_hf_model(
+            repo_id=repo_id, local_dir=local_dir, force=force, progress=progress
+        )
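The new module above centralizes the snapshot_download/disable_progress_bars boilerplate that each model class previously duplicated, and the mixin exposes it as a uniform `download_models()` static method. A minimal sketch of calling the helper directly, reusing the repo id and revision pinned in the CodeFormula hunk above (the target directory is illustrative, not from the diff):

```python
from pathlib import Path

from docling.models.utils.hf_model_download import download_hf_model

# Fetch a pinned model snapshot into a local folder and get back its path.
weights_dir = download_hf_model(
    repo_id="ds4sd/CodeFormula",             # repo and revision as pinned above
    revision="v1.0.2",
    local_dir=Path("./models/CodeFormula"),  # illustrative target directory
    force=False,
    progress=True,                           # keep huggingface_hub progress bars enabled
)
print(weights_dir)
```

Classes that inherit `HuggingFaceModelDownloadMixin`, such as the new `HuggingFaceTransformersVlmModel` below, get the same behaviour through `download_models(repo_id=...)`.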
docling/models/vlm_models_inline/__init__.py
File without changes
docling/models/vlm_models_inline/hf_transformers_model.py
@@ -0,0 +1,194 @@
+import importlib.metadata
+import logging
+import time
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any, Optional
+
+from docling.datamodel.accelerator_options import (
+    AcceleratorOptions,
+)
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options_vlm_model import (
+    InlineVlmOptions,
+    TransformersModelType,
+)
+from docling.models.base_model import BasePageModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        vlm_options: InlineVlmOptions,
+    ):
+        self.enabled = enabled
+
+        self.vlm_options = vlm_options
+
+        if self.enabled:
+            import torch
+            from transformers import (
+                AutoModel,
+                AutoModelForCausalLM,
+                AutoModelForVision2Seq,
+                AutoProcessor,
+                BitsAndBytesConfig,
+                GenerationConfig,
+            )
+
+            transformers_version = importlib.metadata.version("transformers")
+            if (
+                self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct"
+                and transformers_version >= "4.52.0"
+            ):
+                raise NotImplementedError(
+                    f"Phi 4 only works with transformers<4.52.0 but you have {transformers_version=}. Please downgrage running pip install -U 'transformers<4.52.0'."
+                )
+
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=vlm_options.supported_devices,
+            )
+            _log.debug(f"Available device for VLM: {self.device}")
+
+            self.use_cache = vlm_options.use_kv_cache
+            self.max_new_tokens = vlm_options.max_new_tokens
+            self.temperature = vlm_options.temperature
+
+            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+
+            if artifacts_path is None:
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            self.param_quantization_config: Optional[BitsAndBytesConfig] = None
+            if vlm_options.quantized:
+                self.param_quantization_config = BitsAndBytesConfig(
+                    load_in_8bit=vlm_options.load_in_8bit,
+                    llm_int8_threshold=vlm_options.llm_int8_threshold,
+                )
+
+            model_cls: Any = AutoModel
+            if (
+                self.vlm_options.transformers_model_type
+                == TransformersModelType.AUTOMODEL_CAUSALLM
+            ):
+                model_cls = AutoModelForCausalLM
+            elif (
+                self.vlm_options.transformers_model_type
+                == TransformersModelType.AUTOMODEL_VISION2SEQ
+            ):
+                model_cls = AutoModelForVision2Seq
+
+            self.processor = AutoProcessor.from_pretrained(
+                artifacts_path,
+                trust_remote_code=vlm_options.trust_remote_code,
+            )
+            self.vlm_model = model_cls.from_pretrained(
+                artifacts_path,
+                device_map=self.device,
+                _attn_implementation=(
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "eager"
+                ),
+                trust_remote_code=vlm_options.trust_remote_code,
+            )
+
+            # Load generation config
+            self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "vlm"):
+                    assert page.size is not None
+
+                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+
+                    # Define prompt structure
+                    prompt = self.formulate_prompt()
+
+                    inputs = self.processor(
+                        text=prompt, images=[hi_res_image], return_tensors="pt"
+                    ).to(self.device)
+
+                    start_time = time.time()
+                    # Call model to generate:
+                    generated_ids = self.vlm_model.generate(
+                        **inputs,
+                        max_new_tokens=self.max_new_tokens,
+                        use_cache=self.use_cache,
+                        temperature=self.temperature,
+                        generation_config=self.generation_config,
+                        **self.vlm_options.extra_generation_config,
+                    )
+
+                    generation_time = time.time() - start_time
+                    generated_texts = self.processor.batch_decode(
+                        generated_ids[:, inputs["input_ids"].shape[1] :],
+                        skip_special_tokens=False,
+                    )[0]
+
+                    num_tokens = len(generated_ids[0])
+                    _log.debug(
+                        f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
+                    )
+                    page.predictions.vlm_response = VlmPrediction(
+                        text=generated_texts,
+                        generation_time=generation_time,
+                    )
+
+                yield page
+
+    def formulate_prompt(self) -> str:
+        """Formulate a prompt for the VLM."""
+
+        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+            _log.debug("Using specialized prompt for Phi-4")
+            # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
+
+            user_prompt = "<|user|>"
+            assistant_prompt = "<|assistant|>"
+            prompt_suffix = "<|end|>"
+
+            prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
+            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
+
+            return prompt
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "This is a page from a document.",
+                    },
+                    {"type": "image"},
+                    {"type": "text", "text": self.vlm_options.prompt},
+                ],
+            }
+        ]
+        prompt = self.processor.apply_chat_template(
+            messages, add_generation_prompt=False
+        )
+        return prompt