docling 2.19.0__tar.gz → 2.21.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.19.0 → docling-2.21.0}/PKG-INFO +5 -2
- {docling-2.19.0 → docling-2.21.0}/docling/cli/main.py +5 -0
- {docling-2.19.0 → docling-2.21.0}/docling/cli/models.py +2 -0
- {docling-2.19.0 → docling-2.21.0}/docling/datamodel/pipeline_options.py +52 -2
- {docling-2.19.0 → docling-2.21.0}/docling/models/base_model.py +2 -2
- {docling-2.19.0 → docling-2.21.0}/docling/models/ds_glm_model.py +60 -2
- {docling-2.19.0 → docling-2.21.0}/docling/models/easyocr_model.py +0 -2
- docling-2.21.0/docling/models/picture_description_api_model.py +101 -0
- docling-2.21.0/docling/models/picture_description_base_model.py +64 -0
- docling-2.21.0/docling/models/picture_description_vlm_model.py +109 -0
- {docling-2.19.0 → docling-2.21.0}/docling/pipeline/standard_pdf_pipeline.py +41 -1
- {docling-2.19.0 → docling-2.21.0}/docling/utils/glm_utils.py +10 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/model_downloader.py +12 -0
- {docling-2.19.0 → docling-2.21.0}/pyproject.toml +9 -3
- {docling-2.19.0 → docling-2.21.0}/LICENSE +0 -0
- {docling-2.19.0 → docling-2.21.0}/README.md +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/html_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/md_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/xml/pubmed_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/chunking/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/cli/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/cli/tools.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/datamodel/document.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/datamodel/settings.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/document_converter.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/exceptions.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/layout_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/py.typed +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/export.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/profiling.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/utils.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/visualization.py +0 -0
{docling-2.19.0 → docling-2.21.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.
+Version: 2.21.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -24,10 +24,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: ocrmac
 Provides-Extra: rapidocr
 Provides-Extra: tesserocr
+Provides-Extra: vlm
 Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.
+Requires-Dist: docling-core[chunking] (>=2.18.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
 Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
@@ -53,6 +54,8 @@ Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
 Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
 Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
 Requires-Dist: tqdm (>=4.65.0,<5.0.0)
+Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
+Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
 Requires-Dist: typer (>=0.12.5,<0.13.0)
 Project-URL: Repository, https://github.com/DS4SD/docling
 Description-Content-Type: text/markdown
{docling-2.19.0 → docling-2.21.0}/docling/cli/main.py

@@ -226,6 +226,10 @@ def convert(
             help="Enable the picture classification enrichment model in the pipeline.",
         ),
     ] = False,
+    enrich_picture_description: Annotated[
+        bool,
+        typer.Option(..., help="Enable the picture description model in the pipeline."),
+    ] = False,
     artifacts_path: Annotated[
         Optional[Path],
         typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -382,6 +386,7 @@ def convert(
         do_table_structure=True,
         do_code_enrichment=enrich_code,
         do_formula_enrichment=enrich_formula,
+        do_picture_description=enrich_picture_description,
         do_picture_classification=enrich_picture_classes,
         document_timeout=document_timeout,
     )
{docling-2.19.0 → docling-2.21.0}/docling/cli/models.py

@@ -31,6 +31,7 @@ class _AvailableModels(str, Enum):
     TABLEFORMER = "tableformer"
     CODE_FORMULA = "code_formula"
     PICTURE_CLASSIFIER = "picture_classifier"
+    SMOLVLM = "smolvlm"
     EASYOCR = "easyocr"


@@ -81,6 +82,7 @@ def download(
         with_tableformer=_AvailableModels.TABLEFORMER in to_download,
         with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
         with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
+        with_smolvlm=_AvailableModels.SMOLVLM in to_download,
         with_easyocr=_AvailableModels.EASYOCR in to_download,
     )

{docling-2.19.0 → docling-2.21.0}/docling/datamodel/pipeline_options.py

@@ -2,9 +2,9 @@ import logging
 import os
 from enum import Enum
 from pathlib import Path
-from typing import Any, List, Literal, Optional, Union
+from typing import Annotated, Any, Dict, List, Literal, Optional, Union

-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict

 _log = logging.getLogger(__name__)
@@ -184,6 +184,51 @@ class OcrMacOptions(OcrOptions):
     )


+class PictureDescriptionBaseOptions(BaseModel):
+    kind: str
+    batch_size: int = 8
+    scale: float = 2
+
+    bitmap_area_threshold: float = (
+        0.2  # percentage of the area for a bitmap to processed with the models
+    )
+
+
+class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
+    kind: Literal["api"] = "api"
+
+    url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    timeout: float = 20
+
+    prompt: str = "Describe this image in a few sentences."
+    provenance: str = ""
+
+
+class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
+    kind: Literal["vlm"] = "vlm"
+
+    repo_id: str
+    prompt: str = "Describe this image in a few sentences."
+    # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
+    generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+smolvlm_picture_description = PictureDescriptionVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
+)
+# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
+granite_picture_description = PictureDescriptionVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
+    prompt="What is shown in this image?",
+)
+
+
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
@@ -223,6 +268,7 @@ class PdfPipelineOptions(PipelineOptions):
     do_code_enrichment: bool = False  # True: perform code OCR
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False  # True: classify pictures in documents
+    do_picture_description: bool = False  # True: run describe pictures in documents

     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[
@@ -232,6 +278,10 @@ class PdfPipelineOptions(PipelineOptions):
         OcrMacOptions,
         RapidOcrOptions,
     ] = Field(EasyOcrOptions(), discriminator="kind")
+    picture_description_options: Annotated[
+        Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
+        Field(discriminator="kind"),
+    ] = smolvlm_picture_description

     images_scale: float = 1.0
     generate_page_images: bool = False
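To illustrate how the new options classes fit together, here is a minimal sketch (not part of the diff) that switches on picture description for the standard PDF pipeline. It assumes the existing `DocumentConverter` / `PdfFormatOption` entry points from `docling.document_converter` and uses the `granite_picture_description` preset introduced above; by default `picture_description_options` falls back to `smolvlm_picture_description`.

```python
# Hedged sketch: enable the new picture-description enrichment via PdfPipelineOptions.
# The DocumentConverter/PdfFormatOption wiring is the pre-existing docling API, not part of this diff.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    granite_picture_description,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
# Defaults to smolvlm_picture_description; swap in the granite preset
# or a PictureDescriptionApiOptions instance as needed.
pipeline_options.picture_description_options = granite_picture_description

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("report.pdf")  # hypothetical input file
```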
{docling-2.19.0 → docling-2.21.0}/docling/models/base_model.py

@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, Generic, Iterable, Optional

-from docling_core.types.doc import BoundingBox, DoclingDocument, NodeItem
+from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
 from typing_extensions import TypeVar

 from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
@@ -64,7 +64,7 @@ class BaseItemAndImageEnrichmentModel(
         if not self.is_processable(doc=conv_res.document, element=element):
             return None

-        assert isinstance(element,
+        assert isinstance(element, DocItem)
         element_prov = element.prov[0]

         bbox = element_prov.bbox
{docling-2.19.0 → docling-2.21.0}/docling/models/ds_glm_model.py

@@ -4,7 +4,12 @@ from pathlib import Path
 from typing import List, Union

 from deepsearch_glm.andromeda_nlp import nlp_model
-from docling_core.types.doc import
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItemLabel,
+    DoclingDocument,
+)
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.legacy_doc.base import (
     Figure,
@@ -71,12 +76,15 @@ class GlmModel:
         )

         main_text: List[Union[Ref, BaseText]] = []
+        page_headers: List[Union[Ref, BaseText]] = []
+        page_footers: List[Union[Ref, BaseText]] = []
+
         tables: List[DsSchemaTable] = []
         figures: List[Figure] = []

         page_no_to_page = {p.page_no: p for p in conv_res.pages}

-        for element in conv_res.assembled.
+        for element in conv_res.assembled.body:
             # Convert bboxes to lower-left origin.
             target_bbox = DsBoundingBox(
                 element.cluster.bbox.to_bottom_left_origin(
@@ -238,6 +246,53 @@ class GlmModel:
                 )
             )

+        # We can throw in headers and footers at the end of the legacy doc
+        # since the reading-order will re-sort it later.
+        for element in conv_res.assembled.headers:
+            # Convert bboxes to lower-left origin.
+            target_bbox = DsBoundingBox(
+                element.cluster.bbox.to_bottom_left_origin(
+                    page_no_to_page[element.page_no].size.height
+                ).as_tuple()
+            )
+
+            if isinstance(element, TextElement):
+
+                tel = BaseText(
+                    text=element.text,
+                    obj_type=layout_label_to_ds_type.get(element.label),
+                    name=element.label,
+                    prov=[
+                        Prov(
+                            bbox=target_bbox,
+                            page=element.page_no + 1,
+                            span=[0, len(element.text)],
+                        )
+                    ],
+                )
+                if element.label == DocItemLabel.PAGE_HEADER:
+                    index = len(page_headers)
+                    ref_str = f"#/page-headers/{index}"
+                    main_text.append(
+                        Ref(
+                            name=element.label,
+                            obj_type=layout_label_to_ds_type.get(element.label),
+                            ref=ref_str,
+                        ),
+                    )
+                    page_headers.append(tel)
+                elif element.label == DocItemLabel.PAGE_FOOTER:
+                    index = len(page_footers)
+                    ref_str = f"#/page-footers/{index}"
+                    main_text.append(
+                        Ref(
+                            name=element.label,
+                            obj_type=layout_label_to_ds_type.get(element.label),
+                            ref=ref_str,
+                        ),
+                    )
+                    page_footers.append(tel)
+
         page_dimensions = [
             PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
             for p in conv_res.pages
@@ -252,6 +307,8 @@ class GlmModel:
             tables=tables,
             figures=figures,
             page_dimensions=page_dimensions,
+            page_headers=page_headers,
+            page_footers=page_footers,
         )

         return ds_doc
@@ -264,6 +321,7 @@ class GlmModel:
         glm_doc = self.model.apply_on_doc(ds_doc_dict)

         docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
+        1 == 1

         # DEBUG code:
         def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
{docling-2.19.0 → docling-2.21.0}/docling/models/easyocr_model.py

@@ -4,9 +4,7 @@ import zipfile
 from pathlib import Path
 from typing import Iterable, List, Optional

-import httpx
 import numpy
-import torch
 from docling_core.types.doc import BoundingBox, CoordOrigin

 from docling.datamodel.base_models import Cell, OcrCell, Page
docling-2.21.0/docling/models/picture_description_api_model.py (new file)

@@ -0,0 +1,101 @@
+import base64
+import io
+import logging
+from typing import Iterable, List, Optional
+
+import requests
+from PIL import Image
+from pydantic import BaseModel, ConfigDict
+
+from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+
+_log = logging.getLogger(__name__)
+
+
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class ResponseChoice(BaseModel):
+    index: int
+    message: ChatMessage
+    finish_reason: str
+
+
+class ResponseUsage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+class ApiResponse(BaseModel):
+    model_config = ConfigDict(
+        protected_namespaces=(),
+    )
+
+    id: str
+    model: Optional[str] = None  # returned by openai
+    choices: List[ResponseChoice]
+    created: int
+    usage: ResponseUsage
+
+
+class PictureDescriptionApiModel(PictureDescriptionBaseModel):
+    # elements_batch_size = 4
+
+    def __init__(self, enabled: bool, options: PictureDescriptionApiOptions):
+        super().__init__(enabled=enabled, options=options)
+        self.options: PictureDescriptionApiOptions
+
+        if self.enabled:
+            if options.url.host != "localhost":
+                raise NotImplementedError(
+                    "The options try to connect to remote APIs which are not yet allowed."
+                )
+
+    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        # Note: technically we could make a batch request here,
+        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
+        for image in images:
+            img_io = io.BytesIO()
+            image.save(img_io, "PNG")
+            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": self.options.prompt,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{image_base64}"
+                            },
+                        },
+                    ],
+                }
+            ]
+
+            payload = {
+                "messages": messages,
+                **self.options.params,
+            }
+
+            r = requests.post(
+                str(self.options.url),
+                headers=self.options.headers,
+                json=payload,
+                timeout=self.options.timeout,
+            )
+            if not r.ok:
+                _log.error(f"Error calling the API. Reponse was {r.text}")
+                r.raise_for_status()
+
+            api_resp = ApiResponse.model_validate_json(r.text)
+            generated_text = api_resp.choices[0].message.content.strip()
+            yield generated_text
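As a usage sketch, the API variant can be pointed at a local OpenAI-compatible `/v1/chat/completions` endpoint (the constructor above currently rejects hosts other than `localhost`). Anything placed in `params` is merged verbatim into the request payload; the `model` key shown below is an assumption that depends on the serving backend, not something defined by this diff.

```python
# Hedged sketch: configure the API-based picture description against a locally served VLM.
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
)

api_options = PictureDescriptionApiOptions(
    url="http://localhost:8000/v1/chat/completions",  # must be localhost for now
    params={"model": "local-vlm"},  # assumed key; whatever the serving backend expects
    prompt="Describe this figure in two sentences.",
    timeout=60,
)

pipeline_options = PdfPipelineOptions(
    do_picture_description=True,
    picture_description_options=api_options,
)
```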
docling-2.21.0/docling/models/picture_description_base_model.py (new file)

@@ -0,0 +1,64 @@
+import logging
+from pathlib import Path
+from typing import Any, Iterable, List, Optional, Union
+
+from docling_core.types.doc import (
+    DoclingDocument,
+    NodeItem,
+    PictureClassificationClass,
+    PictureItem,
+)
+from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
+    PictureDescriptionData,
+)
+from PIL import Image
+
+from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
+from docling.models.base_model import (
+    BaseItemAndImageEnrichmentModel,
+    ItemAndImageEnrichmentElement,
+)
+
+
+class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
+    images_scale: float = 2.0
+
+    def __init__(
+        self,
+        enabled: bool,
+        options: PictureDescriptionBaseOptions,
+    ):
+        self.enabled = enabled
+        self.options = options
+        self.provenance = "not-implemented"
+
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        return self.enabled and isinstance(element, PictureItem)
+
+    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        raise NotImplementedError
+
+    def __call__(
+        self,
+        doc: DoclingDocument,
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
+    ) -> Iterable[NodeItem]:
+        if not self.enabled:
+            for element in element_batch:
+                yield element.item
+            return
+
+        images: List[Image.Image] = []
+        elements: List[PictureItem] = []
+        for el in element_batch:
+            assert isinstance(el.item, PictureItem)
+            elements.append(el.item)
+            images.append(el.image)
+
+        outputs = self._annotate_images(images)
+
+        for item, output in zip(elements, outputs):
+            item.annotations.append(
+                PictureDescriptionData(text=output, provenance=self.provenance)
+            )
+            yield item
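The base class reduces a custom backend to a single hook: `_annotate_images` receives the cropped `PIL` images and must yield one description per image, in order, while `__call__` filters to `PictureItem` elements and attaches each output as a `PictureDescriptionData` annotation. A minimal custom subclass might look like the following sketch; the constant-caption backend is purely illustrative and not part of the package.

```python
# Hedged sketch: a trivial custom backend built on PictureDescriptionBaseModel.
# Only _annotate_images is implemented; filtering and annotation handling are inherited.
from typing import Iterable

from PIL import Image

from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
from docling.models.picture_description_base_model import PictureDescriptionBaseModel


class EchoPictureDescriptionModel(PictureDescriptionBaseModel):
    def __init__(self, enabled: bool, options: PictureDescriptionBaseOptions):
        super().__init__(enabled=enabled, options=options)
        self.provenance = "echo-model"  # recorded on every PictureDescriptionData

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        # Yield exactly one description per input image, in the same order.
        for image in images:
            yield f"Picture of size {image.width}x{image.height} px."
```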
docling-2.21.0/docling/models/picture_description_vlm_model.py (new file)

@@ -0,0 +1,109 @@
+from pathlib import Path
+from typing import Iterable, Optional, Union
+
+from PIL import Image
+
+from docling.datamodel.pipeline_options import (
+    AcceleratorOptions,
+    PictureDescriptionVlmOptions,
+)
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.utils.accelerator_utils import decide_device
+
+
+class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Union[Path, str]],
+        options: PictureDescriptionVlmOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        super().__init__(enabled=enabled, options=options)
+        self.options: PictureDescriptionVlmOptions
+
+        if self.enabled:
+
+            if artifacts_path is None:
+                artifacts_path = self.download_models(repo_id=self.options.repo_id)
+            else:
+                artifacts_path = Path(artifacts_path) / self.options.repo_cache_folder
+
+            self.device = decide_device(accelerator_options.device)
+
+            try:
+                import torch
+                from transformers import AutoModelForVision2Seq, AutoProcessor
+            except ImportError:
+                raise ImportError(
+                    "transformers >=4.46 is not installed. Please install Docling with the required extras `pip install docling[vlm]`."
+                )
+
+            # Initialize processor and model
+            self.processor = AutoProcessor.from_pretrained(self.options.repo_id)
+            self.model = AutoModelForVision2Seq.from_pretrained(
+                self.options.repo_id,
+                torch_dtype=torch.bfloat16,
+                _attn_implementation=(
+                    "flash_attention_2" if self.device.startswith("cuda") else "eager"
+                ),
+            ).to(self.device)
+
+            self.provenance = f"{self.options.repo_id}"
+
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id=repo_id,
+            force_download=force,
+            local_dir=local_dir,
+        )
+
+        return Path(download_path)
+
+    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        from transformers import GenerationConfig
+
+        # Create input messages
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": self.options.prompt},
+                ],
+            },
+        ]
+
+        # TODO: do batch generation
+
+        for image in images:
+            # Prepare inputs
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=True
+            )
+            inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
+            inputs = inputs.to(self.device)
+
+            # Generate outputs
+            generated_ids = self.model.generate(
+                **inputs,
+                generation_config=GenerationConfig(**self.options.generation_config),
+            )
+            generated_texts = self.processor.batch_decode(
+                generated_ids[:, inputs["input_ids"].shape[1] :],
+                skip_special_tokens=True,
+            )
+
+            yield generated_texts[0].strip()
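Once the enrichment has run, the generated text ends up as `PictureDescriptionData` entries on each `PictureItem`. A short sketch for reading them back, assuming `result` is the `ConversionResult` from the pipeline-options example earlier in this diff:

```python
# Hedged sketch: read back the picture descriptions produced by the enrichment.
# Assumes a converter configured with do_picture_description=True (see the earlier sketch).
from docling_core.types.doc.document import PictureDescriptionData

doc = result.document
for picture in doc.pictures:
    for annotation in picture.annotations:
        if isinstance(annotation, PictureDescriptionData):
            print(f"{picture.self_ref} ({annotation.provenance}): {annotation.text}")
```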
{docling-2.19.0 → docling-2.21.0}/docling/pipeline/standard_pdf_pipeline.py

@@ -14,6 +14,8 @@ from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
     OcrMacOptions,
     PdfPipelineOptions,
+    PictureDescriptionApiOptions,
+    PictureDescriptionVlmOptions,
     RapidOcrOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
@@ -34,6 +36,9 @@ from docling.models.page_preprocessing_model import (
     PagePreprocessingModel,
     PagePreprocessingOptions,
 )
+from docling.models.picture_description_api_model import PictureDescriptionApiModel
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.rapid_ocr_model import RapidOcrModel
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
@@ -95,8 +100,17 @@ class StandardPdfPipeline(PaginatedPipeline):
             PageAssembleModel(options=PageAssembleOptions()),
         ]

+        # Picture description model
+        if (
+            picture_description_model := self.get_picture_description_model(
+                artifacts_path=artifacts_path
+            )
+        ) is None:
+            raise RuntimeError(
+                f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
+            )
+
         self.enrichment_pipe = [
-            # Other models working on `NodeItem` elements in the DoclingDocument
             # Code Formula Enrichment Model
             CodeFormulaModel(
                 enabled=pipeline_options.do_code_enrichment
@@ -115,11 +129,14 @@ class StandardPdfPipeline(PaginatedPipeline):
                 options=DocumentPictureClassifierOptions(),
                 accelerator_options=pipeline_options.accelerator_options,
             ),
+            # Document Picture description
+            picture_description_model,
         ]

         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
+            or self.pipeline_options.do_picture_description
         ):
             self.keep_backend = True

@@ -175,6 +192,29 @@ class StandardPdfPipeline(PaginatedPipeline):
             )
             return None

+    def get_picture_description_model(
+        self, artifacts_path: Optional[Path] = None
+    ) -> Optional[PictureDescriptionBaseModel]:
+        if isinstance(
+            self.pipeline_options.picture_description_options,
+            PictureDescriptionApiOptions,
+        ):
+            return PictureDescriptionApiModel(
+                enabled=self.pipeline_options.do_picture_description,
+                options=self.pipeline_options.picture_description_options,
+            )
+        elif isinstance(
+            self.pipeline_options.picture_description_options,
+            PictureDescriptionVlmOptions,
+        ):
+            return PictureDescriptionVlmModel(
+                enabled=self.pipeline_options.do_picture_description,
+                artifacts_path=artifacts_path,
+                options=self.pipeline_options.picture_description_options,
+                accelerator_options=self.pipeline_options.accelerator_options,
+            )
+        return None
+
     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
         with TimeRecorder(conv_res, "page_init"):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
{docling-2.19.0 → docling-2.21.0}/docling/utils/glm_utils.py

@@ -15,6 +15,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
+from docling_core.types.doc.document import ContentLayer


 def resolve_item(paths, obj):
@@ -311,6 +312,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
             current_list = None

             doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
+        elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
+            current_list = None
+
+            doc.add_text(
+                label=DocItemLabel(name_label),
+                text=text,
+                prov=prov,
+                content_layer=ContentLayer.FURNITURE,
+            )
         else:
             current_list = None

{docling-2.19.0 → docling-2.21.0}/docling/utils/model_downloader.py

@@ -2,11 +2,13 @@ import logging
 from pathlib import Path
 from typing import Optional

+from docling.datamodel.pipeline_options import smolvlm_picture_description
 from docling.datamodel.settings import settings
 from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
+from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.table_structure_model import TableStructureModel

 _log = logging.getLogger(__name__)
@@ -21,6 +23,7 @@ def download_models(
     with_tableformer: bool = True,
     with_code_formula: bool = True,
     with_picture_classifier: bool = True,
+    with_smolvlm: bool = True,
     with_easyocr: bool = True,
 ):
     if output_dir is None:
@@ -61,6 +64,15 @@ def download_models(
             progress=progress,
         )

+    if with_smolvlm:
+        _log.info(f"Downloading SmolVlm model...")
+        PictureDescriptionVlmModel.download_models(
+            repo_id=smolvlm_picture_description.repo_id,
+            local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
+            force=force,
+            progress=progress,
+        )
+
     if with_easyocr:
         _log.info(f"Downloading easyocr models...")
         EasyOcrModel.download_models(
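For offline or air-gapped setups, the updated downloader can prefetch the SmolVLM weights together with the other models; the chosen directory can then be handed to the pipeline as `artifacts_path`. A minimal sketch, assuming only the parameters visible in this diff:

```python
# Hedged sketch: prefetch model weights, including the new SmolVLM download,
# into a local artifacts directory for later use as artifacts_path.
from pathlib import Path

from docling.utils.model_downloader import download_models

artifacts_dir = Path("./docling-artifacts")  # hypothetical target directory
download_models(
    output_dir=artifacts_dir,
    with_smolvlm=True,  # new in this release
)
```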
{docling-2.19.0 → docling-2.21.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.
+version = "2.21.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -26,7 +26,7 @@ packages = [{include = "docling"}]
 ######################
 python = "^3.9"
 pydantic = "^2.0.0"
-docling-core = {extras = ["chunking"], version = "^2.
+docling-core = {extras = ["chunking"], version = "^2.18.0"}
 docling-ibm-models = "^3.3.0"
 deepsearch-glm = "^1.0.0"
 docling-parse = "^3.3.0"
@@ -59,6 +59,10 @@ onnxruntime = [
     { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
     { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
 ]
+transformers = [
+    {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
+    {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }
+]
 pillow = "^10.0.0"
 tqdm = "^4.65.0"

@@ -121,6 +125,7 @@ torchvision = [
 [tool.poetry.extras]
 tesserocr = ["tesserocr"]
 ocrmac = ["ocrmac"]
+vlm = ["transformers"]
 rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]

 [tool.poetry.scripts]
@@ -162,7 +167,8 @@ module = [
     "deepsearch_glm.*",
     "lxml.*",
     "bs4.*",
-    "huggingface_hub.*"
+    "huggingface_hub.*",
+    "transformers.*",
 ]
 ignore_missing_imports = true

|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|