docling 2.45.0__py3-none-any.whl → 2.47.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/cli/main.py CHANGED
```diff
@@ -60,10 +60,12 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
+    GOT2_TRANSFORMERS,
     GRANITE_VISION_OLLAMA,
     GRANITE_VISION_TRANSFORMERS,
     SMOLDOCLING_MLX,
     SMOLDOCLING_TRANSFORMERS,
+    SMOLDOCLING_VLLM,
     VlmModelType,
 )
 from docling.document_converter import (
@@ -477,6 +479,13 @@ def convert( # noqa: C901
             "--logo", callback=logo_callback, is_eager=True, help="Docling logo"
         ),
     ] = None,
+    page_batch_size: Annotated[
+        int,
+        typer.Option(
+            "--page-batch-size",
+            help=f"Number of pages processed in one batch. Default: {settings.perf.page_batch_size}",
+        ),
+    ] = settings.perf.page_batch_size,
 ):
     log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
 
@@ -491,6 +500,7 @@ def convert( # noqa: C901
     settings.debug.visualize_layout = debug_visualize_layout
     settings.debug.visualize_tables = debug_visualize_tables
     settings.debug.visualize_ocr = debug_visualize_ocr
+    settings.perf.page_batch_size = page_batch_size
 
     if from_formats is None:
         from_formats = list(InputFormat)
@@ -631,6 +641,8 @@ def convert( # noqa: C901
             pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
         elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
             pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
+        elif vlm_model == VlmModelType.GOT_OCR_2:
+            pipeline_options.vlm_options = GOT2_TRANSFORMERS
         elif vlm_model == VlmModelType.SMOLDOCLING:
             pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
             if sys.platform == "darwin":
@@ -643,6 +655,8 @@ def convert( # noqa: C901
                         "To run SmolDocling faster, please install mlx-vlm:\n"
                         "pip install mlx-vlm"
                     )
+        elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
+            pipeline_options.vlm_options = SMOLDOCLING_VLLM
 
         pdf_format_option = PdfFormatOption(
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
```
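
The `main.py` changes add a `--page-batch-size` option that is written into `settings.perf.page_batch_size`, and wire the new `got_ocr_2` and `smoldocling_vllm` choices into the VLM pipeline selection. A minimal sketch of the equivalent Python API usage, assuming the `VlmPipelineOptions` / `VlmPipeline` import paths of current docling releases and a placeholder `sample.pdf`:

```python
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Equivalent of the new --page-batch-size CLI option: pages handled per batch.
settings.perf.page_batch_size = 8

# Use the newly exposed GOT-OCR 2.0 preset for the VLM pipeline.
pipeline_options = VlmPipelineOptions(vlm_options=vlm_model_specs.GOT2_TRANSFORMERS)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
result = converter.convert("sample.pdf")  # placeholder input file
print(result.document.export_to_markdown())
```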
docling/cli/models.py CHANGED
```diff
@@ -9,6 +9,7 @@ from rich.console import Console
 from rich.logging import RichHandler
 
 from docling.datamodel.settings import settings
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.model_downloader import download_models
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -128,6 +129,61 @@ def download(
         )
 
 
+@app.command("download-hf-repo")
+def download_hf_repo(
+    models: Annotated[
+        list[str],
+        typer.Argument(
+            help="Specific models to download from HuggingFace identified by their repo id. For example: ds4sd/docling-models .",
+        ),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            ...,
+            "-o",
+            "--output-dir",
+            help="The directory where to download the models.",
+        ),
+    ] = (settings.cache_dir / "models"),
+    force: Annotated[
+        bool, typer.Option(..., help="If true, the download will be forced.")
+    ] = False,
+    quiet: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "-q",
+            "--quiet",
+            help="No extra output is generated, the CLI prints only the directory with the cached models.",
+        ),
+    ] = False,
+):
+    if not quiet:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[blue]%(message)s[/blue]",
+            datefmt="[%X]",
+            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
+        )
+
+    for item in models:
+        typer.secho(f"\nDownloading {item} model from HuggingFace...")
+        download_hf_model(
+            repo_id=item,
+            # would be better to reuse "repo_cache_folder" property: https://github.com/docling-project/docling/blob/main/docling/datamodel/pipeline_options_vlm_model.py#L76
+            # but creating options objects seams like an overkill
+            local_dir=output_dir / item.replace("/", "--"),
+            force=force,
+            progress=(not quiet),
+        )
+
+    if quiet:
+        typer.echo(output_dir)
+    else:
+        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
+
+
 click_app = typer.main.get_command(app)
 
 if __name__ == "__main__":
```
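
The new `download-hf-repo` subcommand is a thin wrapper around `download_hf_model` for fetching arbitrary Hugging Face repos into the local model cache. A small sketch of the same call made directly from Python, mirroring the arguments and folder naming shown above (the repo id is the example from the CLI help text):

```python
from docling.datamodel.settings import settings
from docling.models.utils.hf_model_download import download_hf_model

repo_id = "ds4sd/docling-models"            # example repo id from the help text
output_dir = settings.cache_dir / "models"  # same default as -o/--output-dir

download_hf_model(
    repo_id=repo_id,
    # same cache layout as the CLI: one folder per repo, "/" replaced by "--"
    local_dir=output_dir / repo_id.replace("/", "--"),
    force=False,    # set True to re-download even if already present
    progress=True,  # equivalent to not passing -q/--quiet
)
```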
```diff
@@ -1,7 +1,7 @@
 import math
 from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 import numpy as np
 from docling_core.types.doc import (
@@ -282,6 +282,9 @@ class LayoutOptions(BaseModel):
     keep_empty_clusters: bool = (
         False  # Whether to keep clusters that contain no text cells
     )
+    skip_cell_assignment: bool = (
+        False  # Skip cell-to-cluster assignment for VLM-only processing
+    )
     model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
 
 
@@ -323,9 +326,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         ),
     )
 
-    generate_parsed_pages: Literal[True] = (
-        True  # Always True since parsed_page is now mandatory
-    )
+    generate_parsed_pages: bool = False
 
 
 class ProcessingPipeline(str, Enum):
```
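
Two behavioral notes on the hunks above (apparently `docling/datamodel/pipeline_options.py`): `generate_parsed_pages` is no longer pinned to `True` and now defaults to `False`, so code that relied on parsed pages being populated has to opt in explicitly, and `LayoutOptions` gains a `skip_cell_assignment` switch for VLM-only processing. A minimal sketch, assuming `PdfPipelineOptions` exposes a `layout_options` field as in recent releases:

```python
from docling.datamodel.pipeline_options import LayoutOptions, PdfPipelineOptions

# Opt back in to parsed pages now that the default flipped from True to False.
pipeline_options = PdfPipelineOptions(generate_parsed_pages=True)

# New layout switch: skip cell-to-cluster assignment for VLM-only processing.
pipeline_options.layout_options = LayoutOptions(skip_cell_assignment=True)
```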
```diff
@@ -26,11 +26,14 @@ class ResponseFormat(str, Enum):
     DOCTAGS = "doctags"
     MARKDOWN = "markdown"
     HTML = "html"
+    OTSL = "otsl"
+    PLAINTEXT = "plaintext"
 
 
 class InferenceFramework(str, Enum):
     MLX = "mlx"
     TRANSFORMERS = "transformers"
+    VLLM = "vllm"
 
 
 class TransformersModelType(str, Enum):
@@ -43,6 +46,7 @@ class TransformersModelType(str, Enum):
 class TransformersPromptStyle(str, Enum):
     CHAT = "chat"
     RAW = "raw"
+    NONE = "none"
 
 
 class InlineVlmOptions(BaseVlmOptions):
@@ -68,6 +72,7 @@ class InlineVlmOptions(BaseVlmOptions):
 
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
+    extra_processor_kwargs: Dict[str, Any] = {}
 
     use_kv_cache: bool = True
     max_new_tokens: int = 4096
```
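
These hunks (apparently `docling/datamodel/pipeline_options_vlm_model.py`) add `OTSL` and `PLAINTEXT` response formats, a `VLLM` inference framework, a `NONE` prompt style, and an `extra_processor_kwargs` passthrough on `InlineVlmOptions`. A hedged sketch of a custom model spec exercising the new options; the repo id is a placeholder and the `AcceleratorDevice` import path is assumed from the rest of the codebase:

```python
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersModelType,
    TransformersPromptStyle,
)

# Hypothetical spec: plain-text output, no chat templating, and extra kwargs
# forwarded to the Hugging Face processor via the new extra_processor_kwargs field.
MY_PLAINTEXT_OCR = InlineVlmOptions(
    repo_id="my-org/my-ocr-model",  # placeholder repo id
    prompt="",
    response_format=ResponseFormat.PLAINTEXT,
    inference_framework=InferenceFramework.TRANSFORMERS,  # InferenceFramework.VLLM is now also available
    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
    transformers_prompt_style=TransformersPromptStyle.NONE,
    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
    scale=2.0,
    temperature=0.0,
    extra_processor_kwargs={"format": True},
)
```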
```diff
@@ -12,6 +12,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     ResponseFormat,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 
 _log = logging.getLogger(__name__)
@@ -26,6 +27,7 @@ SMOLDOCLING_MLX = InlineVlmOptions(
     supported_devices=[AcceleratorDevice.MPS],
     scale=2.0,
     temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
 )
 
 SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
@@ -33,16 +35,74 @@ SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
     inference_framework=InferenceFramework.TRANSFORMERS,
-    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
     supported_devices=[
         AcceleratorDevice.CPU,
         AcceleratorDevice.CUDA,
+    ],
+    torch_dtype="bfloat16",
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
+)
+
+SMOLDOCLING_VLLM = InlineVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
+)
+
+# SmolVLM-256M-Instruct
+SMOLVLM256_TRANSFORMERS = InlineVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
+    prompt="Transcribe this image to plain text.",
+    response_format=ResponseFormat.PLAINTEXT,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    torch_dtype="bfloat16",
+    scale=2.0,
+    temperature=0.0,
+)
+
+# SmolVLM2-2.2b-Instruct
+SMOLVLM256_MLX = InlineVlmOptions(
+    repo_id="moot20/SmolVLM-256M-Instruct-MLX",
+    prompt="Extract the text.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
         AcceleratorDevice.MPS,
     ],
     scale=2.0,
     temperature=0.0,
 )
 
+SMOLVLM256_VLLM = InlineVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
+    prompt="Transcribe this image to plain text.",
+    response_format=ResponseFormat.PLAINTEXT,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
+
 # GraniteVision
 GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
     repo_id="ibm-granite/granite-vision-3.2-2b",
@@ -59,6 +119,18 @@ GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
     temperature=0.0,
 )
 
+GRANITE_VISION_VLLM = InlineVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.2-2b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
 GRANITE_VISION_OLLAMA = ApiVlmOptions(
     url=AnyUrl("http://localhost:11434/v1/chat/completions"),
     params={"model": "granite3.2-vision:2b"},
@@ -116,6 +188,26 @@ QWEN25_VL_3B_MLX = InlineVlmOptions(
     temperature=0.0,
 )
 
+# GoT 2.0
+GOT2_TRANSFORMERS = InlineVlmOptions(
+    repo_id="stepfun-ai/GOT-OCR-2.0-hf",
+    prompt="",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_prompt_style=TransformersPromptStyle.NONE,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["<|im_end|>"],
+    extra_processor_kwargs={"format": True},
+)
+
+
 # Gemma-3
 GEMMA3_12B_MLX = InlineVlmOptions(
     repo_id="mlx-community/gemma-3-12b-it-bf16",
@@ -137,8 +229,29 @@ GEMMA3_27B_MLX = InlineVlmOptions(
     temperature=0.0,
 )
 
+# Dolphin
+
+DOLPHIN_TRANSFORMERS = InlineVlmOptions(
+    repo_id="ByteDance/Dolphin",
+    prompt="<s>Read text in the image. <Answer/>",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    transformers_prompt_style=TransformersPromptStyle.RAW,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
 
 class VlmModelType(str, Enum):
     SMOLDOCLING = "smoldocling"
+    SMOLDOCLING_VLLM = "smoldocling_vllm"
     GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_VLLM = "granite_vision_vllm"
     GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+    GOT_OCR_2 = "got_ocr_2"
```
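
The spec additions above (apparently `docling/datamodel/vlm_model_specs.py`) register vLLM variants of SmolDocling, SmolVLM-256M and Granite Vision, plus GOT-OCR 2.0 and Dolphin presets, and expose three new `VlmModelType` values. A brief sketch of picking one of the vLLM presets for the VLM pipeline; both declare CUDA-only support, so a GPU and the `vllm` package are assumed:

```python
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.vlm_model_specs import GRANITE_VISION_VLLM, SMOLDOCLING_VLLM

# DocTags output from the vLLM-backed SmolDocling preset.
pipeline_options = VlmPipelineOptions(vlm_options=SMOLDOCLING_VLLM)

# Alternatively, Markdown output from Granite Vision served through vLLM:
# pipeline_options = VlmPipelineOptions(vlm_options=GRANITE_VISION_VLLM)
```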
```diff
@@ -1,13 +1,24 @@
+import logging
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
-from typing import Generic, Optional, Protocol, Type
+from typing import Any, Generic, Optional, Protocol, Type, Union
 
+import numpy as np
 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
+from PIL.Image import Image
 from typing_extensions import TypeVar
 
-from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
+from docling.datamodel.base_models import (
+    ItemAndImageEnrichmentElement,
+    Page,
+    VlmPrediction,
+)
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import BaseOptions
+from docling.datamodel.pipeline_options_vlm_model import (
+    InlineVlmOptions,
+    TransformersPromptStyle,
+)
 from docling.datamodel.settings import settings
 
 
@@ -26,6 +37,88 @@ class BasePageModel(ABC):
         pass
 
 
+class BaseVlmModel(ABC):
+    """Base class for Vision-Language Models that adds image processing capability."""
+
+    @abstractmethod
+    def process_images(
+        self,
+        image_batch: Iterable[Union[Image, np.ndarray]],
+        prompt: Union[str, list[str]],
+    ) -> Iterable[VlmPrediction]:
+        """Process raw images without page metadata.
+
+        Args:
+            image_batch: Iterable of PIL Images or numpy arrays
+            prompt: Either:
+                - str: Single prompt used for all images
+                - list[str]: List of prompts (one per image, must match image count)
+
+        Raises:
+            ValueError: If prompt list length doesn't match image count.
+        """
+
+
+class BaseVlmPageModel(BasePageModel, BaseVlmModel):
+    """Base implementation for VLM models that inherit from BasePageModel.
+
+    Provides a default __call__ implementation that extracts images from pages,
+    processes them using process_images, and attaches results back to pages.
+    """
+
+    # Type annotations for attributes that subclasses must initialize
+    vlm_options: InlineVlmOptions
+    processor: Any
+
+    @abstractmethod
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        """Extract images from pages, process them, and attach results back."""
+
+    def formulate_prompt(self, user_prompt: str) -> str:
+        """Formulate a prompt for the VLM."""
+        _log = logging.getLogger(__name__)
+
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+            _log.debug("Using specialized prompt for Phi-4")
+            # Note: This might need adjustment for VLLM vs transformers
+            user_prompt_prefix = "<|user|>"
+            assistant_prompt = "<|assistant|>"
+            prompt_suffix = "<|end|>"
+
+            prompt = f"{user_prompt_prefix}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
+            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
+
+            return prompt
+
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=True
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Unknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
+        )
+
+
 EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
 
 
```
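
The new `BaseVlmModel` / `BaseVlmPageModel` abstractions separate raw image inference (`process_images`) from page-level orchestration (`__call__`) and centralize prompt construction in `formulate_prompt`. A rough sketch of what a subclass might look like, assuming the module lives at `docling.models.base_model` and that `VlmPrediction`, `Page.get_image`, and `page.predictions.vlm_response` behave as in the existing page models; the echo-style "inference" is a placeholder, not a real backend:

```python
from collections.abc import Iterable
from typing import Union

import numpy as np
from PIL.Image import Image

from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.models.base_model import BaseVlmPageModel  # assumed module path


class EchoVlmModel(BaseVlmPageModel):
    """Toy subclass: emits a fixed string per image instead of running a model."""

    def __init__(self, vlm_options):
        self.vlm_options = vlm_options
        self.processor = None  # a real subclass would hold the HF processor here

    def process_images(
        self,
        image_batch: Iterable[Union[Image, np.ndarray]],
        prompt: Union[str, list[str]],
    ) -> Iterable[VlmPrediction]:
        images = list(image_batch)
        # A single string prompt applies to every image; a list must match the batch.
        prompts = [prompt] * len(images) if isinstance(prompt, str) else list(prompt)
        if len(prompts) != len(images):
            raise ValueError("prompt list length must match image count")
        for p in prompts:
            yield VlmPrediction(text=f"[echo] {p}")

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            image = page.get_image(scale=self.vlm_options.scale)
            if image is not None:
                prediction = next(
                    iter(self.process_images([image], self.vlm_options.prompt))
                )
                page.predictions.vlm_response = prediction
            yield page
```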