docling 2.35.0__py3-none-any.whl → 2.36.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. docling/backend/xml/jats_backend.py +0 -0
  2. docling/cli/main.py +12 -15
  3. docling/datamodel/accelerator_options.py +68 -0
  4. docling/datamodel/base_models.py +10 -8
  5. docling/datamodel/pipeline_options.py +29 -161
  6. docling/datamodel/pipeline_options_vlm_model.py +81 -0
  7. docling/datamodel/vlm_model_specs.py +144 -0
  8. docling/document_converter.py +5 -0
  9. docling/models/api_vlm_model.py +1 -1
  10. docling/models/base_ocr_model.py +2 -1
  11. docling/models/code_formula_model.py +6 -11
  12. docling/models/document_picture_classifier.py +6 -11
  13. docling/models/easyocr_model.py +1 -2
  14. docling/models/layout_model.py +6 -11
  15. docling/models/ocr_mac_model.py +1 -1
  16. docling/models/picture_description_api_model.py +1 -1
  17. docling/models/picture_description_base_model.py +1 -1
  18. docling/models/picture_description_vlm_model.py +7 -22
  19. docling/models/rapid_ocr_model.py +1 -2
  20. docling/models/table_structure_model.py +6 -12
  21. docling/models/tesseract_ocr_cli_model.py +1 -1
  22. docling/models/tesseract_ocr_model.py +1 -1
  23. docling/models/utils/__init__.py +0 -0
  24. docling/models/utils/hf_model_download.py +40 -0
  25. docling/models/vlm_models_inline/__init__.py +0 -0
  26. docling/models/vlm_models_inline/hf_transformers_model.py +194 -0
  27. docling/models/{hf_mlx_model.py → vlm_models_inline/mlx_model.py} +56 -44
  28. docling/pipeline/vlm_pipeline.py +228 -61
  29. docling/utils/accelerator_utils.py +17 -2
  30. docling/utils/model_downloader.py +13 -12
  31. {docling-2.35.0.dist-info → docling-2.36.0.dist-info}/METADATA +54 -55
  32. {docling-2.35.0.dist-info → docling-2.36.0.dist-info}/RECORD +46 -39
  33. {docling-2.35.0.dist-info → docling-2.36.0.dist-info}/WHEEL +2 -1
  34. docling-2.36.0.dist-info/entry_points.txt +6 -0
  35. docling-2.36.0.dist-info/top_level.txt +1 -0
  36. docling/models/hf_vlm_model.py +0 -182
  37. docling-2.35.0.dist-info/entry_points.txt +0 -7
  38. {docling-2.35.0.dist-info → docling-2.36.0.dist-info/licenses}/LICENSE +0 -0
docling/cli/main.py CHANGED
@@ -28,6 +28,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -36,8 +37,6 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
-    AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
@@ -45,14 +44,16 @@ from docling.datamodel.pipeline_options import (
     PdfPipeline,
     PdfPipelineOptions,
     TableFormerMode,
-    VlmModelType,
     VlmPipelineOptions,
-    granite_vision_vlm_conversion_options,
-    granite_vision_vlm_ollama_conversion_options,
-    smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
+from docling.datamodel.vlm_model_specs import (
+    GRANITE_VISION_OLLAMA,
+    GRANITE_VISION_TRANSFORMERS,
+    SMOLDOCLING_MLX,
+    SMOLDOCLING_TRANSFORMERS,
+    VlmModelType,
+)
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -579,20 +580,16 @@ def convert(  # noqa: C901
             )
 
             if vlm_model == VlmModelType.GRANITE_VISION:
-                pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
             elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-                pipeline_options.vlm_options = (
-                    granite_vision_vlm_ollama_conversion_options
-                )
+                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
             elif vlm_model == VlmModelType.SMOLDOCLING:
-                pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                 if sys.platform == "darwin":
                     try:
                         import mlx_vlm
 
-                        pipeline_options.vlm_options = (
-                            smoldocling_vlm_mlx_conversion_options
-                        )
+                        pipeline_options.vlm_options = SMOLDOCLING_MLX
                     except ImportError:
                         _log.warning(
                             "To run SmolDocling faster, please install mlx-vlm:\n"
docling/datamodel/accelerator_options.py ADDED
@@ -0,0 +1,68 @@
+import logging
+import os
+import re
+from enum import Enum
+from typing import Any, Union
+
+from pydantic import field_validator, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+_log = logging.getLogger(__name__)
+
+
+class AcceleratorDevice(str, Enum):
+    """Devices to run model inference"""
+
+    AUTO = "auto"
+    CPU = "cpu"
+    CUDA = "cuda"
+    MPS = "mps"
+
+
+class AcceleratorOptions(BaseSettings):
+    model_config = SettingsConfigDict(
+        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
+    )
+
+    num_threads: int = 4
+    device: Union[str, AcceleratorDevice] = "auto"
+    cuda_use_flash_attention2: bool = False
+
+    @field_validator("device")
+    def validate_device(cls, value):
+        # "auto", "cpu", "cuda", "mps", or "cuda:N"
+        if value in {d.value for d in AcceleratorDevice} or re.match(
+            r"^cuda(:\d+)?$", value
+        ):
+            return value
+        raise ValueError(
+            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
+        )
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_alternative_envvars(cls, data: Any) -> Any:
+        r"""
+        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
+        The alternative envvar is used only if it is valid and the regular envvar is not set.
+
+        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
+        the same functionality. In case the alias envvar is set and the user tries to override the
+        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
+        as an extra input instead of simply overwriting the envvar value for that parameter.
+        """
+        if isinstance(data, dict):
+            input_num_threads = data.get("num_threads")
+            # Check if to set the num_threads from the alternative envvar
+            if input_num_threads is None:
+                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
+                omp_num_threads = os.getenv("OMP_NUM_THREADS")
+                if docling_num_threads is None and omp_num_threads is not None:
+                    try:
+                        data["num_threads"] = int(omp_num_threads)
+                    except ValueError:
+                        _log.error(
+                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
+                            omp_num_threads,
+                        )
+        return data
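The validators above give the relocated AcceleratorOptions two behaviors worth noting: OMP_NUM_THREADS acts as a fallback when neither num_threads nor DOCLING_NUM_THREADS is provided, and the device field accepts indexed CUDA devices. A minimal sketch, with illustrative values:

# Sketch: observable behavior of the new AcceleratorOptions (env values illustrative).
import os

from docling.datamodel.accelerator_options import AcceleratorOptions

# The OMP_NUM_THREADS fallback only fires when the regular sources are unset.
os.environ.pop("DOCLING_NUM_THREADS", None)
os.environ["OMP_NUM_THREADS"] = "8"
opts = AcceleratorOptions()
assert opts.num_threads == 8

# The device validator also accepts an indexed CUDA device.
opts = AcceleratorOptions(device="cuda:1")

# Anything else fails validation:
# AcceleratorOptions(device="tpu")  # raises pydantic.ValidationError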
docling/datamodel/base_models.py CHANGED
@@ -13,11 +13,11 @@ from docling_core.types.doc import (
     TableCell,
 )
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-
-# DO NOT REMOVE; explicitly exposed from this location
 from docling_core.types.io import (
     DocumentStream,
 )
+
+# DO NOT REMOVE; explicitly exposed from this location
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict, Field, computed_field
@@ -131,12 +131,6 @@ class ErrorItem(BaseModel):
     error_message: str
 
 
-# class Cell(BaseModel):
-#     id: int
-#     text: str
-#     bbox: BoundingBox
-
-
 class Cluster(BaseModel):
     id: int
     label: DocItemLabel
@@ -158,8 +152,16 @@ class LayoutPrediction(BaseModel):
     clusters: List[Cluster] = []
 
 
+class VlmPredictionToken(BaseModel):
+    text: str = ""
+    token: int = -1
+    logprob: float = -1
+
+
 class VlmPrediction(BaseModel):
     text: str = ""
+    generated_tokens: list[VlmPredictionToken] = []
+    generation_time: float = -1
 
 
 class ContainerElement(
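With this change, VlmPrediction carries per-token metadata and a generation_time field alongside the raw text. A minimal sketch of constructing one, all values illustrative:

# Sketch: the enriched VlmPrediction (values illustrative, not real model output).
from docling.datamodel.base_models import VlmPrediction, VlmPredictionToken

pred = VlmPrediction(
    text="<doctag>...</doctag>",
    generated_tokens=[
        VlmPredictionToken(text="<doctag>", token=101, logprob=-0.02),
    ],
    generation_time=1.7,  # seconds
)
total_logprob = sum(t.logprob for t in pred.generated_tokens)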
docling/datamodel/pipeline_options.py CHANGED
@@ -1,6 +1,4 @@
 import logging
-import os
-import re
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -10,71 +8,26 @@ from pydantic import (
     BaseModel,
     ConfigDict,
     Field,
-    field_validator,
-    model_validator,
 )
-from pydantic_settings import BaseSettings, SettingsConfigDict
 from typing_extensions import deprecated
 
-_log = logging.getLogger(__name__)
-
-
-class AcceleratorDevice(str, Enum):
-    """Devices to run model inference"""
-
-    AUTO = "auto"
-    CPU = "cpu"
-    CUDA = "cuda"
-    MPS = "mps"
-
-
-class AcceleratorOptions(BaseSettings):
-    model_config = SettingsConfigDict(
-        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
-    )
+# Import the following for backwards compatibility
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options_vlm_model import (
+    ApiVlmOptions,
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+)
+from docling.datamodel.vlm_model_specs import (
+    GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
+    GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
+    SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
+    SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
+    VlmModelType,
+)
 
-    num_threads: int = 4
-    device: Union[str, AcceleratorDevice] = "auto"
-    cuda_use_flash_attention2: bool = False
-
-    @field_validator("device")
-    def validate_device(cls, value):
-        # "auto", "cpu", "cuda", "mps", or "cuda:N"
-        if value in {d.value for d in AcceleratorDevice} or re.match(
-            r"^cuda(:\d+)?$", value
-        ):
-            return value
-        raise ValueError(
-            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
-        )
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_alternative_envvars(cls, data: Any) -> Any:
-        r"""
-        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
-        The alternative envvar is used only if it is valid and the regular envvar is not set.
-
-        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
-        the same functionality. In case the alias envvar is set and the user tries to override the
-        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
-        as an extra input instead of simply overwriting the envvar value for that parameter.
-        """
-        if isinstance(data, dict):
-            input_num_threads = data.get("num_threads")
-            # Check if to set the num_threads from the alternative envvar
-            if input_num_threads is None:
-                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
-                omp_num_threads = os.getenv("OMP_NUM_THREADS")
-                if docling_num_threads is None and omp_num_threads is not None:
-                    try:
-                        data["num_threads"] = int(omp_num_threads)
-                    except ValueError:
-                        _log.error(
-                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
-                            omp_num_threads,
-                        )
-        return data
+_log = logging.getLogger(__name__)
 
 
 class BaseOptions(BaseModel):
@@ -121,24 +74,22 @@ class RapidOcrOptions(OcrOptions):
     lang: List[str] = [
         "english",
         "chinese",
-    ]  # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
-    # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
+    ]
+    # However, language as a parameter is not supported by rapidocr yet
+    # and hence changing this options doesn't affect anything.
+
+    # For more details on supported languages by RapidOCR visit
+    # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
+
+    # For more details on the following options visit
+    # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
 
-    # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
     text_score: float = 0.5  # same default as rapidocr
 
     use_det: Optional[bool] = None  # same default as rapidocr
     use_cls: Optional[bool] = None  # same default as rapidocr
     use_rec: Optional[bool] = None  # same default as rapidocr
 
-    # class Device(Enum):
-    #     CPU = "CPU"
-    #     CUDA = "CUDA"
-    #     DIRECTML = "DIRECTML"
-    #     AUTO = "AUTO"
-
-    # device: Device = Device.AUTO  # Default value is AUTO
-
     print_verbose: bool = False  # same default as rapidocr
 
     det_model_path: Optional[str] = None  # same default as rapidocr
@@ -244,101 +195,18 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
         return self.repo_id.replace("/", "--")
 
 
+# SmolVLM
 smolvlm_picture_description = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
 )
-# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
+
+# GraniteVision
 granite_picture_description = PictureDescriptionVlmOptions(
     repo_id="ibm-granite/granite-vision-3.1-2b-preview",
     prompt="What is shown in this image?",
 )
 
 
-class BaseVlmOptions(BaseModel):
-    kind: str
-    prompt: str
-
-
-class ResponseFormat(str, Enum):
-    DOCTAGS = "doctags"
-    MARKDOWN = "markdown"
-
-
-class InferenceFramework(str, Enum):
-    MLX = "mlx"
-    TRANSFORMERS = "transformers"
-    OPENAI = "openai"
-
-
-class HuggingFaceVlmOptions(BaseVlmOptions):
-    kind: Literal["hf_model_options"] = "hf_model_options"
-
-    repo_id: str
-    load_in_8bit: bool = True
-    llm_int8_threshold: float = 6.0
-    quantized: bool = False
-
-    inference_framework: InferenceFramework
-    response_format: ResponseFormat
-
-    @property
-    def repo_cache_folder(self) -> str:
-        return self.repo_id.replace("/", "--")
-
-
-class ApiVlmOptions(BaseVlmOptions):
-    kind: Literal["api_model_options"] = "api_model_options"
-
-    url: AnyUrl = AnyUrl(
-        "http://localhost:11434/v1/chat/completions"
-    )  # Default to ollama
-    headers: Dict[str, str] = {}
-    params: Dict[str, Any] = {}
-    scale: float = 2.0
-    timeout: float = 60
-    concurrency: int = 1
-    response_format: ResponseFormat
-
-
-smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
-    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
-    prompt="Convert this page to docling.",
-    response_format=ResponseFormat.DOCTAGS,
-    inference_framework=InferenceFramework.MLX,
-)
-
-
-smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="ds4sd/SmolDocling-256M-preview",
-    prompt="Convert this page to docling.",
-    response_format=ResponseFormat.DOCTAGS,
-    inference_framework=InferenceFramework.TRANSFORMERS,
-)
-
-granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
-    # prompt="OCR the full page to markdown.",
-    prompt="OCR this image.",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS,
-)
-
-granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
-    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
-    params={"model": "granite3.2-vision:2b"},
-    prompt="OCR the full page to markdown.",
-    scale=1.0,
-    timeout=120,
-    response_format=ResponseFormat.MARKDOWN,
-)
-
-
-class VlmModelType(str, Enum):
-    SMOLDOCLING = "smoldocling"
-    GRANITE_VISION = "granite_vision"
-    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
-
-
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
@@ -387,7 +255,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
         smoldocling_vlm_conversion_options
     )
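Because the removed classes and presets are re-imported above under their old names, pre-2.36 imports keep resolving; the aliases point at the new spec objects. A minimal sketch of what that implies:

# Sketch: the backwards-compatibility aliases resolve to the new preset objects.
from docling.datamodel.pipeline_options import (
    smoldocling_vlm_conversion_options,
)
from docling.datamodel.vlm_model_specs import SMOLDOCLING_TRANSFORMERS

# Old and new names are the same object, not copies.
assert smoldocling_vlm_conversion_options is SMOLDOCLING_TRANSFORMERS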
docling/datamodel/pipeline_options_vlm_model.py ADDED
@@ -0,0 +1,81 @@
+from enum import Enum
+from typing import Any, Dict, List, Literal
+
+from pydantic import AnyUrl, BaseModel
+from typing_extensions import deprecated
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+
+
+class BaseVlmOptions(BaseModel):
+    kind: str
+    prompt: str
+
+
+class ResponseFormat(str, Enum):
+    DOCTAGS = "doctags"
+    MARKDOWN = "markdown"
+    HTML = "html"
+
+
+class InferenceFramework(str, Enum):
+    MLX = "mlx"
+    TRANSFORMERS = "transformers"
+
+
+class TransformersModelType(str, Enum):
+    AUTOMODEL = "automodel"
+    AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
+    AUTOMODEL_CAUSALLM = "automodel-causallm"
+
+
+class InlineVlmOptions(BaseVlmOptions):
+    kind: Literal["inline_model_options"] = "inline_model_options"
+
+    repo_id: str
+    trust_remote_code: bool = False
+    load_in_8bit: bool = True
+    llm_int8_threshold: float = 6.0
+    quantized: bool = False
+
+    inference_framework: InferenceFramework
+    transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    response_format: ResponseFormat
+
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    scale: float = 2.0
+
+    temperature: float = 0.0
+    stop_strings: List[str] = []
+    extra_generation_config: Dict[str, Any] = {}
+
+    use_kv_cache: bool = True
+    max_new_tokens: int = 4096
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+@deprecated("Use InlineVlmOptions instead.")
+class HuggingFaceVlmOptions(InlineVlmOptions):
+    pass
+
+
+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"
+
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    scale: float = 2.0
+    timeout: float = 60
+    concurrency: int = 1
+    response_format: ResponseFormat
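The new options classes are plain pydantic models, so custom inline or API-served VLM configurations can be declared directly. A minimal sketch; the repo id and the model parameter are placeholders, not shipped presets:

# Sketch: declaring custom VLM options with the new classes (placeholder names).
from pydantic import AnyUrl

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersModelType,
)

local_opts = InlineVlmOptions(
    repo_id="my-org/my-vlm",  # placeholder repo id
    prompt="Convert this page to markdown.",
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
    response_format=ResponseFormat.MARKDOWN,
    supported_devices=[AcceleratorDevice.CUDA],
    max_new_tokens=8192,
)

remote_opts = ApiVlmOptions(
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    params={"model": "my-vlm:latest"},  # placeholder model name
    prompt="Convert this page to markdown.",
    response_format=ResponseFormat.MARKDOWN,
)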
docling/datamodel/vlm_model_specs.py ADDED
@@ -0,0 +1,144 @@
+import logging
+from enum import Enum
+
+from pydantic import (
+    AnyUrl,
+)
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_vlm_model import (
+    ApiVlmOptions,
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+    TransformersModelType,
+)
+
+_log = logging.getLogger(__name__)
+
+
+# SmolDocling
+SMOLDOCLING_MLX = InlineVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
+# GraniteVision
+GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.2-2b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
+GRANITE_VISION_OLLAMA = ApiVlmOptions(
+    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
+    params={"model": "granite3.2-vision:2b"},
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    scale=1.0,
+    timeout=120,
+    response_format=ResponseFormat.MARKDOWN,
+    temperature=0.0,
+)
+
+# Pixtral
+PIXTRAL_12B_TRANSFORMERS = InlineVlmOptions(
+    repo_id="mistral-community/pixtral-12b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
+    scale=2.0,
+    temperature=0.0,
+)
+
+PIXTRAL_12B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/pixtral-12b-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+# Phi4
+PHI4_TRANSFORMERS = InlineVlmOptions(
+    repo_id="microsoft/Phi-4-multimodal-instruct",
+    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
+    trust_remote_code=True,
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_CAUSALLM,
+    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
+    scale=2.0,
+    temperature=0.0,
+    extra_generation_config=dict(num_logits_to_keep=0),
+)
+
+# Qwen
+QWEN25_VL_3B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+# Gemma-3
+GEMMA3_12B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/gemma-3-12b-it-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+GEMMA3_27B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/gemma-3-27b-it-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+
+class VlmModelType(str, Enum):
+    SMOLDOCLING = "smoldocling"
+    GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
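Any of these presets can be plugged into the VLM pipeline. A minimal sketch following the usual docling converter wiring; the input path is a placeholder:

# Sketch: running a conversion with one of the new presets ("test.pdf" is a placeholder).
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.vlm_model_specs import GRANITE_VISION_TRANSFORMERS
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions(vlm_options=GRANITE_VISION_TRANSFORMERS)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert("test.pdf")  # placeholder input
print(result.document.export_to_markdown())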
docling/document_converter.py CHANGED
@@ -186,6 +186,11 @@ class DocumentConverter:
         Tuple[Type[BasePipeline], str], BasePipeline
     ] = {}
 
+    def _get_initialized_pipelines(
+        self,
+    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
+        return self.initialized_pipelines
+
     def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
         """Generate a hash of pipeline options to use as part of the cache key."""
         options_str = str(pipeline_options.model_dump())
docling/models/api_vlm_model.py CHANGED
@@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import ApiVlmOptions
+from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
 from docling.exceptions import OperationNotAllowed
 from docling.models.base_model import BasePageModel
 from docling.utils.api_image_request import api_image_request
docling/models/base_ocr_model.py CHANGED
@@ -11,9 +11,10 @@ from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label
 
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
+from docling.datamodel.pipeline_options import OcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BaseModelWithOptions, BasePageModel