docling 2.35.0__py3-none-any.whl → 2.36.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
- docling/backend/xml/jats_backend.py +0 -0
- docling/cli/main.py +12 -15
- docling/datamodel/accelerator_options.py +68 -0
- docling/datamodel/base_models.py +10 -8
- docling/datamodel/pipeline_options.py +29 -161
- docling/datamodel/pipeline_options_vlm_model.py +81 -0
- docling/datamodel/vlm_model_specs.py +144 -0
- docling/document_converter.py +5 -0
- docling/models/api_vlm_model.py +1 -1
- docling/models/base_ocr_model.py +2 -1
- docling/models/code_formula_model.py +6 -11
- docling/models/document_picture_classifier.py +6 -11
- docling/models/easyocr_model.py +1 -2
- docling/models/layout_model.py +6 -11
- docling/models/ocr_mac_model.py +1 -1
- docling/models/picture_description_api_model.py +1 -1
- docling/models/picture_description_base_model.py +1 -1
- docling/models/picture_description_vlm_model.py +7 -22
- docling/models/rapid_ocr_model.py +1 -2
- docling/models/table_structure_model.py +6 -12
- docling/models/tesseract_ocr_cli_model.py +1 -1
- docling/models/tesseract_ocr_model.py +1 -1
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/hf_model_download.py +40 -0
- docling/models/vlm_models_inline/__init__.py +0 -0
- docling/models/vlm_models_inline/hf_transformers_model.py +194 -0
- docling/models/{hf_mlx_model.py → vlm_models_inline/mlx_model.py} +56 -44
- docling/pipeline/vlm_pipeline.py +228 -61
- docling/utils/accelerator_utils.py +17 -2
- docling/utils/model_downloader.py +13 -12
- {docling-2.35.0.dist-info → docling-2.36.1.dist-info}/METADATA +53 -55
- {docling-2.35.0.dist-info → docling-2.36.1.dist-info}/RECORD +46 -39
- {docling-2.35.0.dist-info → docling-2.36.1.dist-info}/WHEEL +2 -1
- docling-2.36.1.dist-info/entry_points.txt +6 -0
- docling-2.36.1.dist-info/top_level.txt +1 -0
- docling/models/hf_vlm_model.py +0 -182
- docling-2.35.0.dist-info/entry_points.txt +0 -7
- {docling-2.35.0.dist-info → docling-2.36.1.dist-info/licenses}/LICENSE +0 -0
docling/cli/main.py
CHANGED
@@ -28,6 +28,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -36,8 +37,6 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
-    AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
@@ -45,14 +44,16 @@ from docling.datamodel.pipeline_options import (
     PdfPipeline,
     PdfPipelineOptions,
     TableFormerMode,
-    VlmModelType,
     VlmPipelineOptions,
-    granite_vision_vlm_conversion_options,
-    granite_vision_vlm_ollama_conversion_options,
-    smoldocling_vlm_conversion_options,
-    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
+from docling.datamodel.vlm_model_specs import (
+    GRANITE_VISION_OLLAMA,
+    GRANITE_VISION_TRANSFORMERS,
+    SMOLDOCLING_MLX,
+    SMOLDOCLING_TRANSFORMERS,
+    VlmModelType,
+)
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -579,20 +580,16 @@ def convert(  # noqa: C901
         )

         if vlm_model == VlmModelType.GRANITE_VISION:
-            pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+            pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
         elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-            pipeline_options.vlm_options = (
-                granite_vision_vlm_ollama_conversion_options
-            )
+            pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
         elif vlm_model == VlmModelType.SMOLDOCLING:
-            pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+            pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
             if sys.platform == "darwin":
                 try:
                     import mlx_vlm

-                    pipeline_options.vlm_options = (
-                        smoldocling_vlm_mlx_conversion_options
-                    )
+                    pipeline_options.vlm_options = SMOLDOCLING_MLX
                 except ImportError:
                     _log.warning(
                         "To run SmolDocling faster, please install mlx-vlm:\n"
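
Note: the CLI now assigns the uppercase spec constants from docling.datamodel.vlm_model_specs instead of the old lowercase option objects. A minimal sketch of the equivalent programmatic setup, assuming the 2.36.x API shown in this diff ("page.pdf" is a placeholder input):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import VlmPipelineOptions
    from docling.datamodel.vlm_model_specs import SMOLDOCLING_TRANSFORMERS
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    # Mirror what `docling --pipeline vlm --vlm-model smoldocling` now does internally.
    pipeline_options = VlmPipelineOptions(vlm_options=SMOLDOCLING_TRANSFORMERS)
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
            )
        }
    )
    result = converter.convert("page.pdf")
    print(result.document.export_to_markdown())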
docling/datamodel/accelerator_options.py
ADDED
@@ -0,0 +1,68 @@
+import logging
+import os
+import re
+from enum import Enum
+from typing import Any, Union
+
+from pydantic import field_validator, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+_log = logging.getLogger(__name__)
+
+
+class AcceleratorDevice(str, Enum):
+    """Devices to run model inference"""
+
+    AUTO = "auto"
+    CPU = "cpu"
+    CUDA = "cuda"
+    MPS = "mps"
+
+
+class AcceleratorOptions(BaseSettings):
+    model_config = SettingsConfigDict(
+        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
+    )
+
+    num_threads: int = 4
+    device: Union[str, AcceleratorDevice] = "auto"
+    cuda_use_flash_attention2: bool = False
+
+    @field_validator("device")
+    def validate_device(cls, value):
+        # "auto", "cpu", "cuda", "mps", or "cuda:N"
+        if value in {d.value for d in AcceleratorDevice} or re.match(
+            r"^cuda(:\d+)?$", value
+        ):
+            return value
+        raise ValueError(
+            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
+        )
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_alternative_envvars(cls, data: Any) -> Any:
+        r"""
+        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
+        The alternative envvar is used only if it is valid and the regular envvar is not set.
+
+        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
+        the same functionality. In case the alias envvar is set and the user tries to override the
+        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
+        as an extra input instead of simply overwriting the envvar value for that parameter.
+        """
+        if isinstance(data, dict):
+            input_num_threads = data.get("num_threads")
+            # Check if to set the num_threads from the alternative envvar
+            if input_num_threads is None:
+                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
+                omp_num_threads = os.getenv("OMP_NUM_THREADS")
+                if docling_num_threads is None and omp_num_threads is not None:
+                    try:
+                        data["num_threads"] = int(omp_num_threads)
+                    except ValueError:
+                        _log.error(
+                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
+                            omp_num_threads,
+                        )
+        return data
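
Note: AcceleratorOptions is a pydantic-settings model, so it can be driven entirely by environment variables. A sketch of the precedence implemented above, assuming neither variable is set beforehand (values are illustrative):

    import os

    from docling.datamodel.accelerator_options import AcceleratorOptions

    os.environ["OMP_NUM_THREADS"] = "8"            # the "alternative" envvar
    assert AcceleratorOptions().num_threads == 8   # used while DOCLING_NUM_THREADS is unset

    os.environ["DOCLING_NUM_THREADS"] = "2"        # the regular envvar wins once present
    assert AcceleratorOptions().num_threads == 2

    AcceleratorOptions(device="cuda:1")            # accepted by validate_device
    # AcceleratorOptions(device="gpu")             # would raise ValueError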
docling/datamodel/base_models.py
CHANGED
@@ -13,11 +13,11 @@ from docling_core.types.doc import (
     TableCell,
 )
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-
-# DO NOT REMOVE; explicitly exposed from this location
 from docling_core.types.io import (
     DocumentStream,
 )
+
+# DO NOT REMOVE; explicitly exposed from this location
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict, Field, computed_field

@@ -131,12 +131,6 @@ class ErrorItem(BaseModel):
     error_message: str


-# class Cell(BaseModel):
-#     id: int
-#     text: str
-#     bbox: BoundingBox
-
-
 class Cluster(BaseModel):
     id: int
     label: DocItemLabel
@@ -158,8 +152,16 @@ class LayoutPrediction(BaseModel):
     clusters: List[Cluster] = []


+class VlmPredictionToken(BaseModel):
+    text: str = ""
+    token: int = -1
+    logprob: float = -1
+
+
 class VlmPrediction(BaseModel):
     text: str = ""
+    generated_tokens: list[VlmPredictionToken] = []
+    generation_time: float = -1


 class ContainerElement(
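
Note: VlmPrediction now carries optional per-token metadata and a generation time (both default to -1 sentinels when unset). An illustrative sketch with made-up values:

    from docling.datamodel.base_models import VlmPrediction, VlmPredictionToken

    pred = VlmPrediction(
        text="<doctag>...</doctag>",
        generated_tokens=[
            VlmPredictionToken(text="<doctag>", token=101, logprob=-0.12),
        ],
        generation_time=1.7,  # wall-clock time; stays -1 when not measured
    )
    print(len(pred.generated_tokens), pred.generation_time)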
docling/datamodel/pipeline_options.py
CHANGED
@@ -1,6 +1,4 @@
 import logging
-import os
-import re
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -10,71 +8,26 @@ from pydantic import (
     BaseModel,
     ConfigDict,
     Field,
-    field_validator,
-    model_validator,
 )
-from pydantic_settings import BaseSettings, SettingsConfigDict
 from typing_extensions import deprecated

-_log = logging.getLogger(__name__)
-
-
-class AcceleratorDevice(str, Enum):
-    """Devices to run model inference"""
-
-    AUTO = "auto"
-    CPU = "cpu"
-    CUDA = "cuda"
-    MPS = "mps"
-
-
-class AcceleratorOptions(BaseSettings):
-    model_config = SettingsConfigDict(
-        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
-    )
+# Import the following for backwards compatibility
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options_vlm_model import (
+    ApiVlmOptions,
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+)
+from docling.datamodel.vlm_model_specs import (
+    GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
+    GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
+    SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
+    SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
+    VlmModelType,
+)

-    num_threads: int = 4
-    device: Union[str, AcceleratorDevice] = "auto"
-    cuda_use_flash_attention2: bool = False
-
-    @field_validator("device")
-    def validate_device(cls, value):
-        # "auto", "cpu", "cuda", "mps", or "cuda:N"
-        if value in {d.value for d in AcceleratorDevice} or re.match(
-            r"^cuda(:\d+)?$", value
-        ):
-            return value
-        raise ValueError(
-            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
-        )
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_alternative_envvars(cls, data: Any) -> Any:
-        r"""
-        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
-        The alternative envvar is used only if it is valid and the regular envvar is not set.
-
-        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
-        the same functionality. In case the alias envvar is set and the user tries to override the
-        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
-        as an extra input instead of simply overwriting the envvar value for that parameter.
-        """
-        if isinstance(data, dict):
-            input_num_threads = data.get("num_threads")
-            # Check if to set the num_threads from the alternative envvar
-            if input_num_threads is None:
-                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
-                omp_num_threads = os.getenv("OMP_NUM_THREADS")
-                if docling_num_threads is None and omp_num_threads is not None:
-                    try:
-                        data["num_threads"] = int(omp_num_threads)
-                    except ValueError:
-                        _log.error(
-                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
-                            omp_num_threads,
-                        )
-        return data
+_log = logging.getLogger(__name__)


 class BaseOptions(BaseModel):
@@ -121,24 +74,22 @@ class RapidOcrOptions(OcrOptions):
     lang: List[str] = [
         "english",
         "chinese",
-    ]  # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
-    # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
+    ]
+    # However, language as a parameter is not supported by rapidocr yet
+    # and hence changing this options doesn't affect anything.
+
+    # For more details on supported languages by RapidOCR visit
+    # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
+
+    # For more details on the following options visit
+    # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/

-    # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
     text_score: float = 0.5  # same default as rapidocr

     use_det: Optional[bool] = None  # same default as rapidocr
     use_cls: Optional[bool] = None  # same default as rapidocr
     use_rec: Optional[bool] = None  # same default as rapidocr

-    # class Device(Enum):
-    #     CPU = "CPU"
-    #     CUDA = "CUDA"
-    #     DIRECTML = "DIRECTML"
-    #     AUTO = "AUTO"
-
-    # device: Device = Device.AUTO  # Default value is AUTO
-
     print_verbose: bool = False  # same default as rapidocr

     det_model_path: Optional[str] = None  # same default as rapidocr
@@ -244,101 +195,18 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
     return self.repo_id.replace("/", "--")


+# SmolVLM
 smolvlm_picture_description = PictureDescriptionVlmOptions(
     repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
 )
-
+
+# GraniteVision
 granite_picture_description = PictureDescriptionVlmOptions(
     repo_id="ibm-granite/granite-vision-3.1-2b-preview",
     prompt="What is shown in this image?",
 )


-class BaseVlmOptions(BaseModel):
-    kind: str
-    prompt: str
-
-
-class ResponseFormat(str, Enum):
-    DOCTAGS = "doctags"
-    MARKDOWN = "markdown"
-
-
-class InferenceFramework(str, Enum):
-    MLX = "mlx"
-    TRANSFORMERS = "transformers"
-    OPENAI = "openai"
-
-
-class HuggingFaceVlmOptions(BaseVlmOptions):
-    kind: Literal["hf_model_options"] = "hf_model_options"
-
-    repo_id: str
-    load_in_8bit: bool = True
-    llm_int8_threshold: float = 6.0
-    quantized: bool = False
-
-    inference_framework: InferenceFramework
-    response_format: ResponseFormat
-
-    @property
-    def repo_cache_folder(self) -> str:
-        return self.repo_id.replace("/", "--")
-
-
-class ApiVlmOptions(BaseVlmOptions):
-    kind: Literal["api_model_options"] = "api_model_options"
-
-    url: AnyUrl = AnyUrl(
-        "http://localhost:11434/v1/chat/completions"
-    )  # Default to ollama
-    headers: Dict[str, str] = {}
-    params: Dict[str, Any] = {}
-    scale: float = 2.0
-    timeout: float = 60
-    concurrency: int = 1
-    response_format: ResponseFormat
-
-
-smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
-    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
-    prompt="Convert this page to docling.",
-    response_format=ResponseFormat.DOCTAGS,
-    inference_framework=InferenceFramework.MLX,
-)
-
-
-smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="ds4sd/SmolDocling-256M-preview",
-    prompt="Convert this page to docling.",
-    response_format=ResponseFormat.DOCTAGS,
-    inference_framework=InferenceFramework.TRANSFORMERS,
-)
-
-granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
-    # prompt="OCR the full page to markdown.",
-    prompt="OCR this image.",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS,
-)
-
-granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
-    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
-    params={"model": "granite3.2-vision:2b"},
-    prompt="OCR the full page to markdown.",
-    scale=1.0,
-    timeout=120,
-    response_format=ResponseFormat.MARKDOWN,
-)
-
-
-class VlmModelType(str, Enum):
-    SMOLDOCLING = "smoldocling"
-    GRANITE_VISION = "granite_vision"
-    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
-
-
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
@@ -387,7 +255,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
         smoldocling_vlm_conversion_options
     )
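
Note: the module keeps the old names importable by aliasing the relocated objects, so existing code does not break. A minimal sketch showing that both paths resolve to the same object:

    from docling.datamodel.pipeline_options import (  # backwards-compatible path
        granite_vision_vlm_conversion_options,
    )
    from docling.datamodel.vlm_model_specs import (  # new canonical location
        GRANITE_VISION_TRANSFORMERS,
    )

    assert granite_vision_vlm_conversion_options is GRANITE_VISION_TRANSFORMERS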
docling/datamodel/pipeline_options_vlm_model.py
ADDED
@@ -0,0 +1,81 @@
+from enum import Enum
+from typing import Any, Dict, List, Literal
+
+from pydantic import AnyUrl, BaseModel
+from typing_extensions import deprecated
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+
+
+class BaseVlmOptions(BaseModel):
+    kind: str
+    prompt: str
+
+
+class ResponseFormat(str, Enum):
+    DOCTAGS = "doctags"
+    MARKDOWN = "markdown"
+    HTML = "html"
+
+
+class InferenceFramework(str, Enum):
+    MLX = "mlx"
+    TRANSFORMERS = "transformers"
+
+
+class TransformersModelType(str, Enum):
+    AUTOMODEL = "automodel"
+    AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
+    AUTOMODEL_CAUSALLM = "automodel-causallm"
+
+
+class InlineVlmOptions(BaseVlmOptions):
+    kind: Literal["inline_model_options"] = "inline_model_options"
+
+    repo_id: str
+    trust_remote_code: bool = False
+    load_in_8bit: bool = True
+    llm_int8_threshold: float = 6.0
+    quantized: bool = False
+
+    inference_framework: InferenceFramework
+    transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    response_format: ResponseFormat
+
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    scale: float = 2.0
+
+    temperature: float = 0.0
+    stop_strings: List[str] = []
+    extra_generation_config: Dict[str, Any] = {}
+
+    use_kv_cache: bool = True
+    max_new_tokens: int = 4096
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+@deprecated("Use InlineVlmOptions instead.")
+class HuggingFaceVlmOptions(InlineVlmOptions):
+    pass
+
+
+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"
+
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    scale: float = 2.0
+    timeout: float = 60
+    concurrency: int = 1
+    response_format: ResponseFormat
docling/datamodel/vlm_model_specs.py
ADDED
@@ -0,0 +1,144 @@
+import logging
+from enum import Enum
+
+from pydantic import (
+    AnyUrl,
+)
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_vlm_model import (
+    ApiVlmOptions,
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+    TransformersModelType,
+)
+
+_log = logging.getLogger(__name__)
+
+
+# SmolDocling
+SMOLDOCLING_MLX = InlineVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
+# GraniteVision
+GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.2-2b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
+GRANITE_VISION_OLLAMA = ApiVlmOptions(
+    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
+    params={"model": "granite3.2-vision:2b"},
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    scale=1.0,
+    timeout=120,
+    response_format=ResponseFormat.MARKDOWN,
+    temperature=0.0,
+)
+
+# Pixtral
+PIXTRAL_12B_TRANSFORMERS = InlineVlmOptions(
+    repo_id="mistral-community/pixtral-12b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
+    scale=2.0,
+    temperature=0.0,
+)
+
+PIXTRAL_12B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/pixtral-12b-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+# Phi4
+PHI4_TRANSFORMERS = InlineVlmOptions(
+    repo_id="microsoft/Phi-4-multimodal-instruct",
+    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
+    trust_remote_code=True,
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_CAUSALLM,
+    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
+    scale=2.0,
+    temperature=0.0,
+    extra_generation_config=dict(num_logits_to_keep=0),
+)
+
+# Qwen
+QWEN25_VL_3B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+# Gemma-3
+GEMMA3_12B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/gemma-3-12b-it-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+GEMMA3_27B_MLX = InlineVlmOptions(
+    repo_id="mlx-community/gemma-3-27b-it-bf16",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.MLX,
+    supported_devices=[AcceleratorDevice.MPS],
+    scale=2.0,
+    temperature=0.0,
+)
+
+
+class VlmModelType(str, Enum):
+    SMOLDOCLING = "smoldocling"
+    GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
docling/document_converter.py
CHANGED
@@ -186,6 +186,11 @@ class DocumentConverter:
         Tuple[Type[BasePipeline], str], BasePipeline
     ] = {}

+    def _get_initialized_pipelines(
+        self,
+    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
+        return self.initialized_pipelines
+
     def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
         """Generate a hash of pipeline options to use as part of the cache key."""
         options_str = str(pipeline_options.model_dump())
docling/models/api_vlm_model.py
CHANGED
@@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import ApiVlmOptions
+from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
 from docling.exceptions import OperationNotAllowed
 from docling.models.base_model import BasePageModel
 from docling.utils.api_image_request import api_image_request
docling/models/base_ocr_model.py
CHANGED
@@ -11,9 +11,10 @@ from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label

+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
+from docling.datamodel.pipeline_options import OcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BaseModelWithOptions, BasePageModel