docling 2.34.0__py3-none-any.whl → 2.36.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/xml/jats_backend.py +0 -0
- docling/cli/main.py +48 -18
- docling/datamodel/accelerator_options.py +68 -0
- docling/datamodel/base_models.py +10 -8
- docling/datamodel/document.py +7 -2
- docling/datamodel/pipeline_options.py +29 -161
- docling/datamodel/pipeline_options_vlm_model.py +81 -0
- docling/datamodel/vlm_model_specs.py +144 -0
- docling/document_converter.py +5 -0
- docling/models/api_vlm_model.py +1 -1
- docling/models/base_ocr_model.py +2 -1
- docling/models/code_formula_model.py +6 -11
- docling/models/document_picture_classifier.py +6 -11
- docling/models/easyocr_model.py +1 -2
- docling/models/layout_model.py +22 -17
- docling/models/ocr_mac_model.py +1 -1
- docling/models/page_preprocessing_model.py +11 -6
- docling/models/picture_description_api_model.py +1 -1
- docling/models/picture_description_base_model.py +1 -1
- docling/models/picture_description_vlm_model.py +7 -22
- docling/models/rapid_ocr_model.py +1 -2
- docling/models/table_structure_model.py +6 -12
- docling/models/tesseract_ocr_cli_model.py +1 -1
- docling/models/tesseract_ocr_model.py +1 -1
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/hf_model_download.py +40 -0
- docling/models/vlm_models_inline/__init__.py +0 -0
- docling/models/vlm_models_inline/hf_transformers_model.py +194 -0
- docling/models/{hf_mlx_model.py → vlm_models_inline/mlx_model.py} +56 -44
- docling/pipeline/standard_pdf_pipeline.py +69 -57
- docling/pipeline/vlm_pipeline.py +228 -61
- docling/utils/accelerator_utils.py +17 -2
- docling/utils/model_downloader.py +13 -12
- {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/METADATA +54 -55
- {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/RECORD +48 -41
- {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/WHEEL +2 -1
- docling-2.36.0.dist-info/entry_points.txt +6 -0
- docling-2.36.0.dist-info/top_level.txt +1 -0
- docling/models/hf_vlm_model.py +0 -182
- docling-2.34.0.dist-info/entry_points.txt +0 -7
- {docling-2.34.0.dist-info → docling-2.36.0.dist-info/licenses}/LICENSE +0 -0
File without changes
|
docling/cli/main.py
CHANGED
@@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
|
|
12
12
|
|
13
13
|
import rich.table
|
14
14
|
import typer
|
15
|
+
from docling_core.transforms.serializer.html import (
|
16
|
+
HTMLDocSerializer,
|
17
|
+
HTMLOutputStyle,
|
18
|
+
HTMLParams,
|
19
|
+
)
|
20
|
+
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
|
15
21
|
from docling_core.types.doc import ImageRefMode
|
16
22
|
from docling_core.utils.file import resolve_source_to_path
|
17
23
|
from pydantic import TypeAdapter
|
@@ -22,6 +28,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBacke
|
|
22
28
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
23
29
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
24
30
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
31
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
25
32
|
from docling.datamodel.base_models import (
|
26
33
|
ConversionStatus,
|
27
34
|
FormatToExtensions,
|
@@ -30,8 +37,6 @@ from docling.datamodel.base_models import (
|
|
30
37
|
)
|
31
38
|
from docling.datamodel.document import ConversionResult
|
32
39
|
from docling.datamodel.pipeline_options import (
|
33
|
-
AcceleratorDevice,
|
34
|
-
AcceleratorOptions,
|
35
40
|
EasyOcrOptions,
|
36
41
|
OcrOptions,
|
37
42
|
PaginatedPipelineOptions,
|
@@ -39,14 +44,16 @@ from docling.datamodel.pipeline_options import (
|
|
39
44
|
PdfPipeline,
|
40
45
|
PdfPipelineOptions,
|
41
46
|
TableFormerMode,
|
42
|
-
VlmModelType,
|
43
47
|
VlmPipelineOptions,
|
44
|
-
granite_vision_vlm_conversion_options,
|
45
|
-
granite_vision_vlm_ollama_conversion_options,
|
46
|
-
smoldocling_vlm_conversion_options,
|
47
|
-
smoldocling_vlm_mlx_conversion_options,
|
48
48
|
)
|
49
49
|
from docling.datamodel.settings import settings
|
50
|
+
from docling.datamodel.vlm_model_specs import (
|
51
|
+
GRANITE_VISION_OLLAMA,
|
52
|
+
GRANITE_VISION_TRANSFORMERS,
|
53
|
+
SMOLDOCLING_MLX,
|
54
|
+
SMOLDOCLING_TRANSFORMERS,
|
55
|
+
VlmModelType,
|
56
|
+
)
|
50
57
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
51
58
|
from docling.models.factories import get_ocr_factory
|
52
59
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
@@ -156,6 +163,7 @@ def export_documents(
|
|
156
163
|
export_json: bool,
|
157
164
|
export_html: bool,
|
158
165
|
export_html_split_page: bool,
|
166
|
+
show_layout: bool,
|
159
167
|
export_md: bool,
|
160
168
|
export_txt: bool,
|
161
169
|
export_doctags: bool,
|
@@ -189,9 +197,27 @@ def export_documents(
|
|
189
197
|
if export_html_split_page:
|
190
198
|
fname = output_dir / f"{doc_filename}.html"
|
191
199
|
_log.info(f"writing HTML output to {fname}")
|
192
|
-
|
193
|
-
|
194
|
-
|
200
|
+
if show_layout:
|
201
|
+
ser = HTMLDocSerializer(
|
202
|
+
doc=conv_res.document,
|
203
|
+
params=HTMLParams(
|
204
|
+
image_mode=image_export_mode,
|
205
|
+
output_style=HTMLOutputStyle.SPLIT_PAGE,
|
206
|
+
),
|
207
|
+
)
|
208
|
+
visualizer = LayoutVisualizer()
|
209
|
+
visualizer.params.show_label = False
|
210
|
+
ser_res = ser.serialize(
|
211
|
+
visualizer=visualizer,
|
212
|
+
)
|
213
|
+
with open(fname, "w") as fw:
|
214
|
+
fw.write(ser_res.text)
|
215
|
+
else:
|
216
|
+
conv_res.document.save_as_html(
|
217
|
+
filename=fname,
|
218
|
+
image_mode=image_export_mode,
|
219
|
+
split_page_view=True,
|
220
|
+
)
|
195
221
|
|
196
222
|
# Export Text format:
|
197
223
|
if export_txt:
|
@@ -250,6 +276,13 @@ def convert( # noqa: C901
|
|
250
276
|
to_formats: List[OutputFormat] = typer.Option(
|
251
277
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
252
278
|
),
|
279
|
+
show_layout: Annotated[
|
280
|
+
bool,
|
281
|
+
typer.Option(
|
282
|
+
...,
|
283
|
+
help="If enabled, the page images will show the bounding-boxes of the items.",
|
284
|
+
),
|
285
|
+
] = False,
|
253
286
|
headers: str = typer.Option(
|
254
287
|
None,
|
255
288
|
"--headers",
|
@@ -547,20 +580,16 @@ def convert( # noqa: C901
|
|
547
580
|
)
|
548
581
|
|
549
582
|
if vlm_model == VlmModelType.GRANITE_VISION:
|
550
|
-
pipeline_options.vlm_options =
|
583
|
+
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
|
551
584
|
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
|
552
|
-
pipeline_options.vlm_options =
|
553
|
-
granite_vision_vlm_ollama_conversion_options
|
554
|
-
)
|
585
|
+
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
|
555
586
|
elif vlm_model == VlmModelType.SMOLDOCLING:
|
556
|
-
pipeline_options.vlm_options =
|
587
|
+
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
|
557
588
|
if sys.platform == "darwin":
|
558
589
|
try:
|
559
590
|
import mlx_vlm
|
560
591
|
|
561
|
-
pipeline_options.vlm_options =
|
562
|
-
smoldocling_vlm_mlx_conversion_options
|
563
|
-
)
|
592
|
+
pipeline_options.vlm_options = SMOLDOCLING_MLX
|
564
593
|
except ImportError:
|
565
594
|
_log.warning(
|
566
595
|
"To run SmolDocling faster, please install mlx-vlm:\n"
|
@@ -596,6 +625,7 @@ def convert( # noqa: C901
|
|
596
625
|
export_json=export_json,
|
597
626
|
export_html=export_html,
|
598
627
|
export_html_split_page=export_html_split_page,
|
628
|
+
show_layout=show_layout,
|
599
629
|
export_md=export_md,
|
600
630
|
export_txt=export_txt,
|
601
631
|
export_doctags=export_doctags,
|
@@ -0,0 +1,68 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
import re
|
4
|
+
from enum import Enum
|
5
|
+
from typing import Any, Union
|
6
|
+
|
7
|
+
from pydantic import field_validator, model_validator
|
8
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
9
|
+
|
10
|
+
_log = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class AcceleratorDevice(str, Enum):
|
14
|
+
"""Devices to run model inference"""
|
15
|
+
|
16
|
+
AUTO = "auto"
|
17
|
+
CPU = "cpu"
|
18
|
+
CUDA = "cuda"
|
19
|
+
MPS = "mps"
|
20
|
+
|
21
|
+
|
22
|
+
class AcceleratorOptions(BaseSettings):
|
23
|
+
model_config = SettingsConfigDict(
|
24
|
+
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
|
25
|
+
)
|
26
|
+
|
27
|
+
num_threads: int = 4
|
28
|
+
device: Union[str, AcceleratorDevice] = "auto"
|
29
|
+
cuda_use_flash_attention2: bool = False
|
30
|
+
|
31
|
+
@field_validator("device")
|
32
|
+
def validate_device(cls, value):
|
33
|
+
# "auto", "cpu", "cuda", "mps", or "cuda:N"
|
34
|
+
if value in {d.value for d in AcceleratorDevice} or re.match(
|
35
|
+
r"^cuda(:\d+)?$", value
|
36
|
+
):
|
37
|
+
return value
|
38
|
+
raise ValueError(
|
39
|
+
"Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
|
40
|
+
)
|
41
|
+
|
42
|
+
@model_validator(mode="before")
|
43
|
+
@classmethod
|
44
|
+
def check_alternative_envvars(cls, data: Any) -> Any:
|
45
|
+
r"""
|
46
|
+
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
|
47
|
+
The alternative envvar is used only if it is valid and the regular envvar is not set.
|
48
|
+
|
49
|
+
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
|
50
|
+
the same functionality. In case the alias envvar is set and the user tries to override the
|
51
|
+
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
|
52
|
+
as an extra input instead of simply overwriting the evvar value for that parameter.
|
53
|
+
"""
|
54
|
+
if isinstance(data, dict):
|
55
|
+
input_num_threads = data.get("num_threads")
|
56
|
+
# Check if to set the num_threads from the alternative envvar
|
57
|
+
if input_num_threads is None:
|
58
|
+
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
|
59
|
+
omp_num_threads = os.getenv("OMP_NUM_THREADS")
|
60
|
+
if docling_num_threads is None and omp_num_threads is not None:
|
61
|
+
try:
|
62
|
+
data["num_threads"] = int(omp_num_threads)
|
63
|
+
except ValueError:
|
64
|
+
_log.error(
|
65
|
+
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
|
66
|
+
omp_num_threads,
|
67
|
+
)
|
68
|
+
return data
|
docling/datamodel/base_models.py
CHANGED
@@ -13,11 +13,11 @@ from docling_core.types.doc import (
|
|
13
13
|
TableCell,
|
14
14
|
)
|
15
15
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
16
|
-
|
17
|
-
# DO NOT REMOVE; explicitly exposed from this location
|
18
16
|
from docling_core.types.io import (
|
19
17
|
DocumentStream,
|
20
18
|
)
|
19
|
+
|
20
|
+
# DO NOT REMOVE; explicitly exposed from this location
|
21
21
|
from PIL.Image import Image
|
22
22
|
from pydantic import BaseModel, ConfigDict, Field, computed_field
|
23
23
|
|
@@ -131,12 +131,6 @@ class ErrorItem(BaseModel):
|
|
131
131
|
error_message: str
|
132
132
|
|
133
133
|
|
134
|
-
# class Cell(BaseModel):
|
135
|
-
# id: int
|
136
|
-
# text: str
|
137
|
-
# bbox: BoundingBox
|
138
|
-
|
139
|
-
|
140
134
|
class Cluster(BaseModel):
|
141
135
|
id: int
|
142
136
|
label: DocItemLabel
|
@@ -158,8 +152,16 @@ class LayoutPrediction(BaseModel):
|
|
158
152
|
clusters: List[Cluster] = []
|
159
153
|
|
160
154
|
|
155
|
+
class VlmPredictionToken(BaseModel):
|
156
|
+
text: str = ""
|
157
|
+
token: int = -1
|
158
|
+
logprob: float = -1
|
159
|
+
|
160
|
+
|
161
161
|
class VlmPrediction(BaseModel):
|
162
162
|
text: str = ""
|
163
|
+
generated_tokens: list[VlmPredictionToken] = []
|
164
|
+
generation_time: float = -1
|
163
165
|
|
164
166
|
|
165
167
|
class ContainerElement(
|
docling/datamodel/document.py
CHANGED
@@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
|
|
334
334
|
) -> Optional[InputFormat]:
|
335
335
|
"""Guess the input format of a document by checking part of its content."""
|
336
336
|
input_format: Optional[InputFormat] = None
|
337
|
-
content_str = content.decode("utf-8")
|
338
337
|
|
339
338
|
if mime == "application/xml":
|
339
|
+
content_str = content.decode("utf-8")
|
340
340
|
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
341
341
|
if match_doctype:
|
342
342
|
xml_doctype = match_doctype.group()
|
@@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
|
|
358
358
|
input_format = InputFormat.XML_JATS
|
359
359
|
|
360
360
|
elif mime == "text/plain":
|
361
|
+
content_str = content.decode("utf-8")
|
361
362
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
362
363
|
input_format = InputFormat.XML_USPTO
|
363
364
|
|
@@ -411,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
|
|
411
412
|
else:
|
412
413
|
return "application/xml"
|
413
414
|
|
414
|
-
if re.match(
|
415
|
+
if re.match(
|
416
|
+
r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
|
417
|
+
content_str,
|
418
|
+
re.DOTALL,
|
419
|
+
):
|
415
420
|
return "text/html"
|
416
421
|
|
417
422
|
p = re.compile(
|
@@ -1,6 +1,4 @@
|
|
1
1
|
import logging
|
2
|
-
import os
|
3
|
-
import re
|
4
2
|
from enum import Enum
|
5
3
|
from pathlib import Path
|
6
4
|
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
@@ -10,71 +8,26 @@ from pydantic import (
|
|
10
8
|
BaseModel,
|
11
9
|
ConfigDict,
|
12
10
|
Field,
|
13
|
-
field_validator,
|
14
|
-
model_validator,
|
15
11
|
)
|
16
|
-
from pydantic_settings import BaseSettings, SettingsConfigDict
|
17
12
|
from typing_extensions import deprecated
|
18
13
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
)
|
14
|
+
# Import the following for backwards compatibility
|
15
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
16
|
+
from docling.datamodel.pipeline_options_vlm_model import (
|
17
|
+
ApiVlmOptions,
|
18
|
+
InferenceFramework,
|
19
|
+
InlineVlmOptions,
|
20
|
+
ResponseFormat,
|
21
|
+
)
|
22
|
+
from docling.datamodel.vlm_model_specs import (
|
23
|
+
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
24
|
+
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
|
25
|
+
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
|
26
|
+
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
|
27
|
+
VlmModelType,
|
28
|
+
)
|
35
29
|
|
36
|
-
|
37
|
-
device: Union[str, AcceleratorDevice] = "auto"
|
38
|
-
cuda_use_flash_attention2: bool = False
|
39
|
-
|
40
|
-
@field_validator("device")
|
41
|
-
def validate_device(cls, value):
|
42
|
-
# "auto", "cpu", "cuda", "mps", or "cuda:N"
|
43
|
-
if value in {d.value for d in AcceleratorDevice} or re.match(
|
44
|
-
r"^cuda(:\d+)?$", value
|
45
|
-
):
|
46
|
-
return value
|
47
|
-
raise ValueError(
|
48
|
-
"Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
|
49
|
-
)
|
50
|
-
|
51
|
-
@model_validator(mode="before")
|
52
|
-
@classmethod
|
53
|
-
def check_alternative_envvars(cls, data: Any) -> Any:
|
54
|
-
r"""
|
55
|
-
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
|
56
|
-
The alternative envvar is used only if it is valid and the regular envvar is not set.
|
57
|
-
|
58
|
-
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
|
59
|
-
the same functionality. In case the alias envvar is set and the user tries to override the
|
60
|
-
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
|
61
|
-
as an extra input instead of simply overwriting the evvar value for that parameter.
|
62
|
-
"""
|
63
|
-
if isinstance(data, dict):
|
64
|
-
input_num_threads = data.get("num_threads")
|
65
|
-
# Check if to set the num_threads from the alternative envvar
|
66
|
-
if input_num_threads is None:
|
67
|
-
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
|
68
|
-
omp_num_threads = os.getenv("OMP_NUM_THREADS")
|
69
|
-
if docling_num_threads is None and omp_num_threads is not None:
|
70
|
-
try:
|
71
|
-
data["num_threads"] = int(omp_num_threads)
|
72
|
-
except ValueError:
|
73
|
-
_log.error(
|
74
|
-
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
|
75
|
-
omp_num_threads,
|
76
|
-
)
|
77
|
-
return data
|
30
|
+
_log = logging.getLogger(__name__)
|
78
31
|
|
79
32
|
|
80
33
|
class BaseOptions(BaseModel):
|
@@ -121,24 +74,22 @@ class RapidOcrOptions(OcrOptions):
|
|
121
74
|
lang: List[str] = [
|
122
75
|
"english",
|
123
76
|
"chinese",
|
124
|
-
]
|
125
|
-
#
|
77
|
+
]
|
78
|
+
# However, language as a parameter is not supported by rapidocr yet
|
79
|
+
# and hence changing this options doesn't affect anything.
|
80
|
+
|
81
|
+
# For more details on supported languages by RapidOCR visit
|
82
|
+
# https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
|
83
|
+
|
84
|
+
# For more details on the following options visit
|
85
|
+
# https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
|
126
86
|
|
127
|
-
# For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
|
128
87
|
text_score: float = 0.5 # same default as rapidocr
|
129
88
|
|
130
89
|
use_det: Optional[bool] = None # same default as rapidocr
|
131
90
|
use_cls: Optional[bool] = None # same default as rapidocr
|
132
91
|
use_rec: Optional[bool] = None # same default as rapidocr
|
133
92
|
|
134
|
-
# class Device(Enum):
|
135
|
-
# CPU = "CPU"
|
136
|
-
# CUDA = "CUDA"
|
137
|
-
# DIRECTML = "DIRECTML"
|
138
|
-
# AUTO = "AUTO"
|
139
|
-
|
140
|
-
# device: Device = Device.AUTO # Default value is AUTO
|
141
|
-
|
142
93
|
print_verbose: bool = False # same default as rapidocr
|
143
94
|
|
144
95
|
det_model_path: Optional[str] = None # same default as rapidocr
|
@@ -244,101 +195,18 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
|
|
244
195
|
return self.repo_id.replace("/", "--")
|
245
196
|
|
246
197
|
|
198
|
+
# SmolVLM
|
247
199
|
smolvlm_picture_description = PictureDescriptionVlmOptions(
|
248
200
|
repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
|
249
201
|
)
|
250
|
-
|
202
|
+
|
203
|
+
# GraniteVision
|
251
204
|
granite_picture_description = PictureDescriptionVlmOptions(
|
252
205
|
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
|
253
206
|
prompt="What is shown in this image?",
|
254
207
|
)
|
255
208
|
|
256
209
|
|
257
|
-
class BaseVlmOptions(BaseModel):
|
258
|
-
kind: str
|
259
|
-
prompt: str
|
260
|
-
|
261
|
-
|
262
|
-
class ResponseFormat(str, Enum):
|
263
|
-
DOCTAGS = "doctags"
|
264
|
-
MARKDOWN = "markdown"
|
265
|
-
|
266
|
-
|
267
|
-
class InferenceFramework(str, Enum):
|
268
|
-
MLX = "mlx"
|
269
|
-
TRANSFORMERS = "transformers"
|
270
|
-
OPENAI = "openai"
|
271
|
-
|
272
|
-
|
273
|
-
class HuggingFaceVlmOptions(BaseVlmOptions):
|
274
|
-
kind: Literal["hf_model_options"] = "hf_model_options"
|
275
|
-
|
276
|
-
repo_id: str
|
277
|
-
load_in_8bit: bool = True
|
278
|
-
llm_int8_threshold: float = 6.0
|
279
|
-
quantized: bool = False
|
280
|
-
|
281
|
-
inference_framework: InferenceFramework
|
282
|
-
response_format: ResponseFormat
|
283
|
-
|
284
|
-
@property
|
285
|
-
def repo_cache_folder(self) -> str:
|
286
|
-
return self.repo_id.replace("/", "--")
|
287
|
-
|
288
|
-
|
289
|
-
class ApiVlmOptions(BaseVlmOptions):
|
290
|
-
kind: Literal["api_model_options"] = "api_model_options"
|
291
|
-
|
292
|
-
url: AnyUrl = AnyUrl(
|
293
|
-
"http://localhost:11434/v1/chat/completions"
|
294
|
-
) # Default to ollama
|
295
|
-
headers: Dict[str, str] = {}
|
296
|
-
params: Dict[str, Any] = {}
|
297
|
-
scale: float = 2.0
|
298
|
-
timeout: float = 60
|
299
|
-
concurrency: int = 1
|
300
|
-
response_format: ResponseFormat
|
301
|
-
|
302
|
-
|
303
|
-
smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
|
304
|
-
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
|
305
|
-
prompt="Convert this page to docling.",
|
306
|
-
response_format=ResponseFormat.DOCTAGS,
|
307
|
-
inference_framework=InferenceFramework.MLX,
|
308
|
-
)
|
309
|
-
|
310
|
-
|
311
|
-
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
|
312
|
-
repo_id="ds4sd/SmolDocling-256M-preview",
|
313
|
-
prompt="Convert this page to docling.",
|
314
|
-
response_format=ResponseFormat.DOCTAGS,
|
315
|
-
inference_framework=InferenceFramework.TRANSFORMERS,
|
316
|
-
)
|
317
|
-
|
318
|
-
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
319
|
-
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
|
320
|
-
# prompt="OCR the full page to markdown.",
|
321
|
-
prompt="OCR this image.",
|
322
|
-
response_format=ResponseFormat.MARKDOWN,
|
323
|
-
inference_framework=InferenceFramework.TRANSFORMERS,
|
324
|
-
)
|
325
|
-
|
326
|
-
granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
|
327
|
-
url=AnyUrl("http://localhost:11434/v1/chat/completions"),
|
328
|
-
params={"model": "granite3.2-vision:2b"},
|
329
|
-
prompt="OCR the full page to markdown.",
|
330
|
-
scale=1.0,
|
331
|
-
timeout=120,
|
332
|
-
response_format=ResponseFormat.MARKDOWN,
|
333
|
-
)
|
334
|
-
|
335
|
-
|
336
|
-
class VlmModelType(str, Enum):
|
337
|
-
SMOLDOCLING = "smoldocling"
|
338
|
-
GRANITE_VISION = "granite_vision"
|
339
|
-
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
|
340
|
-
|
341
|
-
|
342
210
|
# Define an enum for the backend options
|
343
211
|
class PdfBackend(str, Enum):
|
344
212
|
"""Enum of valid PDF backends."""
|
@@ -387,7 +255,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|
387
255
|
False # (To be used with vlms, or other generative models)
|
388
256
|
)
|
389
257
|
# If True, text from backend will be used instead of generated text
|
390
|
-
vlm_options: Union[
|
258
|
+
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
|
391
259
|
smoldocling_vlm_conversion_options
|
392
260
|
)
|
393
261
|
|
@@ -0,0 +1,81 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
from typing import Any, Dict, List, Literal
|
3
|
+
|
4
|
+
from pydantic import AnyUrl, BaseModel
|
5
|
+
from typing_extensions import deprecated
|
6
|
+
|
7
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice
|
8
|
+
|
9
|
+
|
10
|
+
class BaseVlmOptions(BaseModel):
|
11
|
+
kind: str
|
12
|
+
prompt: str
|
13
|
+
|
14
|
+
|
15
|
+
class ResponseFormat(str, Enum):
|
16
|
+
DOCTAGS = "doctags"
|
17
|
+
MARKDOWN = "markdown"
|
18
|
+
HTML = "html"
|
19
|
+
|
20
|
+
|
21
|
+
class InferenceFramework(str, Enum):
|
22
|
+
MLX = "mlx"
|
23
|
+
TRANSFORMERS = "transformers"
|
24
|
+
|
25
|
+
|
26
|
+
class TransformersModelType(str, Enum):
|
27
|
+
AUTOMODEL = "automodel"
|
28
|
+
AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
|
29
|
+
AUTOMODEL_CAUSALLM = "automodel-causallm"
|
30
|
+
|
31
|
+
|
32
|
+
class InlineVlmOptions(BaseVlmOptions):
|
33
|
+
kind: Literal["inline_model_options"] = "inline_model_options"
|
34
|
+
|
35
|
+
repo_id: str
|
36
|
+
trust_remote_code: bool = False
|
37
|
+
load_in_8bit: bool = True
|
38
|
+
llm_int8_threshold: float = 6.0
|
39
|
+
quantized: bool = False
|
40
|
+
|
41
|
+
inference_framework: InferenceFramework
|
42
|
+
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
|
43
|
+
response_format: ResponseFormat
|
44
|
+
|
45
|
+
supported_devices: List[AcceleratorDevice] = [
|
46
|
+
AcceleratorDevice.CPU,
|
47
|
+
AcceleratorDevice.CUDA,
|
48
|
+
AcceleratorDevice.MPS,
|
49
|
+
]
|
50
|
+
|
51
|
+
scale: float = 2.0
|
52
|
+
|
53
|
+
temperature: float = 0.0
|
54
|
+
stop_strings: List[str] = []
|
55
|
+
extra_generation_config: Dict[str, Any] = {}
|
56
|
+
|
57
|
+
use_kv_cache: bool = True
|
58
|
+
max_new_tokens: int = 4096
|
59
|
+
|
60
|
+
@property
|
61
|
+
def repo_cache_folder(self) -> str:
|
62
|
+
return self.repo_id.replace("/", "--")
|
63
|
+
|
64
|
+
|
65
|
+
@deprecated("Use InlineVlmOptions instead.")
|
66
|
+
class HuggingFaceVlmOptions(InlineVlmOptions):
|
67
|
+
pass
|
68
|
+
|
69
|
+
|
70
|
+
class ApiVlmOptions(BaseVlmOptions):
|
71
|
+
kind: Literal["api_model_options"] = "api_model_options"
|
72
|
+
|
73
|
+
url: AnyUrl = AnyUrl(
|
74
|
+
"http://localhost:11434/v1/chat/completions"
|
75
|
+
) # Default to ollama
|
76
|
+
headers: Dict[str, str] = {}
|
77
|
+
params: Dict[str, Any] = {}
|
78
|
+
scale: float = 2.0
|
79
|
+
timeout: float = 60
|
80
|
+
concurrency: int = 1
|
81
|
+
response_format: ResponseFormat
|