docling 2.45.0__py3-none-any.whl → 2.47.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_v4_backend.py +61 -27
- docling/backend/html_backend.py +119 -17
- docling/backend/msword_backend.py +126 -16
- docling/cli/main.py +14 -0
- docling/cli/models.py +56 -0
- docling/datamodel/base_models.py +1 -1
- docling/datamodel/pipeline_options.py +4 -3
- docling/datamodel/pipeline_options_vlm_model.py +5 -0
- docling/datamodel/vlm_model_specs.py +114 -1
- docling/models/base_model.py +95 -2
- docling/models/code_formula_model.py +87 -76
- docling/models/page_preprocessing_model.py +5 -1
- docling/models/picture_description_vlm_model.py +4 -2
- docling/models/tesseract_ocr_cli_model.py +4 -2
- docling/models/vlm_models_inline/__init__.py +1 -0
- docling/models/vlm_models_inline/hf_transformers_model.py +179 -79
- docling/models/vlm_models_inline/mlx_model.py +179 -68
- docling/models/vlm_models_inline/vllm_model.py +235 -0
- docling/pipeline/base_pipeline.py +7 -1
- docling/pipeline/threaded_standard_pdf_pipeline.py +7 -5
- docling/pipeline/vlm_pipeline.py +14 -1
- docling/utils/layout_postprocessor.py +51 -43
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/METADATA +3 -2
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/RECORD +28 -27
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/WHEEL +0 -0
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/entry_points.txt +0 -0
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.45.0.dist-info → docling-2.47.0.dist-info}/top_level.txt +0 -0
docling/cli/main.py
CHANGED
@@ -60,10 +60,12 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
+    GOT2_TRANSFORMERS,
     GRANITE_VISION_OLLAMA,
     GRANITE_VISION_TRANSFORMERS,
     SMOLDOCLING_MLX,
     SMOLDOCLING_TRANSFORMERS,
+    SMOLDOCLING_VLLM,
     VlmModelType,
 )
 from docling.document_converter import (
@@ -477,6 +479,13 @@ def convert(  # noqa: C901
             "--logo", callback=logo_callback, is_eager=True, help="Docling logo"
         ),
     ] = None,
+    page_batch_size: Annotated[
+        int,
+        typer.Option(
+            "--page-batch-size",
+            help=f"Number of pages processed in one batch. Default: {settings.perf.page_batch_size}",
+        ),
+    ] = settings.perf.page_batch_size,
 ):
     log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"

@@ -491,6 +500,7 @@ def convert(  # noqa: C901
     settings.debug.visualize_layout = debug_visualize_layout
     settings.debug.visualize_tables = debug_visualize_tables
     settings.debug.visualize_ocr = debug_visualize_ocr
+    settings.perf.page_batch_size = page_batch_size

     if from_formats is None:
         from_formats = list(InputFormat)
@@ -631,6 +641,8 @@ def convert(  # noqa: C901
                 pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
             elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
                 pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
+            elif vlm_model == VlmModelType.GOT_OCR_2:
+                pipeline_options.vlm_options = GOT2_TRANSFORMERS
             elif vlm_model == VlmModelType.SMOLDOCLING:
                 pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                 if sys.platform == "darwin":
@@ -643,6 +655,8 @@ def convert(  # noqa: C901
                             "To run SmolDocling faster, please install mlx-vlm:\n"
                             "pip install mlx-vlm"
                         )
+            elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
+                pipeline_options.vlm_options = SMOLDOCLING_VLLM

         pdf_format_option = PdfFormatOption(
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
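The new `--page-batch-size` flag simply writes its value into the global performance settings before conversion starts. A minimal sketch of the programmatic equivalent (only `settings.perf.page_batch_size` comes from the diff above; the input file name is an example):

```python
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter

# Programmatic equivalent of the new `--page-batch-size` CLI option:
# the CLI writes the parsed value into the global performance settings.
settings.perf.page_batch_size = 8

converter = DocumentConverter()
result = converter.convert("report.pdf")  # example input file
print(result.document.export_to_markdown())
```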
docling/cli/models.py
CHANGED
@@ -9,6 +9,7 @@ from rich.console import Console
 from rich.logging import RichHandler

 from docling.datamodel.settings import settings
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.model_downloader import download_models

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -128,6 +129,61 @@ def download(
     )


+@app.command("download-hf-repo")
+def download_hf_repo(
+    models: Annotated[
+        list[str],
+        typer.Argument(
+            help="Specific models to download from HuggingFace identified by their repo id. For example: ds4sd/docling-models .",
+        ),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            ...,
+            "-o",
+            "--output-dir",
+            help="The directory where to download the models.",
+        ),
+    ] = (settings.cache_dir / "models"),
+    force: Annotated[
+        bool, typer.Option(..., help="If true, the download will be forced.")
+    ] = False,
+    quiet: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "-q",
+            "--quiet",
+            help="No extra output is generated, the CLI prints only the directory with the cached models.",
+        ),
+    ] = False,
+):
+    if not quiet:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[blue]%(message)s[/blue]",
+            datefmt="[%X]",
+            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
+        )
+
+    for item in models:
+        typer.secho(f"\nDownloading {item} model from HuggingFace...")
+        download_hf_model(
+            repo_id=item,
+            # would be better to reuse "repo_cache_folder" property: https://github.com/docling-project/docling/blob/main/docling/datamodel/pipeline_options_vlm_model.py#L76
+            # but creating options objects seams like an overkill
+            local_dir=output_dir / item.replace("/", "--"),
+            force=force,
+            progress=(not quiet),
+        )
+
+    if quiet:
+        typer.echo(output_dir)
+    else:
+        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
+
+
 click_app = typer.main.get_command(app)

 if __name__ == "__main__":
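The new `download-hf-repo` subcommand is a thin wrapper around `download_hf_model`. A rough programmatic equivalent, as a sketch: the argument names are taken from the diff above, while the repo id and target directory are only examples.

```python
from pathlib import Path

from docling.models.utils.hf_model_download import download_hf_model

output_dir = Path("./models")        # example target directory
repo_id = "ds4sd/docling-models"     # example repo id from the command's help text

# Mirrors what `docling models download-hf-repo` does for each repo id:
# one local folder per repo, with "/" replaced by "--".
download_hf_model(
    repo_id=repo_id,
    local_dir=output_dir / repo_id.replace("/", "--"),
    force=False,
    progress=True,
)
```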
docling/datamodel/base_models.py
CHANGED
@@ -1,7 +1,7 @@
 import math
 from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Dict, List, Optional, Union

 import numpy as np
 from docling_core.types.doc import (
docling/datamodel/pipeline_options.py
CHANGED
@@ -282,6 +282,9 @@ class LayoutOptions(BaseModel):
     keep_empty_clusters: bool = (
         False  # Whether to keep clusters that contain no text cells
     )
+    skip_cell_assignment: bool = (
+        False  # Skip cell-to-cluster assignment for VLM-only processing
+    )
     model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2


@@ -323,9 +326,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         ),
     )

-    generate_parsed_pages:
-        True  # Always True since parsed_page is now mandatory
-    )
+    generate_parsed_pages: bool = False


 class ProcessingPipeline(str, Enum):
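A small sketch of how the changed flags might be set when configuring a PDF pipeline; only the two flags themselves come from this diff, and the `layout_options` field name on `PdfPipelineOptions` is an assumption.

```python
from docling.datamodel.pipeline_options import LayoutOptions, PdfPipelineOptions

pipeline_options = PdfPipelineOptions(
    # No longer forced to True: parsed pages are only kept when requested.
    generate_parsed_pages=True,
)
# Assumed field name; the new flag skips cell-to-cluster assignment for VLM-only runs.
pipeline_options.layout_options = LayoutOptions(skip_cell_assignment=True)
```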
docling/datamodel/pipeline_options_vlm_model.py
CHANGED
@@ -26,11 +26,14 @@ class ResponseFormat(str, Enum):
     DOCTAGS = "doctags"
     MARKDOWN = "markdown"
     HTML = "html"
+    OTSL = "otsl"
+    PLAINTEXT = "plaintext"


 class InferenceFramework(str, Enum):
     MLX = "mlx"
     TRANSFORMERS = "transformers"
+    VLLM = "vllm"


 class TransformersModelType(str, Enum):
@@ -43,6 +46,7 @@ class TransformersModelType(str, Enum):
 class TransformersPromptStyle(str, Enum):
     CHAT = "chat"
     RAW = "raw"
+    NONE = "none"


 class InlineVlmOptions(BaseVlmOptions):
@@ -68,6 +72,7 @@ class InlineVlmOptions(BaseVlmOptions):

     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
+    extra_processor_kwargs: Dict[str, Any] = {}

     use_kv_cache: bool = True
     max_new_tokens: int = 4096
docling/datamodel/vlm_model_specs.py
CHANGED
@@ -12,6 +12,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     ResponseFormat,
     TransformersModelType,
+    TransformersPromptStyle,
 )

 _log = logging.getLogger(__name__)
@@ -26,6 +27,7 @@ SMOLDOCLING_MLX = InlineVlmOptions(
     supported_devices=[AcceleratorDevice.MPS],
     scale=2.0,
     temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
 )

 SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
@@ -33,16 +35,74 @@ SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
     inference_framework=InferenceFramework.TRANSFORMERS,
-    transformers_model_type=TransformersModelType.
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
     supported_devices=[
         AcceleratorDevice.CPU,
         AcceleratorDevice.CUDA,
+    ],
+    torch_dtype="bfloat16",
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
+)
+
+SMOLDOCLING_VLLM = InlineVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["</doctag>", "<end_of_utterance>"],
+)
+
+# SmolVLM-256M-Instruct
+SMOLVLM256_TRANSFORMERS = InlineVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
+    prompt="Transcribe this image to plain text.",
+    response_format=ResponseFormat.PLAINTEXT,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    torch_dtype="bfloat16",
+    scale=2.0,
+    temperature=0.0,
+)
+
+# SmolVLM2-2.2b-Instruct
+SMOLVLM256_MLX = InlineVlmOptions(
+    repo_id="moot20/SmolVLM-256M-Instruct-MLX",
+    prompt="Extract the text.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
         AcceleratorDevice.MPS,
     ],
     scale=2.0,
     temperature=0.0,
 )

+SMOLVLM256_VLLM = InlineVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
+    prompt="Transcribe this image to plain text.",
+    response_format=ResponseFormat.PLAINTEXT,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
+
 # GraniteVision
 GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
     repo_id="ibm-granite/granite-vision-3.2-2b",
@@ -59,6 +119,18 @@ GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
     temperature=0.0,
 )

+GRANITE_VISION_VLLM = InlineVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.2-2b",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.VLLM,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+
 GRANITE_VISION_OLLAMA = ApiVlmOptions(
     url=AnyUrl("http://localhost:11434/v1/chat/completions"),
     params={"model": "granite3.2-vision:2b"},
@@ -116,6 +188,26 @@ QWEN25_VL_3B_MLX = InlineVlmOptions(
     temperature=0.0,
 )

+# GoT 2.0
+GOT2_TRANSFORMERS = InlineVlmOptions(
+    repo_id="stepfun-ai/GOT-OCR-2.0-hf",
+    prompt="",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_prompt_style=TransformersPromptStyle.NONE,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["<|im_end|>"],
+    extra_processor_kwargs={"format": True},
+)
+
+
 # Gemma-3
 GEMMA3_12B_MLX = InlineVlmOptions(
     repo_id="mlx-community/gemma-3-12b-it-bf16",
@@ -137,8 +229,29 @@ GEMMA3_27B_MLX = InlineVlmOptions(
     temperature=0.0,
 )

+# Dolphin
+
+DOLPHIN_TRANSFORMERS = InlineVlmOptions(
+    repo_id="ByteDance/Dolphin",
+    prompt="<s>Read text in the image. <Answer/>",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    transformers_prompt_style=TransformersPromptStyle.RAW,
+    supported_devices=[
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+

 class VlmModelType(str, Enum):
     SMOLDOCLING = "smoldocling"
+    SMOLDOCLING_VLLM = "smoldocling_vllm"
     GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_VLLM = "granite_vision_vllm"
     GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+    GOT_OCR_2 = "got_ocr_2"
docling/models/base_model.py
CHANGED
@@ -1,13 +1,24 @@
+import logging
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
-from typing import Generic, Optional, Protocol, Type
+from typing import Any, Generic, Optional, Protocol, Type, Union

+import numpy as np
 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
+from PIL.Image import Image
 from typing_extensions import TypeVar

-from docling.datamodel.base_models import
+from docling.datamodel.base_models import (
+    ItemAndImageEnrichmentElement,
+    Page,
+    VlmPrediction,
+)
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import BaseOptions
+from docling.datamodel.pipeline_options_vlm_model import (
+    InlineVlmOptions,
+    TransformersPromptStyle,
+)
 from docling.datamodel.settings import settings


@@ -26,6 +37,88 @@ class BasePageModel(ABC):
         pass


+class BaseVlmModel(ABC):
+    """Base class for Vision-Language Models that adds image processing capability."""
+
+    @abstractmethod
+    def process_images(
+        self,
+        image_batch: Iterable[Union[Image, np.ndarray]],
+        prompt: Union[str, list[str]],
+    ) -> Iterable[VlmPrediction]:
+        """Process raw images without page metadata.
+
+        Args:
+            image_batch: Iterable of PIL Images or numpy arrays
+            prompt: Either:
+                - str: Single prompt used for all images
+                - list[str]: List of prompts (one per image, must match image count)
+
+        Raises:
+            ValueError: If prompt list length doesn't match image count.
+        """
+
+
+class BaseVlmPageModel(BasePageModel, BaseVlmModel):
+    """Base implementation for VLM models that inherit from BasePageModel.
+
+    Provides a default __call__ implementation that extracts images from pages,
+    processes them using process_images, and attaches results back to pages.
+    """
+
+    # Type annotations for attributes that subclasses must initialize
+    vlm_options: InlineVlmOptions
+    processor: Any
+
+    @abstractmethod
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        """Extract images from pages, process them, and attach results back."""
+
+    def formulate_prompt(self, user_prompt: str) -> str:
+        """Formulate a prompt for the VLM."""
+        _log = logging.getLogger(__name__)
+
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+            _log.debug("Using specialized prompt for Phi-4")
+            # Note: This might need adjustment for VLLM vs transformers
+            user_prompt_prefix = "<|user|>"
+            assistant_prompt = "<|assistant|>"
+            prompt_suffix = "<|end|>"
+
+            prompt = f"{user_prompt_prefix}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
+            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
+
+            return prompt
+
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=True
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Unknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
+        )
+
+
 EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)

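To illustrate the contract the new base classes define, here is a toy subclass. This is a sketch only: the class and method signatures come from the diff above, while the `VlmPrediction(text=...)` construction and the pass-through `__call__` are assumptions for illustration, not how the shipped backends behave.

```python
from collections.abc import Iterable
from typing import Union

import numpy as np
from PIL.Image import Image

from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
from docling.models.base_model import BaseVlmPageModel


class EchoVlmModel(BaseVlmPageModel):
    """Toy model: satisfies the abstract interface without running any inference."""

    def __init__(self, vlm_options: InlineVlmOptions):
        self.vlm_options = vlm_options
        self.processor = None  # a real backend would hold a HF processor (used for CHAT prompts)

    def process_images(
        self,
        image_batch: Iterable[Union[Image, np.ndarray]],
        prompt: Union[str, list[str]],
    ) -> Iterable[VlmPrediction]:
        images = list(image_batch)
        prompts = [prompt] * len(images) if isinstance(prompt, str) else list(prompt)
        if len(prompts) != len(images):
            raise ValueError("Number of prompts must match the number of images")
        for _image, text in zip(images, prompts):
            # A real implementation would run the VLM on (_image, formulate_prompt(text)) here.
            yield VlmPrediction(text="")

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        # A real implementation would pull each page image, call process_images,
        # and attach the returned predictions back onto the pages.
        yield from page_batch
```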