docling-2.69.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to its public registry, and is provided for informational purposes only.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
docling/datamodel/pipeline_options_vlm_model.py (new file, @@ -0,0 +1,136 @@):

```python
from enum import Enum
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union

from docling_core.types.doc.page import SegmentedPage
from pydantic import AnyUrl, BaseModel, ConfigDict
from transformers import StoppingCriteria
from typing_extensions import deprecated

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.models.utils.generation_utils import GenerationStopper

if TYPE_CHECKING:
    from docling_core.types.doc.page import SegmentedPage

    from docling.datamodel.base_models import Page


class BaseVlmOptions(BaseModel):
    kind: str
    prompt: str
    scale: float = 2.0
    max_size: Optional[int] = None
    temperature: float = 0.0

    def build_prompt(
        self,
        page: Optional["SegmentedPage"],
        *,
        _internal_page: Optional["Page"] = None,
    ) -> str:
        """Build the prompt for VLM inference.

        Args:
            page: The parsed/segmented page to process.
            _internal_page: Internal parameter for experimental layout-aware pipelines.
                Do not rely on this in user code - subject to change.

        Returns:
            The formatted prompt string.
        """
        return self.prompt

    def decode_response(self, text: str) -> str:
        return text


class ResponseFormat(str, Enum):
    DOCTAGS = "doctags"
    MARKDOWN = "markdown"
    DEEPSEEKOCR_MARKDOWN = "deepseekocr_markdown"
    HTML = "html"
    OTSL = "otsl"
    PLAINTEXT = "plaintext"


class InferenceFramework(str, Enum):
    MLX = "mlx"
    TRANSFORMERS = "transformers"
    VLLM = "vllm"


class TransformersModelType(str, Enum):
    AUTOMODEL = "automodel"
    AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
    AUTOMODEL_CAUSALLM = "automodel-causallm"
    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"


class TransformersPromptStyle(str, Enum):
    CHAT = "chat"
    RAW = "raw"
    NONE = "none"


class InlineVlmOptions(BaseVlmOptions):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    kind: Literal["inline_model_options"] = "inline_model_options"

    repo_id: str
    revision: str = "main"
    trust_remote_code: bool = False
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    inference_framework: InferenceFramework
    transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
    response_format: ResponseFormat

    torch_dtype: Optional[str] = None
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
        AcceleratorDevice.XPU,
    ]

    stop_strings: List[str] = []
    custom_stopping_criteria: List[Union[StoppingCriteria, GenerationStopper]] = []
    extra_generation_config: Dict[str, Any] = {}
    extra_processor_kwargs: Dict[str, Any] = {}

    use_kv_cache: bool = True
    max_new_tokens: int = 4096
    track_generated_tokens: bool = False
    track_input_prompt: bool = False

    @property
    def repo_cache_folder(self) -> str:
        return self.repo_id.replace("/", "--")


@deprecated("Use InlineVlmOptions instead.")
class HuggingFaceVlmOptions(InlineVlmOptions):
    pass


class ApiVlmOptions(BaseVlmOptions):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    kind: Literal["api_model_options"] = "api_model_options"

    url: AnyUrl = AnyUrl(
        "http://localhost:11434/v1/chat/completions"
    )  # Default to ollama
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    timeout: float = 60
    concurrency: int = 1
    response_format: ResponseFormat

    stop_strings: List[str] = []
    custom_stopping_criteria: List[Union[GenerationStopper]] = []
    track_input_prompt: bool = False
```
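As a usage sketch (editorial illustration, not code from the package), the `build_prompt` hook above can be overridden to make prompting depend on the parsed page; the subclass name, kind value, and fallback prompt below are invented for the example:

```python
from typing import Optional

from docling_core.types.doc.page import SegmentedPage

from docling.datamodel.pipeline_options_vlm_model import BaseVlmOptions


class PageAwareVlmOptions(BaseVlmOptions):
    """Hypothetical subclass that varies the prompt by page availability."""

    kind: str = "page_aware_options"
    prompt: str = "Convert this page to markdown."

    def build_prompt(
        self,
        page: Optional[SegmentedPage],
        *,
        _internal_page=None,
    ) -> str:
        # Without a segmented page, fall back to a plain transcription request.
        if page is None:
            return "Transcribe this image to plain text."
        return self.prompt
```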
docling/datamodel/settings.py (new file, @@ -0,0 +1,65 @@):

```python
import sys
from pathlib import Path
from typing import Annotated, Optional, Tuple

from pydantic import BaseModel, PlainValidator
from pydantic_settings import BaseSettings, SettingsConfigDict


def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
    if v[0] < 1 or v[1] < v[0]:
        raise ValueError(
            "Invalid page range: start must be ≥ 1 and end must be ≥ start."
        )
    return v


PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]

DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)


class DocumentLimits(BaseModel):
    max_num_pages: int = sys.maxsize
    max_file_size: int = sys.maxsize
    page_range: PageRange = DEFAULT_PAGE_RANGE


class BatchConcurrencySettings(BaseModel):
    doc_batch_size: int = 1  # Number of documents processed in one batch. Should be >= doc_batch_concurrency
    doc_batch_concurrency: int = 1  # Number of parallel threads processing documents. Warning: Experimental! No benefit expected without free-threaded python.
    page_batch_size: int = 4  # Number of pages processed in one batch.
    page_batch_concurrency: int = 1  # Currently unused.
    elements_batch_size: int = (
        16  # Number of elements processed in one batch, in enrichment models.
    )

    # To force models into single core: export OMP_NUM_THREADS=1


class DebugSettings(BaseModel):
    visualize_cells: bool = False
    visualize_ocr: bool = False
    visualize_layout: bool = False
    visualize_raw_layout: bool = False
    visualize_tables: bool = False

    profile_pipeline_timings: bool = False

    # Path used to output debug information.
    debug_output_path: str = str(Path.cwd() / "debug")


class AppSettings(BaseSettings):
    model_config = SettingsConfigDict(
        env_prefix="DOCLING_", env_nested_delimiter="_", env_nested_max_split=1
    )

    perf: BatchConcurrencySettings = BatchConcurrencySettings()
    debug: DebugSettings = DebugSettings()

    cache_dir: Path = Path.home() / ".cache" / "docling"
    artifacts_path: Optional[Path] = None


settings = AppSettings()
```
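A short sketch (editorial, derived from the `SettingsConfigDict` shown above; the chosen values are arbitrary) of how the `DOCLING_` prefix and `_` nested delimiter translate into environment overrides:

```python
import os

# With env_nested_delimiter="_" and env_nested_max_split=1,
# DOCLING_PERF_PAGE_BATCH_SIZE targets settings.perf.page_batch_size and
# DOCLING_DEBUG_PROFILE_PIPELINE_TIMINGS targets settings.debug.profile_pipeline_timings.
os.environ["DOCLING_PERF_PAGE_BATCH_SIZE"] = "8"
os.environ["DOCLING_DEBUG_PROFILE_PIPELINE_TIMINGS"] = "true"

from docling.datamodel.settings import AppSettings

overridden = AppSettings()  # reads the environment at construction time
assert overridden.perf.page_batch_size == 8
assert overridden.debug.profile_pipeline_timings is True
```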
docling/datamodel/vlm_model_specs.py (new file, @@ -0,0 +1,365 @@):

```python
import logging
from enum import Enum

from pydantic import (
    AnyUrl,
)

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersModelType,
    TransformersPromptStyle,
)

_log = logging.getLogger(__name__)


# Granite-Docling
GRANITEDOCLING_TRANSFORMERS = InlineVlmOptions(
    repo_id="ibm-granite/granite-docling-258M",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.XPU,
    ],
    extra_generation_config=dict(skip_special_tokens=False),
    scale=2.0,
    temperature=0.0,
    max_new_tokens=8192,
    stop_strings=["</doctag>", "<|end_of_text|>"],
)

GRANITEDOCLING_VLLM = GRANITEDOCLING_TRANSFORMERS.model_copy()
GRANITEDOCLING_VLLM.inference_framework = InferenceFramework.VLLM

GRANITEDOCLING_MLX = InlineVlmOptions(
    repo_id="ibm-granite/granite-docling-258M-mlx",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
    max_new_tokens=8192,
    stop_strings=["</doctag>", "<|end_of_text|>"],
)

GRANITEDOCLING_VLLM_API = ApiVlmOptions(
    url="http://localhost:8000/v1/chat/completions",  # LM studio defaults to port 1234, VLLM to 8000
    params=dict(
        model=GRANITEDOCLING_TRANSFORMERS.repo_id,
        max_tokens=4096,
        skip_special_tokens=True,
    ),
    prompt=GRANITEDOCLING_TRANSFORMERS.prompt,
    timeout=90,
    scale=2.0,
    temperature=0.0,
    concurrency=4,
    stop_strings=["</doctag>", "<|end_of_text|>"],
    response_format=ResponseFormat.DOCTAGS,
)

GRANITEDOCLING_OLLAMA = GRANITEDOCLING_VLLM_API.model_copy()
GRANITEDOCLING_OLLAMA.url = AnyUrl("http://localhost:11434/v1/chat/completions")
GRANITEDOCLING_OLLAMA.params["model"] = "ibm/granite-docling:258m"

# SmolDocling
SMOLDOCLING_MLX = InlineVlmOptions(
    repo_id="docling-project/SmolDocling-256M-preview-mlx-bf16",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
    stop_strings=["</doctag>", "<end_of_utterance>"],
)

SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
    repo_id="docling-project/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.XPU,
    ],
    torch_dtype="bfloat16",
    scale=2.0,
    temperature=0.0,
    stop_strings=["</doctag>", "<end_of_utterance>"],
)

SMOLDOCLING_VLLM = InlineVlmOptions(
    repo_id="docling-project/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.VLLM,
    supported_devices=[
        AcceleratorDevice.CUDA,
        AcceleratorDevice.XPU,
    ],
    scale=2.0,
    temperature=0.0,
    stop_strings=["</doctag>", "<end_of_utterance>"],
)

# SmolVLM-256M-Instruct
SMOLVLM256_TRANSFORMERS = InlineVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
    prompt="Transcribe this image to plain text.",
    response_format=ResponseFormat.PLAINTEXT,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        # AcceleratorDevice.MPS,
        AcceleratorDevice.XPU,
    ],
    torch_dtype="bfloat16",
    scale=2.0,
    temperature=0.0,
)

# SmolVLM2-2.2b-Instruct
SMOLVLM256_MLX = InlineVlmOptions(
    repo_id="moot20/SmolVLM-256M-Instruct-MLX",
    prompt="Extract the text.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.MLX,
    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
    supported_devices=[
        AcceleratorDevice.MPS,
    ],
    scale=2.0,
    temperature=0.0,
)

SMOLVLM256_VLLM = InlineVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
    prompt="Transcribe this image to plain text.",
    response_format=ResponseFormat.PLAINTEXT,
    inference_framework=InferenceFramework.VLLM,
    supported_devices=[
        AcceleratorDevice.CUDA,
        AcceleratorDevice.XPU,
    ],
    scale=2.0,
    temperature=0.0,
)


# GraniteVision
GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
    repo_id="ibm-granite/granite-vision-3.2-2b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
        AcceleratorDevice.XPU,
    ],
    scale=2.0,
    temperature=0.0,
)

GRANITE_VISION_VLLM = InlineVlmOptions(
    repo_id="ibm-granite/granite-vision-3.2-2b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.VLLM,
    supported_devices=[
        AcceleratorDevice.CUDA,
        AcceleratorDevice.XPU,
    ],
    scale=2.0,
    temperature=0.0,
)

GRANITE_VISION_OLLAMA = ApiVlmOptions(
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    params={"model": "granite3.2-vision:2b"},
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    scale=1.0,
    timeout=120,
    response_format=ResponseFormat.MARKDOWN,
    temperature=0.0,
)

# Pixtral
PIXTRAL_12B_TRANSFORMERS = InlineVlmOptions(
    repo_id="mistral-community/pixtral-12b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.XPU,
    ],
    scale=2.0,
    temperature=0.0,
)

PIXTRAL_12B_MLX = InlineVlmOptions(
    repo_id="mlx-community/pixtral-12b-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
)

# Phi4
PHI4_TRANSFORMERS = InlineVlmOptions(
    repo_id="microsoft/Phi-4-multimodal-instruct",
    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
    trust_remote_code=True,
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_CAUSALLM,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.XPU,
    ],
    scale=2.0,
    temperature=0.0,
    extra_generation_config=dict(num_logits_to_keep=0),
)

# Qwen
QWEN25_VL_3B_MLX = InlineVlmOptions(
    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
)

# GoT 2.0
GOT2_TRANSFORMERS = InlineVlmOptions(
    repo_id="stepfun-ai/GOT-OCR-2.0-hf",
    prompt="",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_prompt_style=TransformersPromptStyle.NONE,
    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        # AcceleratorDevice.MPS,
        AcceleratorDevice.XPU,
    ],
    scale=2.0,
    temperature=0.0,
    stop_strings=["<|im_end|>"],
    extra_processor_kwargs={"format": True},
)


# Gemma-3
GEMMA3_12B_MLX = InlineVlmOptions(
    repo_id="mlx-community/gemma-3-12b-it-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
)

GEMMA3_27B_MLX = InlineVlmOptions(
    repo_id="mlx-community/gemma-3-27b-it-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
)

# Dolphin

DOLPHIN_TRANSFORMERS = InlineVlmOptions(
    repo_id="ByteDance/Dolphin",
    prompt="<s>Read text in the image. <Answer/>",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
    transformers_prompt_style=TransformersPromptStyle.RAW,
    supported_devices=[
        AcceleratorDevice.CUDA,
        AcceleratorDevice.CPU,
        AcceleratorDevice.MPS,
        AcceleratorDevice.XPU,
    ],
    scale=2.0,
    temperature=0.0,
)

# DeepSeek-OCR
DEEPSEEKOCR_OLLAMA = ApiVlmOptions(
    url="http://localhost:11434/v1/chat/completions",
    params=dict(
        model="deepseek-ocr:3b",
        max_tokens=4096,
        skip_special_tokens=True,
    ),
    prompt="<|grounding|>Convert the document to markdown. ",
    timeout=90,
    scale=2.0,
    temperature=0.0,
    concurrency=4,
    response_format=ResponseFormat.DEEPSEEKOCR_MARKDOWN,
)

# NuExtract
NU_EXTRACT_2B_TRANSFORMERS = InlineVlmOptions(
    repo_id="numind/NuExtract-2.0-2B",
    revision="fe5b2f0b63b81150721435a3ca1129a75c59c74e",  # 489efed leads to MPS issues
    prompt="",  # This won't be used, template is passed separately
    torch_dtype="bfloat16",
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
    response_format=ResponseFormat.PLAINTEXT,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
        AcceleratorDevice.XPU,
    ],
    scale=2.0,
    temperature=0.0,
)


class VlmModelType(str, Enum):
    SMOLDOCLING = "smoldocling"
    SMOLDOCLING_VLLM = "smoldocling_vllm"
    GRANITE_VISION = "granite_vision"
    GRANITE_VISION_VLLM = "granite_vision_vllm"
    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
    GOT_OCR_2 = "got_ocr_2"
    GRANITEDOCLING = "granite_docling"
    GRANITEDOCLING_VLLM = "granite_docling_vllm"
    DEEPSEEKOCR_OLLAMA = "deepseekocr_ollama"
```
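To show how these presets plug into a conversion, here is a minimal sketch following docling's documented VLM-pipeline usage (editorial, not part of the diff; the input filename is hypothetical):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.vlm_model_specs import GRANITEDOCLING_TRANSFORMERS
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Wire one of the presets above into the VLM pipeline.
pipeline_options = VlmPipelineOptions(vlm_options=GRANITEDOCLING_TRANSFORMERS)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("report.pdf")  # hypothetical input document
print(result.document.export_to_markdown())
```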