docling-2.69.0-py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between package versions.

Potentially problematic release.
Files changed (138)
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
docling/datamodel/pipeline_options_vlm_model.py
@@ -0,0 +1,136 @@
+ from enum import Enum
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
+
+ from docling_core.types.doc.page import SegmentedPage
+ from pydantic import AnyUrl, BaseModel, ConfigDict
+ from transformers import StoppingCriteria
+ from typing_extensions import deprecated
+
+ from docling.datamodel.accelerator_options import AcceleratorDevice
+ from docling.models.utils.generation_utils import GenerationStopper
+
+ if TYPE_CHECKING:
+     from docling_core.types.doc.page import SegmentedPage
+
+     from docling.datamodel.base_models import Page
+
+
+ class BaseVlmOptions(BaseModel):
+     kind: str
+     prompt: str
+     scale: float = 2.0
+     max_size: Optional[int] = None
+     temperature: float = 0.0
+
+     def build_prompt(
+         self,
+         page: Optional["SegmentedPage"],
+         *,
+         _internal_page: Optional["Page"] = None,
+     ) -> str:
+         """Build the prompt for VLM inference.
+
+         Args:
+             page: The parsed/segmented page to process.
+             _internal_page: Internal parameter for experimental layout-aware pipelines.
+                 Do not rely on this in user code - subject to change.
+
+         Returns:
+             The formatted prompt string.
+         """
+         return self.prompt
+
+     def decode_response(self, text: str) -> str:
+         return text
+
+
+ class ResponseFormat(str, Enum):
+     DOCTAGS = "doctags"
+     MARKDOWN = "markdown"
+     DEEPSEEKOCR_MARKDOWN = "deepseekocr_markdown"
+     HTML = "html"
+     OTSL = "otsl"
+     PLAINTEXT = "plaintext"
+
+
+ class InferenceFramework(str, Enum):
+     MLX = "mlx"
+     TRANSFORMERS = "transformers"
+     VLLM = "vllm"
+
+
+ class TransformersModelType(str, Enum):
+     AUTOMODEL = "automodel"
+     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
+     AUTOMODEL_CAUSALLM = "automodel-causallm"
+     AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
+
+
+ class TransformersPromptStyle(str, Enum):
+     CHAT = "chat"
+     RAW = "raw"
+     NONE = "none"
+
+
+ class InlineVlmOptions(BaseVlmOptions):
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     kind: Literal["inline_model_options"] = "inline_model_options"
+
+     repo_id: str
+     revision: str = "main"
+     trust_remote_code: bool = False
+     load_in_8bit: bool = True
+     llm_int8_threshold: float = 6.0
+     quantized: bool = False
+
+     inference_framework: InferenceFramework
+     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+     transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
+     response_format: ResponseFormat
+
+     torch_dtype: Optional[str] = None
+     supported_devices: List[AcceleratorDevice] = [
+         AcceleratorDevice.CPU,
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.MPS,
+         AcceleratorDevice.XPU,
+     ]
+
+     stop_strings: List[str] = []
+     custom_stopping_criteria: List[Union[StoppingCriteria, GenerationStopper]] = []
+     extra_generation_config: Dict[str, Any] = {}
+     extra_processor_kwargs: Dict[str, Any] = {}
+
+     use_kv_cache: bool = True
+     max_new_tokens: int = 4096
+     track_generated_tokens: bool = False
+     track_input_prompt: bool = False
+
+     @property
+     def repo_cache_folder(self) -> str:
+         return self.repo_id.replace("/", "--")
+
+
+ @deprecated("Use InlineVlmOptions instead.")
+ class HuggingFaceVlmOptions(InlineVlmOptions):
+     pass
+
+
+ class ApiVlmOptions(BaseVlmOptions):
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     kind: Literal["api_model_options"] = "api_model_options"
+
+     url: AnyUrl = AnyUrl(
+         "http://localhost:11434/v1/chat/completions"
+     )  # Default to ollama
+     headers: Dict[str, str] = {}
+     params: Dict[str, Any] = {}
+     timeout: float = 60
+     concurrency: int = 1
+     response_format: ResponseFormat
+
+     stop_strings: List[str] = []
+     custom_stopping_criteria: List[Union[GenerationStopper]] = []
+     track_input_prompt: bool = False
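
These options classes are plain pydantic models, so callers configure a VLM by instantiating them, and the `build_prompt`/`decode_response` hooks are the intended extension points. A minimal sketch of both, assuming a locally running OpenAI-compatible endpoint; the subclass name and the served model name are illustrative, not part of the package:

from pydantic import AnyUrl

from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat


class StrippedMarkdownVlmOptions(ApiVlmOptions):
    """Hypothetical subclass: post-process the raw VLM response text."""

    def decode_response(self, text: str) -> str:
        # Drop a trailing end-of-text marker before downstream parsing.
        return text.removesuffix("<|end_of_text|>").strip()


my_options = StrippedMarkdownVlmOptions(
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),  # the Ollama default above
    params={"model": "granite3.2-vision:2b"},  # any model served by the endpoint
    prompt="Convert this page to markdown.",
    timeout=120,
    response_format=ResponseFormat.MARKDOWN,
)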
docling/datamodel/settings.py
@@ -0,0 +1,65 @@
+ import sys
+ from pathlib import Path
+ from typing import Annotated, Optional, Tuple
+
+ from pydantic import BaseModel, PlainValidator
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
+     if v[0] < 1 or v[1] < v[0]:
+         raise ValueError(
+             "Invalid page range: start must be ≥ 1 and end must be ≥ start."
+         )
+     return v
+
+
+ PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
+
+ DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
+
+
+ class DocumentLimits(BaseModel):
+     max_num_pages: int = sys.maxsize
+     max_file_size: int = sys.maxsize
+     page_range: PageRange = DEFAULT_PAGE_RANGE
+
+
+ class BatchConcurrencySettings(BaseModel):
+     doc_batch_size: int = 1  # Number of documents processed in one batch. Should be >= doc_batch_concurrency
+     doc_batch_concurrency: int = 1  # Number of parallel threads processing documents. Warning: Experimental! No benefit expected without free-threaded python.
+     page_batch_size: int = 4  # Number of pages processed in one batch.
+     page_batch_concurrency: int = 1  # Currently unused.
+     elements_batch_size: int = (
+         16  # Number of elements processed in one batch, in enrichment models.
+     )
+
+     # To force models into single core: export OMP_NUM_THREADS=1
+
+
+ class DebugSettings(BaseModel):
+     visualize_cells: bool = False
+     visualize_ocr: bool = False
+     visualize_layout: bool = False
+     visualize_raw_layout: bool = False
+     visualize_tables: bool = False
+
+     profile_pipeline_timings: bool = False
+
+     # Path used to output debug information.
+     debug_output_path: str = str(Path.cwd() / "debug")
+
+
+ class AppSettings(BaseSettings):
+     model_config = SettingsConfigDict(
+         env_prefix="DOCLING_", env_nested_delimiter="_", env_nested_max_split=1
+     )
+
+     perf: BatchConcurrencySettings = BatchConcurrencySettings()
+     debug: DebugSettings = DebugSettings()
+
+     cache_dir: Path = Path.home() / ".cache" / "docling"
+     artifacts_path: Optional[Path] = None
+
+
+ settings = AppSettings()
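
Given `env_prefix="DOCLING_"`, `env_nested_delimiter="_"`, and `env_nested_max_split=1`, AppSettings resolves an environment variable by splitting at most once after the prefix: the first token selects the nested model (`perf` or `debug`), and the remainder names a field on it. A minimal sketch of how overrides map onto the models above (the values are illustrative):

import os

# Hypothetical overrides: DOCLING_PERF_PAGE_BATCH_SIZE splits into
# "PERF" + "PAGE_BATCH_SIZE" -> settings.perf.page_batch_size.
os.environ["DOCLING_PERF_PAGE_BATCH_SIZE"] = "8"
os.environ["DOCLING_DEBUG_VISUALIZE_LAYOUT"] = "true"

from docling.datamodel.settings import AppSettings

fresh = AppSettings()  # the module-level `settings` is built once at import time
assert fresh.perf.page_batch_size == 8
assert fresh.debug.visualize_layout is True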
docling/datamodel/vlm_model_specs.py
@@ -0,0 +1,365 @@
+ import logging
+ from enum import Enum
+
+ from pydantic import (
+     AnyUrl,
+ )
+
+ from docling.datamodel.accelerator_options import AcceleratorDevice
+ from docling.datamodel.pipeline_options_vlm_model import (
+     ApiVlmOptions,
+     InferenceFramework,
+     InlineVlmOptions,
+     ResponseFormat,
+     TransformersModelType,
+     TransformersPromptStyle,
+ )
+
+ _log = logging.getLogger(__name__)
+
+
+ # Granite-Docling
+ GRANITEDOCLING_TRANSFORMERS = InlineVlmOptions(
+     repo_id="ibm-granite/granite-docling-258M",
+     prompt="Convert this page to docling.",
+     response_format=ResponseFormat.DOCTAGS,
+     inference_framework=InferenceFramework.TRANSFORMERS,
+     transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+     supported_devices=[
+         AcceleratorDevice.CPU,
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.XPU,
+     ],
+     extra_generation_config=dict(skip_special_tokens=False),
+     scale=2.0,
+     temperature=0.0,
+     max_new_tokens=8192,
+     stop_strings=["</doctag>", "<|end_of_text|>"],
+ )
+
+ GRANITEDOCLING_VLLM = GRANITEDOCLING_TRANSFORMERS.model_copy()
+ GRANITEDOCLING_VLLM.inference_framework = InferenceFramework.VLLM
+
+ GRANITEDOCLING_MLX = InlineVlmOptions(
+     repo_id="ibm-granite/granite-docling-258M-mlx",
+     prompt="Convert this page to docling.",
+     response_format=ResponseFormat.DOCTAGS,
+     inference_framework=InferenceFramework.MLX,
+     supported_devices=[AcceleratorDevice.MPS],
+     scale=2.0,
+     temperature=0.0,
+     max_new_tokens=8192,
+     stop_strings=["</doctag>", "<|end_of_text|>"],
+ )
+
+ GRANITEDOCLING_VLLM_API = ApiVlmOptions(
+     url="http://localhost:8000/v1/chat/completions",  # LM studio defaults to port 1234, VLLM to 8000
+     params=dict(
+         model=GRANITEDOCLING_TRANSFORMERS.repo_id,
+         max_tokens=4096,
+         skip_special_tokens=True,
+     ),
+     prompt=GRANITEDOCLING_TRANSFORMERS.prompt,
+     timeout=90,
+     scale=2.0,
+     temperature=0.0,
+     concurrency=4,
+     stop_strings=["</doctag>", "<|end_of_text|>"],
+     response_format=ResponseFormat.DOCTAGS,
+ )
+
+ GRANITEDOCLING_OLLAMA = GRANITEDOCLING_VLLM_API.model_copy()
+ GRANITEDOCLING_OLLAMA.url = AnyUrl("http://localhost:11434/v1/chat/completions")
+ GRANITEDOCLING_OLLAMA.params["model"] = "ibm/granite-docling:258m"
+
+ # SmolDocling
+ SMOLDOCLING_MLX = InlineVlmOptions(
+     repo_id="docling-project/SmolDocling-256M-preview-mlx-bf16",
+     prompt="Convert this page to docling.",
+     response_format=ResponseFormat.DOCTAGS,
+     inference_framework=InferenceFramework.MLX,
+     supported_devices=[AcceleratorDevice.MPS],
+     scale=2.0,
+     temperature=0.0,
+     stop_strings=["</doctag>", "<end_of_utterance>"],
+ )
+
+ SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
+     repo_id="docling-project/SmolDocling-256M-preview",
+     prompt="Convert this page to docling.",
+     response_format=ResponseFormat.DOCTAGS,
+     inference_framework=InferenceFramework.TRANSFORMERS,
+     transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+     supported_devices=[
+         AcceleratorDevice.CPU,
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.XPU,
+     ],
+     torch_dtype="bfloat16",
+     scale=2.0,
+     temperature=0.0,
+     stop_strings=["</doctag>", "<end_of_utterance>"],
+ )
+
+ SMOLDOCLING_VLLM = InlineVlmOptions(
+     repo_id="docling-project/SmolDocling-256M-preview",
+     prompt="Convert this page to docling.",
+     response_format=ResponseFormat.DOCTAGS,
+     inference_framework=InferenceFramework.VLLM,
+     supported_devices=[
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.XPU,
+     ],
+     scale=2.0,
+     temperature=0.0,
+     stop_strings=["</doctag>", "<end_of_utterance>"],
+ )
+
+ # SmolVLM-256M-Instruct
+ SMOLVLM256_TRANSFORMERS = InlineVlmOptions(
+     repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
+     prompt="Transcribe this image to plain text.",
+     response_format=ResponseFormat.PLAINTEXT,
+     inference_framework=InferenceFramework.TRANSFORMERS,
+     transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+     supported_devices=[
+         AcceleratorDevice.CPU,
+         AcceleratorDevice.CUDA,
+         # AcceleratorDevice.MPS,
+         AcceleratorDevice.XPU,
+     ],
+     torch_dtype="bfloat16",
+     scale=2.0,
+     temperature=0.0,
+ )
+
+ # SmolVLM-256M-Instruct (MLX)
+ SMOLVLM256_MLX = InlineVlmOptions(
+     repo_id="moot20/SmolVLM-256M-Instruct-MLX",
+     prompt="Extract the text.",
+     response_format=ResponseFormat.DOCTAGS,
+     inference_framework=InferenceFramework.MLX,
+     transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+     supported_devices=[
+         AcceleratorDevice.MPS,
+     ],
+     scale=2.0,
+     temperature=0.0,
+ )
+
+ SMOLVLM256_VLLM = InlineVlmOptions(
+     repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
+     prompt="Transcribe this image to plain text.",
+     response_format=ResponseFormat.PLAINTEXT,
+     inference_framework=InferenceFramework.VLLM,
+     supported_devices=[
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.XPU,
+     ],
+     scale=2.0,
+     temperature=0.0,
+ )
+
+
+ # GraniteVision
+ GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
+     repo_id="ibm-granite/granite-vision-3.2-2b",
+     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+     response_format=ResponseFormat.MARKDOWN,
+     inference_framework=InferenceFramework.TRANSFORMERS,
+     transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+     supported_devices=[
+         AcceleratorDevice.CPU,
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.MPS,
+         AcceleratorDevice.XPU,
+     ],
+     scale=2.0,
+     temperature=0.0,
+ )
+
+ GRANITE_VISION_VLLM = InlineVlmOptions(
+     repo_id="ibm-granite/granite-vision-3.2-2b",
+     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+     response_format=ResponseFormat.MARKDOWN,
+     inference_framework=InferenceFramework.VLLM,
+     supported_devices=[
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.XPU,
+     ],
+     scale=2.0,
+     temperature=0.0,
+ )
+
+ GRANITE_VISION_OLLAMA = ApiVlmOptions(
+     url=AnyUrl("http://localhost:11434/v1/chat/completions"),
+     params={"model": "granite3.2-vision:2b"},
+     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+     scale=1.0,
+     timeout=120,
+     response_format=ResponseFormat.MARKDOWN,
+     temperature=0.0,
+ )
+
+ # Pixtral
+ PIXTRAL_12B_TRANSFORMERS = InlineVlmOptions(
+     repo_id="mistral-community/pixtral-12b",
+     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+     response_format=ResponseFormat.MARKDOWN,
+     inference_framework=InferenceFramework.TRANSFORMERS,
+     transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+     supported_devices=[
+         AcceleratorDevice.CPU,
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.XPU,
+     ],
+     scale=2.0,
+     temperature=0.0,
+ )
+
+ PIXTRAL_12B_MLX = InlineVlmOptions(
+     repo_id="mlx-community/pixtral-12b-bf16",
+     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+     response_format=ResponseFormat.MARKDOWN,
+     inference_framework=InferenceFramework.MLX,
+     supported_devices=[AcceleratorDevice.MPS],
+     scale=2.0,
+     temperature=0.0,
+ )
+
+ # Phi4
+ PHI4_TRANSFORMERS = InlineVlmOptions(
+     repo_id="microsoft/Phi-4-multimodal-instruct",
+     prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
+     trust_remote_code=True,
+     response_format=ResponseFormat.MARKDOWN,
+     inference_framework=InferenceFramework.TRANSFORMERS,
+     transformers_model_type=TransformersModelType.AUTOMODEL_CAUSALLM,
+     supported_devices=[
+         AcceleratorDevice.CPU,
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.XPU,
+     ],
+     scale=2.0,
+     temperature=0.0,
+     extra_generation_config=dict(num_logits_to_keep=0),
+ )
+
+ # Qwen
+ QWEN25_VL_3B_MLX = InlineVlmOptions(
+     repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
+     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+     response_format=ResponseFormat.MARKDOWN,
+     inference_framework=InferenceFramework.MLX,
+     supported_devices=[AcceleratorDevice.MPS],
+     scale=2.0,
+     temperature=0.0,
+ )
+
+ # GoT 2.0
+ GOT2_TRANSFORMERS = InlineVlmOptions(
+     repo_id="stepfun-ai/GOT-OCR-2.0-hf",
+     prompt="",
+     response_format=ResponseFormat.MARKDOWN,
+     inference_framework=InferenceFramework.TRANSFORMERS,
+     transformers_prompt_style=TransformersPromptStyle.NONE,
+     transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+     supported_devices=[
+         AcceleratorDevice.CPU,
+         AcceleratorDevice.CUDA,
+         # AcceleratorDevice.MPS,
+         AcceleratorDevice.XPU,
+     ],
+     scale=2.0,
+     temperature=0.0,
+     stop_strings=["<|im_end|>"],
+     extra_processor_kwargs={"format": True},
+ )
+
+
+ # Gemma-3
+ GEMMA3_12B_MLX = InlineVlmOptions(
+     repo_id="mlx-community/gemma-3-12b-it-bf16",
+     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+     response_format=ResponseFormat.MARKDOWN,
+     inference_framework=InferenceFramework.MLX,
+     supported_devices=[AcceleratorDevice.MPS],
+     scale=2.0,
+     temperature=0.0,
+ )
+
+ GEMMA3_27B_MLX = InlineVlmOptions(
+     repo_id="mlx-community/gemma-3-27b-it-bf16",
+     prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+     response_format=ResponseFormat.MARKDOWN,
+     inference_framework=InferenceFramework.MLX,
+     supported_devices=[AcceleratorDevice.MPS],
+     scale=2.0,
+     temperature=0.0,
+ )
+
+ # Dolphin
+
+ DOLPHIN_TRANSFORMERS = InlineVlmOptions(
+     repo_id="ByteDance/Dolphin",
+     prompt="<s>Read text in the image. <Answer/>",
+     response_format=ResponseFormat.MARKDOWN,
+     inference_framework=InferenceFramework.TRANSFORMERS,
+     transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+     transformers_prompt_style=TransformersPromptStyle.RAW,
+     supported_devices=[
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.CPU,
+         AcceleratorDevice.MPS,
+         AcceleratorDevice.XPU,
+     ],
+     scale=2.0,
+     temperature=0.0,
+ )
+
+ # DeepSeek-OCR
+ DEEPSEEKOCR_OLLAMA = ApiVlmOptions(
+     url="http://localhost:11434/v1/chat/completions",
+     params=dict(
+         model="deepseek-ocr:3b",
+         max_tokens=4096,
+         skip_special_tokens=True,
+     ),
+     prompt="<|grounding|>Convert the document to markdown. ",
+     timeout=90,
+     scale=2.0,
+     temperature=0.0,
+     concurrency=4,
+     response_format=ResponseFormat.DEEPSEEKOCR_MARKDOWN,
+ )
+
+ # NuExtract
+ NU_EXTRACT_2B_TRANSFORMERS = InlineVlmOptions(
+     repo_id="numind/NuExtract-2.0-2B",
+     revision="fe5b2f0b63b81150721435a3ca1129a75c59c74e",  # 489efed leads to MPS issues
+     prompt="",  # This won't be used, template is passed separately
+     torch_dtype="bfloat16",
+     inference_framework=InferenceFramework.TRANSFORMERS,
+     transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+     response_format=ResponseFormat.PLAINTEXT,
+     supported_devices=[
+         AcceleratorDevice.CPU,
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.MPS,
+         AcceleratorDevice.XPU,
+     ],
+     scale=2.0,
+     temperature=0.0,
+ )
+
+
+ class VlmModelType(str, Enum):
+     SMOLDOCLING = "smoldocling"
+     SMOLDOCLING_VLLM = "smoldocling_vllm"
+     GRANITE_VISION = "granite_vision"
+     GRANITE_VISION_VLLM = "granite_vision_vllm"
+     GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+     GOT_OCR_2 = "got_ocr_2"
+     GRANITEDOCLING = "granite_docling"
+     GRANITEDOCLING_VLLM = "granite_docling_vllm"
+     DEEPSEEKOCR_OLLAMA = "deepseekocr_ollama"
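
These presets are consumed through `VlmPipelineOptions.vlm_options`. A minimal usage sketch, assuming the converter API from `docling.document_converter` and the VLM pipeline from `docling.pipeline.vlm_pipeline` shown in the file list above (exact argument spellings may vary across docling versions):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.vlm_model_specs import GRANITEDOCLING_TRANSFORMERS
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Select one of the presets defined in this file, e.g. the transformers-based
# Granite-Docling configuration, and route PDFs through the VLM pipeline.
pipeline_options = VlmPipelineOptions(vlm_options=GRANITEDOCLING_TRANSFORMERS)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

doc = converter.convert("document.pdf").document  # path is illustrative
print(doc.export_to_markdown())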