docling-2.34.0-py3-none-any.whl → docling-2.36.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. docling/backend/xml/jats_backend.py +0 -0
  2. docling/cli/main.py +48 -18
  3. docling/datamodel/accelerator_options.py +68 -0
  4. docling/datamodel/base_models.py +10 -8
  5. docling/datamodel/document.py +7 -2
  6. docling/datamodel/pipeline_options.py +29 -161
  7. docling/datamodel/pipeline_options_vlm_model.py +81 -0
  8. docling/datamodel/vlm_model_specs.py +144 -0
  9. docling/document_converter.py +5 -0
  10. docling/models/api_vlm_model.py +1 -1
  11. docling/models/base_ocr_model.py +2 -1
  12. docling/models/code_formula_model.py +6 -11
  13. docling/models/document_picture_classifier.py +6 -11
  14. docling/models/easyocr_model.py +1 -2
  15. docling/models/layout_model.py +22 -17
  16. docling/models/ocr_mac_model.py +1 -1
  17. docling/models/page_preprocessing_model.py +11 -6
  18. docling/models/picture_description_api_model.py +1 -1
  19. docling/models/picture_description_base_model.py +1 -1
  20. docling/models/picture_description_vlm_model.py +7 -22
  21. docling/models/rapid_ocr_model.py +1 -2
  22. docling/models/table_structure_model.py +6 -12
  23. docling/models/tesseract_ocr_cli_model.py +1 -1
  24. docling/models/tesseract_ocr_model.py +1 -1
  25. docling/models/utils/__init__.py +0 -0
  26. docling/models/utils/hf_model_download.py +40 -0
  27. docling/models/vlm_models_inline/__init__.py +0 -0
  28. docling/models/vlm_models_inline/hf_transformers_model.py +194 -0
  29. docling/models/{hf_mlx_model.py → vlm_models_inline/mlx_model.py} +56 -44
  30. docling/pipeline/standard_pdf_pipeline.py +69 -57
  31. docling/pipeline/vlm_pipeline.py +228 -61
  32. docling/utils/accelerator_utils.py +17 -2
  33. docling/utils/model_downloader.py +13 -12
  34. {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/METADATA +54 -55
  35. {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/RECORD +48 -41
  36. {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/WHEEL +2 -1
  37. docling-2.36.0.dist-info/entry_points.txt +6 -0
  38. docling-2.36.0.dist-info/top_level.txt +1 -0
  39. docling/models/hf_vlm_model.py +0 -182
  40. docling-2.34.0.dist-info/entry_points.txt +0 -7
  41. {docling-2.34.0.dist-info → docling-2.36.0.dist-info/licenses}/LICENSE +0 -0
docling/backend/xml/jats_backend.py CHANGED
File without changes
docling/cli/main.py CHANGED
@@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
12
12
 
13
13
  import rich.table
14
14
  import typer
15
+ from docling_core.transforms.serializer.html import (
16
+ HTMLDocSerializer,
17
+ HTMLOutputStyle,
18
+ HTMLParams,
19
+ )
20
+ from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
15
21
  from docling_core.types.doc import ImageRefMode
16
22
  from docling_core.utils.file import resolve_source_to_path
17
23
  from pydantic import TypeAdapter
@@ -22,6 +28,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBacke
22
28
  from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
23
29
  from docling.backend.pdf_backend import PdfDocumentBackend
24
30
  from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
31
+ from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
25
32
  from docling.datamodel.base_models import (
26
33
  ConversionStatus,
27
34
  FormatToExtensions,
@@ -30,8 +37,6 @@ from docling.datamodel.base_models import (
30
37
  )
31
38
  from docling.datamodel.document import ConversionResult
32
39
  from docling.datamodel.pipeline_options import (
33
- AcceleratorDevice,
34
- AcceleratorOptions,
35
40
  EasyOcrOptions,
36
41
  OcrOptions,
37
42
  PaginatedPipelineOptions,
@@ -39,14 +44,16 @@ from docling.datamodel.pipeline_options import (
39
44
  PdfPipeline,
40
45
  PdfPipelineOptions,
41
46
  TableFormerMode,
42
- VlmModelType,
43
47
  VlmPipelineOptions,
44
- granite_vision_vlm_conversion_options,
45
- granite_vision_vlm_ollama_conversion_options,
46
- smoldocling_vlm_conversion_options,
47
- smoldocling_vlm_mlx_conversion_options,
48
48
  )
49
49
  from docling.datamodel.settings import settings
50
+ from docling.datamodel.vlm_model_specs import (
51
+ GRANITE_VISION_OLLAMA,
52
+ GRANITE_VISION_TRANSFORMERS,
53
+ SMOLDOCLING_MLX,
54
+ SMOLDOCLING_TRANSFORMERS,
55
+ VlmModelType,
56
+ )
50
57
  from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
51
58
  from docling.models.factories import get_ocr_factory
52
59
  from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -156,6 +163,7 @@ def export_documents(
156
163
  export_json: bool,
157
164
  export_html: bool,
158
165
  export_html_split_page: bool,
166
+ show_layout: bool,
159
167
  export_md: bool,
160
168
  export_txt: bool,
161
169
  export_doctags: bool,
@@ -189,9 +197,27 @@ def export_documents(
189
197
  if export_html_split_page:
190
198
  fname = output_dir / f"{doc_filename}.html"
191
199
  _log.info(f"writing HTML output to {fname}")
192
- conv_res.document.save_as_html(
193
- filename=fname, image_mode=image_export_mode, split_page_view=True
194
- )
200
+ if show_layout:
201
+ ser = HTMLDocSerializer(
202
+ doc=conv_res.document,
203
+ params=HTMLParams(
204
+ image_mode=image_export_mode,
205
+ output_style=HTMLOutputStyle.SPLIT_PAGE,
206
+ ),
207
+ )
208
+ visualizer = LayoutVisualizer()
209
+ visualizer.params.show_label = False
210
+ ser_res = ser.serialize(
211
+ visualizer=visualizer,
212
+ )
213
+ with open(fname, "w") as fw:
214
+ fw.write(ser_res.text)
215
+ else:
216
+ conv_res.document.save_as_html(
217
+ filename=fname,
218
+ image_mode=image_export_mode,
219
+ split_page_view=True,
220
+ )
195
221
 
196
222
  # Export Text format:
197
223
  if export_txt:
@@ -250,6 +276,13 @@ def convert( # noqa: C901
250
276
  to_formats: List[OutputFormat] = typer.Option(
251
277
  None, "--to", help="Specify output formats. Defaults to Markdown."
252
278
  ),
279
+ show_layout: Annotated[
280
+ bool,
281
+ typer.Option(
282
+ ...,
283
+ help="If enabled, the page images will show the bounding-boxes of the items.",
284
+ ),
285
+ ] = False,
253
286
  headers: str = typer.Option(
254
287
  None,
255
288
  "--headers",
@@ -547,20 +580,16 @@ def convert( # noqa: C901
547
580
  )
548
581
 
549
582
  if vlm_model == VlmModelType.GRANITE_VISION:
550
- pipeline_options.vlm_options = granite_vision_vlm_conversion_options
583
+ pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
551
584
  elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
552
- pipeline_options.vlm_options = (
553
- granite_vision_vlm_ollama_conversion_options
554
- )
585
+ pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
555
586
  elif vlm_model == VlmModelType.SMOLDOCLING:
556
- pipeline_options.vlm_options = smoldocling_vlm_conversion_options
587
+ pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
557
588
  if sys.platform == "darwin":
558
589
  try:
559
590
  import mlx_vlm
560
591
 
561
- pipeline_options.vlm_options = (
562
- smoldocling_vlm_mlx_conversion_options
563
- )
592
+ pipeline_options.vlm_options = SMOLDOCLING_MLX
564
593
  except ImportError:
565
594
  _log.warning(
566
595
  "To run SmolDocling faster, please install mlx-vlm:\n"
@@ -596,6 +625,7 @@ def convert( # noqa: C901
596
625
  export_json=export_json,
597
626
  export_html=export_html,
598
627
  export_html_split_page=export_html_split_page,
628
+ show_layout=show_layout,
599
629
  export_md=export_md,
600
630
  export_txt=export_txt,
601
631
  export_doctags=export_doctags,
docling/datamodel/accelerator_options.py ADDED
@@ -0,0 +1,68 @@
1
+ import logging
2
+ import os
3
+ import re
4
+ from enum import Enum
5
+ from typing import Any, Union
6
+
7
+ from pydantic import field_validator, model_validator
8
+ from pydantic_settings import BaseSettings, SettingsConfigDict
9
+
10
+ _log = logging.getLogger(__name__)
11
+
12
+
13
+ class AcceleratorDevice(str, Enum):
14
+ """Devices to run model inference"""
15
+
16
+ AUTO = "auto"
17
+ CPU = "cpu"
18
+ CUDA = "cuda"
19
+ MPS = "mps"
20
+
21
+
22
+ class AcceleratorOptions(BaseSettings):
23
+ model_config = SettingsConfigDict(
24
+ env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
25
+ )
26
+
27
+ num_threads: int = 4
28
+ device: Union[str, AcceleratorDevice] = "auto"
29
+ cuda_use_flash_attention2: bool = False
30
+
31
+ @field_validator("device")
32
+ def validate_device(cls, value):
33
+ # "auto", "cpu", "cuda", "mps", or "cuda:N"
34
+ if value in {d.value for d in AcceleratorDevice} or re.match(
35
+ r"^cuda(:\d+)?$", value
36
+ ):
37
+ return value
38
+ raise ValueError(
39
+ "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
40
+ )
41
+
42
+ @model_validator(mode="before")
43
+ @classmethod
44
+ def check_alternative_envvars(cls, data: Any) -> Any:
45
+ r"""
46
+ Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
47
+ The alternative envvar is used only if it is valid and the regular envvar is not set.
48
+
49
+ Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
50
+ the same functionality. In case the alias envvar is set and the user tries to override the
51
+ parameter in settings initialization, Pydantic treats the parameter provided in __init__()
52
+ as an extra input instead of simply overwriting the evvar value for that parameter.
53
+ """
54
+ if isinstance(data, dict):
55
+ input_num_threads = data.get("num_threads")
56
+ # Check if to set the num_threads from the alternative envvar
57
+ if input_num_threads is None:
58
+ docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
59
+ omp_num_threads = os.getenv("OMP_NUM_THREADS")
60
+ if docling_num_threads is None and omp_num_threads is not None:
61
+ try:
62
+ data["num_threads"] = int(omp_num_threads)
63
+ except ValueError:
64
+ _log.error(
65
+ "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
66
+ omp_num_threads,
67
+ )
68
+ return data
docling/datamodel/base_models.py CHANGED
@@ -13,11 +13,11 @@ from docling_core.types.doc import (
13
13
  TableCell,
14
14
  )
15
15
  from docling_core.types.doc.page import SegmentedPdfPage, TextCell
16
-
17
- # DO NOT REMOVE; explicitly exposed from this location
18
16
  from docling_core.types.io import (
19
17
  DocumentStream,
20
18
  )
19
+
20
+ # DO NOT REMOVE; explicitly exposed from this location
21
21
  from PIL.Image import Image
22
22
  from pydantic import BaseModel, ConfigDict, Field, computed_field
23
23
 
@@ -131,12 +131,6 @@ class ErrorItem(BaseModel):
131
131
  error_message: str
132
132
 
133
133
 
134
- # class Cell(BaseModel):
135
- # id: int
136
- # text: str
137
- # bbox: BoundingBox
138
-
139
-
140
134
  class Cluster(BaseModel):
141
135
  id: int
142
136
  label: DocItemLabel
@@ -158,8 +152,16 @@ class LayoutPrediction(BaseModel):
158
152
  clusters: List[Cluster] = []
159
153
 
160
154
 
155
+ class VlmPredictionToken(BaseModel):
156
+ text: str = ""
157
+ token: int = -1
158
+ logprob: float = -1
159
+
160
+
161
161
  class VlmPrediction(BaseModel):
162
162
  text: str = ""
163
+ generated_tokens: list[VlmPredictionToken] = []
164
+ generation_time: float = -1
163
165
 
164
166
 
165
167
  class ContainerElement(
docling/datamodel/document.py CHANGED
@@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
334
334
  ) -> Optional[InputFormat]:
335
335
  """Guess the input format of a document by checking part of its content."""
336
336
  input_format: Optional[InputFormat] = None
337
- content_str = content.decode("utf-8")
338
337
 
339
338
  if mime == "application/xml":
339
+ content_str = content.decode("utf-8")
340
340
  match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
341
341
  if match_doctype:
342
342
  xml_doctype = match_doctype.group()
@@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
358
358
  input_format = InputFormat.XML_JATS
359
359
 
360
360
  elif mime == "text/plain":
361
+ content_str = content.decode("utf-8")
361
362
  if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
362
363
  input_format = InputFormat.XML_USPTO
363
364
 
@@ -411,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
411
412
  else:
412
413
  return "application/xml"
413
414
 
414
- if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
415
+ if re.match(
416
+ r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
417
+ content_str,
418
+ re.DOTALL,
419
+ ):
415
420
  return "text/html"
416
421
 
417
422
  p = re.compile(
docling/datamodel/pipeline_options.py CHANGED
@@ -1,6 +1,4 @@
1
1
  import logging
2
- import os
3
- import re
4
2
  from enum import Enum
5
3
  from pathlib import Path
6
4
  from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -10,71 +8,26 @@ from pydantic import (
10
8
  BaseModel,
11
9
  ConfigDict,
12
10
  Field,
13
- field_validator,
14
- model_validator,
15
11
  )
16
- from pydantic_settings import BaseSettings, SettingsConfigDict
17
12
  from typing_extensions import deprecated
18
13
 
19
- _log = logging.getLogger(__name__)
20
-
21
-
22
- class AcceleratorDevice(str, Enum):
23
- """Devices to run model inference"""
24
-
25
- AUTO = "auto"
26
- CPU = "cpu"
27
- CUDA = "cuda"
28
- MPS = "mps"
29
-
30
-
31
- class AcceleratorOptions(BaseSettings):
32
- model_config = SettingsConfigDict(
33
- env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
34
- )
14
+ # Import the following for backwards compatibility
15
+ from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
16
+ from docling.datamodel.pipeline_options_vlm_model import (
17
+ ApiVlmOptions,
18
+ InferenceFramework,
19
+ InlineVlmOptions,
20
+ ResponseFormat,
21
+ )
22
+ from docling.datamodel.vlm_model_specs import (
23
+ GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
24
+ GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
25
+ SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
26
+ SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
27
+ VlmModelType,
28
+ )
35
29
 
36
- num_threads: int = 4
37
- device: Union[str, AcceleratorDevice] = "auto"
38
- cuda_use_flash_attention2: bool = False
39
-
40
- @field_validator("device")
41
- def validate_device(cls, value):
42
- # "auto", "cpu", "cuda", "mps", or "cuda:N"
43
- if value in {d.value for d in AcceleratorDevice} or re.match(
44
- r"^cuda(:\d+)?$", value
45
- ):
46
- return value
47
- raise ValueError(
48
- "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
49
- )
50
-
51
- @model_validator(mode="before")
52
- @classmethod
53
- def check_alternative_envvars(cls, data: Any) -> Any:
54
- r"""
55
- Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
56
- The alternative envvar is used only if it is valid and the regular envvar is not set.
57
-
58
- Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
59
- the same functionality. In case the alias envvar is set and the user tries to override the
60
- parameter in settings initialization, Pydantic treats the parameter provided in __init__()
61
- as an extra input instead of simply overwriting the evvar value for that parameter.
62
- """
63
- if isinstance(data, dict):
64
- input_num_threads = data.get("num_threads")
65
- # Check if to set the num_threads from the alternative envvar
66
- if input_num_threads is None:
67
- docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
68
- omp_num_threads = os.getenv("OMP_NUM_THREADS")
69
- if docling_num_threads is None and omp_num_threads is not None:
70
- try:
71
- data["num_threads"] = int(omp_num_threads)
72
- except ValueError:
73
- _log.error(
74
- "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
75
- omp_num_threads,
76
- )
77
- return data
30
+ _log = logging.getLogger(__name__)
78
31
 
79
32
 
80
33
  class BaseOptions(BaseModel):
@@ -121,24 +74,22 @@ class RapidOcrOptions(OcrOptions):
121
74
  lang: List[str] = [
122
75
  "english",
123
76
  "chinese",
124
- ] # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
125
- # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
77
+ ]
78
+ # However, language as a parameter is not supported by rapidocr yet
79
+ # and hence changing this options doesn't affect anything.
80
+
81
+ # For more details on supported languages by RapidOCR visit
82
+ # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
83
+
84
+ # For more details on the following options visit
85
+ # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
126
86
 
127
- # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
128
87
  text_score: float = 0.5 # same default as rapidocr
129
88
 
130
89
  use_det: Optional[bool] = None # same default as rapidocr
131
90
  use_cls: Optional[bool] = None # same default as rapidocr
132
91
  use_rec: Optional[bool] = None # same default as rapidocr
133
92
 
134
- # class Device(Enum):
135
- # CPU = "CPU"
136
- # CUDA = "CUDA"
137
- # DIRECTML = "DIRECTML"
138
- # AUTO = "AUTO"
139
-
140
- # device: Device = Device.AUTO # Default value is AUTO
141
-
142
93
  print_verbose: bool = False # same default as rapidocr
143
94
 
144
95
  det_model_path: Optional[str] = None # same default as rapidocr
@@ -244,101 +195,18 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
244
195
  return self.repo_id.replace("/", "--")
245
196
 
246
197
 
198
+ # SmolVLM
247
199
  smolvlm_picture_description = PictureDescriptionVlmOptions(
248
200
  repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
249
201
  )
250
- # phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
202
+
203
+ # GraniteVision
251
204
  granite_picture_description = PictureDescriptionVlmOptions(
252
205
  repo_id="ibm-granite/granite-vision-3.1-2b-preview",
253
206
  prompt="What is shown in this image?",
254
207
  )
255
208
 
256
209
 
257
- class BaseVlmOptions(BaseModel):
258
- kind: str
259
- prompt: str
260
-
261
-
262
- class ResponseFormat(str, Enum):
263
- DOCTAGS = "doctags"
264
- MARKDOWN = "markdown"
265
-
266
-
267
- class InferenceFramework(str, Enum):
268
- MLX = "mlx"
269
- TRANSFORMERS = "transformers"
270
- OPENAI = "openai"
271
-
272
-
273
- class HuggingFaceVlmOptions(BaseVlmOptions):
274
- kind: Literal["hf_model_options"] = "hf_model_options"
275
-
276
- repo_id: str
277
- load_in_8bit: bool = True
278
- llm_int8_threshold: float = 6.0
279
- quantized: bool = False
280
-
281
- inference_framework: InferenceFramework
282
- response_format: ResponseFormat
283
-
284
- @property
285
- def repo_cache_folder(self) -> str:
286
- return self.repo_id.replace("/", "--")
287
-
288
-
289
- class ApiVlmOptions(BaseVlmOptions):
290
- kind: Literal["api_model_options"] = "api_model_options"
291
-
292
- url: AnyUrl = AnyUrl(
293
- "http://localhost:11434/v1/chat/completions"
294
- ) # Default to ollama
295
- headers: Dict[str, str] = {}
296
- params: Dict[str, Any] = {}
297
- scale: float = 2.0
298
- timeout: float = 60
299
- concurrency: int = 1
300
- response_format: ResponseFormat
301
-
302
-
303
- smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
304
- repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
305
- prompt="Convert this page to docling.",
306
- response_format=ResponseFormat.DOCTAGS,
307
- inference_framework=InferenceFramework.MLX,
308
- )
309
-
310
-
311
- smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
312
- repo_id="ds4sd/SmolDocling-256M-preview",
313
- prompt="Convert this page to docling.",
314
- response_format=ResponseFormat.DOCTAGS,
315
- inference_framework=InferenceFramework.TRANSFORMERS,
316
- )
317
-
318
- granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
319
- repo_id="ibm-granite/granite-vision-3.1-2b-preview",
320
- # prompt="OCR the full page to markdown.",
321
- prompt="OCR this image.",
322
- response_format=ResponseFormat.MARKDOWN,
323
- inference_framework=InferenceFramework.TRANSFORMERS,
324
- )
325
-
326
- granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
327
- url=AnyUrl("http://localhost:11434/v1/chat/completions"),
328
- params={"model": "granite3.2-vision:2b"},
329
- prompt="OCR the full page to markdown.",
330
- scale=1.0,
331
- timeout=120,
332
- response_format=ResponseFormat.MARKDOWN,
333
- )
334
-
335
-
336
- class VlmModelType(str, Enum):
337
- SMOLDOCLING = "smoldocling"
338
- GRANITE_VISION = "granite_vision"
339
- GRANITE_VISION_OLLAMA = "granite_vision_ollama"
340
-
341
-
342
210
  # Define an enum for the backend options
343
211
  class PdfBackend(str, Enum):
344
212
  """Enum of valid PDF backends."""
@@ -387,7 +255,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
387
255
  False # (To be used with vlms, or other generative models)
388
256
  )
389
257
  # If True, text from backend will be used instead of generated text
390
- vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
258
+ vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
391
259
  smoldocling_vlm_conversion_options
392
260
  )
393
261
 
docling/datamodel/pipeline_options_vlm_model.py ADDED
@@ -0,0 +1,81 @@
1
+ from enum import Enum
2
+ from typing import Any, Dict, List, Literal
3
+
4
+ from pydantic import AnyUrl, BaseModel
5
+ from typing_extensions import deprecated
6
+
7
+ from docling.datamodel.accelerator_options import AcceleratorDevice
8
+
9
+
10
+ class BaseVlmOptions(BaseModel):
11
+ kind: str
12
+ prompt: str
13
+
14
+
15
+ class ResponseFormat(str, Enum):
16
+ DOCTAGS = "doctags"
17
+ MARKDOWN = "markdown"
18
+ HTML = "html"
19
+
20
+
21
+ class InferenceFramework(str, Enum):
22
+ MLX = "mlx"
23
+ TRANSFORMERS = "transformers"
24
+
25
+
26
+ class TransformersModelType(str, Enum):
27
+ AUTOMODEL = "automodel"
28
+ AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
29
+ AUTOMODEL_CAUSALLM = "automodel-causallm"
30
+
31
+
32
+ class InlineVlmOptions(BaseVlmOptions):
33
+ kind: Literal["inline_model_options"] = "inline_model_options"
34
+
35
+ repo_id: str
36
+ trust_remote_code: bool = False
37
+ load_in_8bit: bool = True
38
+ llm_int8_threshold: float = 6.0
39
+ quantized: bool = False
40
+
41
+ inference_framework: InferenceFramework
42
+ transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
43
+ response_format: ResponseFormat
44
+
45
+ supported_devices: List[AcceleratorDevice] = [
46
+ AcceleratorDevice.CPU,
47
+ AcceleratorDevice.CUDA,
48
+ AcceleratorDevice.MPS,
49
+ ]
50
+
51
+ scale: float = 2.0
52
+
53
+ temperature: float = 0.0
54
+ stop_strings: List[str] = []
55
+ extra_generation_config: Dict[str, Any] = {}
56
+
57
+ use_kv_cache: bool = True
58
+ max_new_tokens: int = 4096
59
+
60
+ @property
61
+ def repo_cache_folder(self) -> str:
62
+ return self.repo_id.replace("/", "--")
63
+
64
+
65
+ @deprecated("Use InlineVlmOptions instead.")
66
+ class HuggingFaceVlmOptions(InlineVlmOptions):
67
+ pass
68
+
69
+
70
+ class ApiVlmOptions(BaseVlmOptions):
71
+ kind: Literal["api_model_options"] = "api_model_options"
72
+
73
+ url: AnyUrl = AnyUrl(
74
+ "http://localhost:11434/v1/chat/completions"
75
+ ) # Default to ollama
76
+ headers: Dict[str, str] = {}
77
+ params: Dict[str, Any] = {}
78
+ scale: float = 2.0
79
+ timeout: float = 60
80
+ concurrency: int = 1
81
+ response_format: ResponseFormat