docling 2.57.0__py3-none-any.whl → 2.59.0__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.
Files changed (35)
  1. docling/backend/abstract_backend.py +24 -3
  2. docling/backend/asciidoc_backend.py +3 -3
  3. docling/backend/docling_parse_v4_backend.py +15 -4
  4. docling/backend/html_backend.py +130 -20
  5. docling/backend/md_backend.py +27 -5
  6. docling/backend/msexcel_backend.py +121 -29
  7. docling/backend/mspowerpoint_backend.py +2 -2
  8. docling/backend/msword_backend.py +18 -18
  9. docling/backend/pdf_backend.py +9 -2
  10. docling/backend/pypdfium2_backend.py +12 -3
  11. docling/cli/main.py +104 -38
  12. docling/datamodel/asr_model_specs.py +408 -6
  13. docling/datamodel/backend_options.py +82 -0
  14. docling/datamodel/base_models.py +19 -2
  15. docling/datamodel/document.py +81 -48
  16. docling/datamodel/pipeline_options_asr_model.py +21 -1
  17. docling/datamodel/pipeline_options_vlm_model.py +1 -0
  18. docling/document_converter.py +37 -45
  19. docling/document_extractor.py +12 -11
  20. docling/models/api_vlm_model.py +5 -3
  21. docling/models/picture_description_vlm_model.py +5 -1
  22. docling/models/readingorder_model.py +6 -7
  23. docling/models/vlm_models_inline/hf_transformers_model.py +13 -3
  24. docling/models/vlm_models_inline/mlx_model.py +9 -3
  25. docling/models/vlm_models_inline/nuextract_transformers_model.py +13 -3
  26. docling/models/vlm_models_inline/vllm_model.py +42 -8
  27. docling/pipeline/asr_pipeline.py +149 -6
  28. docling/utils/api_image_request.py +20 -9
  29. docling/utils/layout_postprocessor.py +23 -24
  30. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/METADATA +11 -8
  31. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/RECORD +35 -34
  32. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/WHEEL +0 -0
  33. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/entry_points.txt +0 -0
  34. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/licenses/LICENSE +0 -0
  35. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/top_level.txt +0 -0
docling/cli/main.py CHANGED
@@ -32,13 +32,26 @@ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
  from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
  from docling.datamodel.asr_model_specs import (
      WHISPER_BASE,
+     WHISPER_BASE_MLX,
+     WHISPER_BASE_NATIVE,
      WHISPER_LARGE,
+     WHISPER_LARGE_MLX,
+     WHISPER_LARGE_NATIVE,
      WHISPER_MEDIUM,
+     WHISPER_MEDIUM_MLX,
+     WHISPER_MEDIUM_NATIVE,
      WHISPER_SMALL,
+     WHISPER_SMALL_MLX,
+     WHISPER_SMALL_NATIVE,
      WHISPER_TINY,
+     WHISPER_TINY_MLX,
+     WHISPER_TINY_NATIVE,
      WHISPER_TURBO,
+     WHISPER_TURBO_MLX,
+     WHISPER_TURBO_NATIVE,
      AsrModelType,
  )
+ from docling.datamodel.backend_options import PdfBackendOptions
  from docling.datamodel.base_models import (
      ConversionStatus,
      FormatToExtensions,
@@ -391,7 +404,10 @@ def convert( # noqa: C901
      ] = None,
      pdf_backend: Annotated[
          PdfBackend, typer.Option(..., help="The PDF backend to use.")
-     ] = PdfBackend.DLPARSE_V2,
+     ] = PdfBackend.DLPARSE_V4,
+     pdf_password: Annotated[
+         Optional[str], typer.Option(..., help="Password for protected PDF documents")
+     ] = None,
      table_mode: Annotated[
          TableFormerMode,
          typer.Option(..., help="The mode to use in the table structure model."),
@@ -611,10 +627,14 @@ def convert( # noqa: C901
          ocr_options.psm = psm
 
      accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
+
      # pipeline_options: PaginatedPipelineOptions
      pipeline_options: PipelineOptions
 
      format_options: Dict[InputFormat, FormatOption] = {}
+     pdf_backend_options: Optional[PdfBackendOptions] = PdfBackendOptions(
+         password=pdf_password
+     )
 
      if pipeline == ProcessingPipeline.STANDARD:
          pipeline_options = PdfPipelineOptions(
@@ -645,8 +665,10 @@ def convert( # noqa: C901
          backend: Type[PdfDocumentBackend]
          if pdf_backend == PdfBackend.DLPARSE_V1:
              backend = DoclingParseDocumentBackend
+             pdf_backend_options = None
          elif pdf_backend == PdfBackend.DLPARSE_V2:
              backend = DoclingParseV2DocumentBackend
+             pdf_backend_options = None
          elif pdf_backend == PdfBackend.DLPARSE_V4:
              backend = DoclingParseV4DocumentBackend  # type: ignore
          elif pdf_backend == PdfBackend.PYPDFIUM2:
@@ -657,6 +679,7 @@ def convert( # noqa: C901
          pdf_format_option = PdfFormatOption(
              pipeline_options=pipeline_options,
              backend=backend,  # pdf_backend
+             backend_options=pdf_backend_options,
          )
 
          # METS GBS options
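
The new `backend_options` hook is what carries the PDF password from the CLI flag into the backend. A minimal sketch of the equivalent Python API usage, assuming only the names visible in this diff (`PdfBackendOptions` from `docling.datamodel.backend_options`, the `backend_options` parameter on `PdfFormatOption`); the file path and password are hypothetical:

from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.backend_options import PdfBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption

# Only the DLPARSE_V4 and PYPDFIUM2 branches keep backend_options in this
# release; the V1/V2 branches above reset them to None.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            backend=DoclingParseV4DocumentBackend,
            backend_options=PdfBackendOptions(password="s3cret"),  # hypothetical
        )
    }
)
result = converter.convert("protected.pdf")  # hypothetical encrypted input
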
@@ -715,10 +738,15 @@ def convert( # noqa: C901
 
              pipeline_options.vlm_options = SMOLDOCLING_MLX
          except ImportError:
-             _log.warning(
-                 "To run SmolDocling faster, please install mlx-vlm:\n"
-                 "pip install mlx-vlm"
-             )
+             if sys.version_info < (3, 14):
+                 _log.warning(
+                     "To run SmolDocling faster, please install mlx-vlm:\n"
+                     "pip install mlx-vlm"
+                 )
+             else:
+                 _log.warning(
+                     "You can run SmolDocling faster with MLX support, but it is unfortunately not yet available on Python 3.14."
+                 )
 
      elif vlm_model == VlmModelType.GRANITEDOCLING:
          pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
@@ -728,10 +756,16 @@ def convert( # noqa: C901
 
              pipeline_options.vlm_options = GRANITEDOCLING_MLX
          except ImportError:
-             _log.warning(
-                 "To run GraniteDocling faster, please install mlx-vlm:\n"
-                 "pip install mlx-vlm"
-             )
+             if sys.version_info < (3, 14):
+                 _log.warning(
+                     "To run GraniteDocling faster, please install mlx-vlm:\n"
+                     "pip install mlx-vlm"
+                 )
+             else:
+                 _log.warning(
+                     "You can run GraniteDocling faster with MLX support, but it is unfortunately not yet available on Python 3.14."
+                 )
+
      elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
          pipeline_options.vlm_options = SMOLDOCLING_VLLM
 
@@ -747,42 +781,74 @@ def convert( # noqa: C901
              InputFormat.IMAGE: pdf_format_option,
          }
 
-     elif pipeline == ProcessingPipeline.ASR:
-         pipeline_options = AsrPipelineOptions(
-             # enable_remote_services=enable_remote_services,
-             # artifacts_path = artifacts_path
-         )
+     # Set ASR options
+     asr_pipeline_options = AsrPipelineOptions(
+         accelerator_options=AcceleratorOptions(
+             device=device,
+             num_threads=num_threads,
+         ),
+         # enable_remote_services=enable_remote_services,
+         # artifacts_path = artifacts_path
+     )
 
-         if asr_model == AsrModelType.WHISPER_TINY:
-             pipeline_options.asr_options = WHISPER_TINY
-         elif asr_model == AsrModelType.WHISPER_SMALL:
-             pipeline_options.asr_options = WHISPER_SMALL
-         elif asr_model == AsrModelType.WHISPER_MEDIUM:
-             pipeline_options.asr_options = WHISPER_MEDIUM
-         elif asr_model == AsrModelType.WHISPER_BASE:
-             pipeline_options.asr_options = WHISPER_BASE
-         elif asr_model == AsrModelType.WHISPER_LARGE:
-             pipeline_options.asr_options = WHISPER_LARGE
-         elif asr_model == AsrModelType.WHISPER_TURBO:
-             pipeline_options.asr_options = WHISPER_TURBO
-         else:
-             _log.error(f"{asr_model} is not known")
-             raise ValueError(f"{asr_model} is not known")
+     # Auto-selecting models (choose best implementation for hardware)
+     if asr_model == AsrModelType.WHISPER_TINY:
+         asr_pipeline_options.asr_options = WHISPER_TINY
+     elif asr_model == AsrModelType.WHISPER_SMALL:
+         asr_pipeline_options.asr_options = WHISPER_SMALL
+     elif asr_model == AsrModelType.WHISPER_MEDIUM:
+         asr_pipeline_options.asr_options = WHISPER_MEDIUM
+     elif asr_model == AsrModelType.WHISPER_BASE:
+         asr_pipeline_options.asr_options = WHISPER_BASE
+     elif asr_model == AsrModelType.WHISPER_LARGE:
+         asr_pipeline_options.asr_options = WHISPER_LARGE
+     elif asr_model == AsrModelType.WHISPER_TURBO:
+         asr_pipeline_options.asr_options = WHISPER_TURBO
+
+     # Explicit MLX models (force MLX implementation)
+     elif asr_model == AsrModelType.WHISPER_TINY_MLX:
+         asr_pipeline_options.asr_options = WHISPER_TINY_MLX
+     elif asr_model == AsrModelType.WHISPER_SMALL_MLX:
+         asr_pipeline_options.asr_options = WHISPER_SMALL_MLX
+     elif asr_model == AsrModelType.WHISPER_MEDIUM_MLX:
+         asr_pipeline_options.asr_options = WHISPER_MEDIUM_MLX
+     elif asr_model == AsrModelType.WHISPER_BASE_MLX:
+         asr_pipeline_options.asr_options = WHISPER_BASE_MLX
+     elif asr_model == AsrModelType.WHISPER_LARGE_MLX:
+         asr_pipeline_options.asr_options = WHISPER_LARGE_MLX
+     elif asr_model == AsrModelType.WHISPER_TURBO_MLX:
+         asr_pipeline_options.asr_options = WHISPER_TURBO_MLX
+
+     # Explicit Native models (force native implementation)
+     elif asr_model == AsrModelType.WHISPER_TINY_NATIVE:
+         asr_pipeline_options.asr_options = WHISPER_TINY_NATIVE
+     elif asr_model == AsrModelType.WHISPER_SMALL_NATIVE:
+         asr_pipeline_options.asr_options = WHISPER_SMALL_NATIVE
+     elif asr_model == AsrModelType.WHISPER_MEDIUM_NATIVE:
+         asr_pipeline_options.asr_options = WHISPER_MEDIUM_NATIVE
+     elif asr_model == AsrModelType.WHISPER_BASE_NATIVE:
+         asr_pipeline_options.asr_options = WHISPER_BASE_NATIVE
+     elif asr_model == AsrModelType.WHISPER_LARGE_NATIVE:
+         asr_pipeline_options.asr_options = WHISPER_LARGE_NATIVE
+     elif asr_model == AsrModelType.WHISPER_TURBO_NATIVE:
+         asr_pipeline_options.asr_options = WHISPER_TURBO_NATIVE
 
-         _log.info(f"pipeline_options: {pipeline_options}")
+     else:
+         _log.error(f"{asr_model} is not known")
+         raise ValueError(f"{asr_model} is not known")
 
-         audio_format_option = AudioFormatOption(
-             pipeline_cls=AsrPipeline,
-             pipeline_options=pipeline_options,
-         )
+     _log.debug(f"ASR pipeline_options: {asr_pipeline_options}")
 
-         format_options = {
-             InputFormat.AUDIO: audio_format_option,
-         }
+     audio_format_option = AudioFormatOption(
+         pipeline_cls=AsrPipeline,
+         pipeline_options=asr_pipeline_options,
+     )
+     format_options[InputFormat.AUDIO] = audio_format_option
 
+     # Common options for all pipelines
      if artifacts_path is not None:
          pipeline_options.artifacts_path = artifacts_path
-         # audio_pipeline_options.artifacts_path = artifacts_path
+         asr_pipeline_options.artifacts_path = artifacts_path
 
      doc_converter = DocumentConverter(
          allowed_formats=from_formats,
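
Taken together, these changes let the CLI route audio through the ASR pipeline with either auto-selected or explicitly forced Whisper implementations (e.g. `--asr-model whisper_turbo_mlx`, assuming typer's usual flag derivation from `asr_model`). A minimal sketch of the equivalent Python API, using only names that appear in this diff; the import locations for `AsrPipelineOptions`, `AudioFormatOption`, and `AsrPipeline` are assumptions based on docling's usual module layout:

from docling.datamodel.asr_model_specs import WHISPER_TURBO_MLX
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import AsrPipelineOptions  # assumed path
from docling.document_converter import DocumentConverter, AudioFormatOption  # assumed path
from docling.pipeline.asr_pipeline import AsrPipeline

# Force the MLX implementation instead of letting WHISPER_TURBO auto-select.
asr_options = AsrPipelineOptions(asr_options=WHISPER_TURBO_MLX)
converter = DocumentConverter(
    format_options={
        InputFormat.AUDIO: AudioFormatOption(
            pipeline_cls=AsrPipeline,
            pipeline_options=asr_options,
        )
    }
)
result = converter.convert("meeting.mp3")  # hypothetical audio file
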
docling/datamodel/asr_model_specs.py CHANGED
@@ -10,13 +10,394 @@ from docling.datamodel.pipeline_options_asr_model import (
      # AsrResponseFormat,
      # ApiAsrOptions,
      InferenceAsrFramework,
+     InlineAsrMlxWhisperOptions,
      InlineAsrNativeWhisperOptions,
      TransformersModelType,
  )
 
  _log = logging.getLogger(__name__)
 
- WHISPER_TINY = InlineAsrNativeWhisperOptions(
+
+ def _get_whisper_tiny_model():
+     """
+     Get the best Whisper Tiny model for the current hardware.
+
+     Automatically selects MLX Whisper Tiny for Apple Silicon (MPS) if available,
+     otherwise falls back to native Whisper Tiny.
+     """
+     # Check if MPS is available (Apple Silicon)
+     try:
+         import torch
+
+         has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+     except ImportError:
+         has_mps = False
+
+     # Check if mlx-whisper is available
+     try:
+         import mlx_whisper  # type: ignore
+
+         has_mlx_whisper = True
+     except ImportError:
+         has_mlx_whisper = False
+
+     # Use MLX Whisper if both MPS and mlx-whisper are available
+     if has_mps and has_mlx_whisper:
+         return InlineAsrMlxWhisperOptions(
+             repo_id="mlx-community/whisper-tiny-mlx",
+             inference_framework=InferenceAsrFramework.MLX,
+             language="en",
+             task="transcribe",
+             word_timestamps=True,
+             no_speech_threshold=0.6,
+             logprob_threshold=-1.0,
+             compression_ratio_threshold=2.4,
+         )
+     else:
+         return InlineAsrNativeWhisperOptions(
+             repo_id="tiny",
+             inference_framework=InferenceAsrFramework.WHISPER,
+             verbose=True,
+             timestamps=True,
+             word_timestamps=True,
+             temperature=0.0,
+             max_new_tokens=256,
+             max_time_chunk=30.0,
+         )
+
+
+ # Create the model instance
+ WHISPER_TINY = _get_whisper_tiny_model()
+
+
+ def _get_whisper_small_model():
+     """
+     Get the best Whisper Small model for the current hardware.
+
+     Automatically selects MLX Whisper Small for Apple Silicon (MPS) if available,
+     otherwise falls back to native Whisper Small.
+     """
+     # Check if MPS is available (Apple Silicon)
+     try:
+         import torch
+
+         has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+     except ImportError:
+         has_mps = False
+
+     # Check if mlx-whisper is available
+     try:
+         import mlx_whisper  # type: ignore
+
+         has_mlx_whisper = True
+     except ImportError:
+         has_mlx_whisper = False
+
+     # Use MLX Whisper if both MPS and mlx-whisper are available
+     if has_mps and has_mlx_whisper:
+         return InlineAsrMlxWhisperOptions(
+             repo_id="mlx-community/whisper-small-mlx",
+             inference_framework=InferenceAsrFramework.MLX,
+             language="en",
+             task="transcribe",
+             word_timestamps=True,
+             no_speech_threshold=0.6,
+             logprob_threshold=-1.0,
+             compression_ratio_threshold=2.4,
+         )
+     else:
+         return InlineAsrNativeWhisperOptions(
+             repo_id="small",
+             inference_framework=InferenceAsrFramework.WHISPER,
+             verbose=True,
+             timestamps=True,
+             word_timestamps=True,
+             temperature=0.0,
+             max_new_tokens=256,
+             max_time_chunk=30.0,
+         )
+
+
+ # Create the model instance
+ WHISPER_SMALL = _get_whisper_small_model()
+
+
+ def _get_whisper_medium_model():
+     """
+     Get the best Whisper Medium model for the current hardware.
+
+     Automatically selects MLX Whisper Medium for Apple Silicon (MPS) if available,
+     otherwise falls back to native Whisper Medium.
+     """
+     # Check if MPS is available (Apple Silicon)
+     try:
+         import torch
+
+         has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+     except ImportError:
+         has_mps = False
+
+     # Check if mlx-whisper is available
+     try:
+         import mlx_whisper  # type: ignore
+
+         has_mlx_whisper = True
+     except ImportError:
+         has_mlx_whisper = False
+
+     # Use MLX Whisper if both MPS and mlx-whisper are available
+     if has_mps and has_mlx_whisper:
+         return InlineAsrMlxWhisperOptions(
+             repo_id="mlx-community/whisper-medium-mlx-8bit",
+             inference_framework=InferenceAsrFramework.MLX,
+             language="en",
+             task="transcribe",
+             word_timestamps=True,
+             no_speech_threshold=0.6,
+             logprob_threshold=-1.0,
+             compression_ratio_threshold=2.4,
+         )
+     else:
+         return InlineAsrNativeWhisperOptions(
+             repo_id="medium",
+             inference_framework=InferenceAsrFramework.WHISPER,
+             verbose=True,
+             timestamps=True,
+             word_timestamps=True,
+             temperature=0.0,
+             max_new_tokens=256,
+             max_time_chunk=30.0,
+         )
+
+
+ # Create the model instance
+ WHISPER_MEDIUM = _get_whisper_medium_model()
+
+
+ def _get_whisper_base_model():
+     """
+     Get the best Whisper Base model for the current hardware.
+
+     Automatically selects MLX Whisper Base for Apple Silicon (MPS) if available,
+     otherwise falls back to native Whisper Base.
+     """
+     # Check if MPS is available (Apple Silicon)
+     try:
+         import torch
+
+         has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+     except ImportError:
+         has_mps = False
+
+     # Check if mlx-whisper is available
+     try:
+         import mlx_whisper  # type: ignore
+
+         has_mlx_whisper = True
+     except ImportError:
+         has_mlx_whisper = False
+
+     # Use MLX Whisper if both MPS and mlx-whisper are available
+     if has_mps and has_mlx_whisper:
+         return InlineAsrMlxWhisperOptions(
+             repo_id="mlx-community/whisper-base-mlx",
+             inference_framework=InferenceAsrFramework.MLX,
+             language="en",
+             task="transcribe",
+             word_timestamps=True,
+             no_speech_threshold=0.6,
+             logprob_threshold=-1.0,
+             compression_ratio_threshold=2.4,
+         )
+     else:
+         return InlineAsrNativeWhisperOptions(
+             repo_id="base",
+             inference_framework=InferenceAsrFramework.WHISPER,
+             verbose=True,
+             timestamps=True,
+             word_timestamps=True,
+             temperature=0.0,
+             max_new_tokens=256,
+             max_time_chunk=30.0,
+         )
+
+
+ # Create the model instance
+ WHISPER_BASE = _get_whisper_base_model()
+
+
+ def _get_whisper_large_model():
+     """
+     Get the best Whisper Large model for the current hardware.
+
+     Automatically selects MLX Whisper Large for Apple Silicon (MPS) if available,
+     otherwise falls back to native Whisper Large.
+     """
+     # Check if MPS is available (Apple Silicon)
+     try:
+         import torch
+
+         has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+     except ImportError:
+         has_mps = False
+
+     # Check if mlx-whisper is available
+     try:
+         import mlx_whisper  # type: ignore
+
+         has_mlx_whisper = True
+     except ImportError:
+         has_mlx_whisper = False
+
+     # Use MLX Whisper if both MPS and mlx-whisper are available
+     if has_mps and has_mlx_whisper:
+         return InlineAsrMlxWhisperOptions(
+             repo_id="mlx-community/whisper-large-mlx-8bit",
+             inference_framework=InferenceAsrFramework.MLX,
+             language="en",
+             task="transcribe",
+             word_timestamps=True,
+             no_speech_threshold=0.6,
+             logprob_threshold=-1.0,
+             compression_ratio_threshold=2.4,
+         )
+     else:
+         return InlineAsrNativeWhisperOptions(
+             repo_id="large",
+             inference_framework=InferenceAsrFramework.WHISPER,
+             verbose=True,
+             timestamps=True,
+             word_timestamps=True,
+             temperature=0.0,
+             max_new_tokens=256,
+             max_time_chunk=30.0,
+         )
+
+
+ # Create the model instance
+ WHISPER_LARGE = _get_whisper_large_model()
+
+
+ def _get_whisper_turbo_model():
+     """
+     Get the best Whisper Turbo model for the current hardware.
+
+     Automatically selects MLX Whisper Turbo for Apple Silicon (MPS) if available,
+     otherwise falls back to native Whisper Turbo.
+     """
+     # Check if MPS is available (Apple Silicon)
+     try:
+         import torch
+
+         has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+     except ImportError:
+         has_mps = False
+
+     # Check if mlx-whisper is available
+     try:
+         import mlx_whisper  # type: ignore
+
+         has_mlx_whisper = True
+     except ImportError:
+         has_mlx_whisper = False
+
+     # Use MLX Whisper if both MPS and mlx-whisper are available
+     if has_mps and has_mlx_whisper:
+         return InlineAsrMlxWhisperOptions(
+             repo_id="mlx-community/whisper-turbo",
+             inference_framework=InferenceAsrFramework.MLX,
+             language="en",
+             task="transcribe",
+             word_timestamps=True,
+             no_speech_threshold=0.6,
+             logprob_threshold=-1.0,
+             compression_ratio_threshold=2.4,
+         )
+     else:
+         return InlineAsrNativeWhisperOptions(
+             repo_id="turbo",
+             inference_framework=InferenceAsrFramework.WHISPER,
+             verbose=True,
+             timestamps=True,
+             word_timestamps=True,
+             temperature=0.0,
+             max_new_tokens=256,
+             max_time_chunk=30.0,
+         )
+
+
+ # Create the model instance
+ WHISPER_TURBO = _get_whisper_turbo_model()
+
+ # Explicit MLX Whisper model options for users who want to force MLX usage
+ WHISPER_TINY_MLX = InlineAsrMlxWhisperOptions(
+     repo_id="mlx-community/whisper-tiny-mlx",
+     inference_framework=InferenceAsrFramework.MLX,
+     language="en",
+     task="transcribe",
+     word_timestamps=True,
+     no_speech_threshold=0.6,
+     logprob_threshold=-1.0,
+     compression_ratio_threshold=2.4,
+ )
+
+ WHISPER_SMALL_MLX = InlineAsrMlxWhisperOptions(
+     repo_id="mlx-community/whisper-small-mlx",
+     inference_framework=InferenceAsrFramework.MLX,
+     language="en",
+     task="transcribe",
+     word_timestamps=True,
+     no_speech_threshold=0.6,
+     logprob_threshold=-1.0,
+     compression_ratio_threshold=2.4,
+ )
+
+ WHISPER_MEDIUM_MLX = InlineAsrMlxWhisperOptions(
+     repo_id="mlx-community/whisper-medium-mlx-8bit",
+     inference_framework=InferenceAsrFramework.MLX,
+     language="en",
+     task="transcribe",
+     word_timestamps=True,
+     no_speech_threshold=0.6,
+     logprob_threshold=-1.0,
+     compression_ratio_threshold=2.4,
+ )
+
+ WHISPER_BASE_MLX = InlineAsrMlxWhisperOptions(
+     repo_id="mlx-community/whisper-base-mlx",
+     inference_framework=InferenceAsrFramework.MLX,
+     language="en",
+     task="transcribe",
+     word_timestamps=True,
+     no_speech_threshold=0.6,
+     logprob_threshold=-1.0,
+     compression_ratio_threshold=2.4,
+ )
+
+ WHISPER_LARGE_MLX = InlineAsrMlxWhisperOptions(
+     repo_id="mlx-community/whisper-large-mlx-8bit",
+     inference_framework=InferenceAsrFramework.MLX,
+     language="en",
+     task="transcribe",
+     word_timestamps=True,
+     no_speech_threshold=0.6,
+     logprob_threshold=-1.0,
+     compression_ratio_threshold=2.4,
+ )
+
+ WHISPER_TURBO_MLX = InlineAsrMlxWhisperOptions(
+     repo_id="mlx-community/whisper-turbo",
+     inference_framework=InferenceAsrFramework.MLX,
+     language="en",
+     task="transcribe",
+     word_timestamps=True,
+     no_speech_threshold=0.6,
+     logprob_threshold=-1.0,
+     compression_ratio_threshold=2.4,
+ )
+
+ # Explicit Native Whisper model options for users who want to force native usage
+ WHISPER_TINY_NATIVE = InlineAsrNativeWhisperOptions(
      repo_id="tiny",
      inference_framework=InferenceAsrFramework.WHISPER,
      verbose=True,
@@ -27,7 +408,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
      max_time_chunk=30.0,
  )
 
- WHISPER_SMALL = InlineAsrNativeWhisperOptions(
+ WHISPER_SMALL_NATIVE = InlineAsrNativeWhisperOptions(
      repo_id="small",
      inference_framework=InferenceAsrFramework.WHISPER,
      verbose=True,
@@ -38,7 +419,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
      max_time_chunk=30.0,
  )
 
- WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
+ WHISPER_MEDIUM_NATIVE = InlineAsrNativeWhisperOptions(
      repo_id="medium",
      inference_framework=InferenceAsrFramework.WHISPER,
      verbose=True,
@@ -49,7 +430,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
      max_time_chunk=30.0,
  )
 
- WHISPER_BASE = InlineAsrNativeWhisperOptions(
+ WHISPER_BASE_NATIVE = InlineAsrNativeWhisperOptions(
      repo_id="base",
      inference_framework=InferenceAsrFramework.WHISPER,
      verbose=True,
@@ -60,7 +441,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
      max_time_chunk=30.0,
  )
 
- WHISPER_LARGE = InlineAsrNativeWhisperOptions(
+ WHISPER_LARGE_NATIVE = InlineAsrNativeWhisperOptions(
      repo_id="large",
      inference_framework=InferenceAsrFramework.WHISPER,
      verbose=True,
@@ -71,7 +452,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
      max_time_chunk=30.0,
  )
 
- WHISPER_TURBO = InlineAsrNativeWhisperOptions(
+ WHISPER_TURBO_NATIVE = InlineAsrNativeWhisperOptions(
      repo_id="turbo",
      inference_framework=InferenceAsrFramework.WHISPER,
      verbose=True,
@@ -82,11 +463,32 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
      max_time_chunk=30.0,
  )
 
+ # Note: The main WHISPER_* models (WHISPER_TURBO, WHISPER_BASE, etc.) automatically
+ # select the best implementation (MLX on Apple Silicon, Native elsewhere).
+ # Use the explicit _MLX or _NATIVE variants if you need to force a specific implementation.
+
 
  class AsrModelType(str, Enum):
+     # Auto-selecting models (choose best implementation for hardware)
      WHISPER_TINY = "whisper_tiny"
      WHISPER_SMALL = "whisper_small"
      WHISPER_MEDIUM = "whisper_medium"
      WHISPER_BASE = "whisper_base"
      WHISPER_LARGE = "whisper_large"
      WHISPER_TURBO = "whisper_turbo"
+
+     # Explicit MLX models (force MLX implementation)
+     WHISPER_TINY_MLX = "whisper_tiny_mlx"
+     WHISPER_SMALL_MLX = "whisper_small_mlx"
+     WHISPER_MEDIUM_MLX = "whisper_medium_mlx"
+     WHISPER_BASE_MLX = "whisper_base_mlx"
+     WHISPER_LARGE_MLX = "whisper_large_mlx"
+     WHISPER_TURBO_MLX = "whisper_turbo_mlx"
+
+     # Explicit Native models (force native implementation)
+     WHISPER_TINY_NATIVE = "whisper_tiny_native"
+     WHISPER_SMALL_NATIVE = "whisper_small_native"
+     WHISPER_MEDIUM_NATIVE = "whisper_medium_native"
+     WHISPER_BASE_NATIVE = "whisper_base_native"
+     WHISPER_LARGE_NATIVE = "whisper_large_native"
+     WHISPER_TURBO_NATIVE = "whisper_turbo_native"
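
Because the auto-selecting constants are resolved once at import time (via the `_get_whisper_*_model()` helpers above), the chosen implementation can be inspected directly. A small sketch, using only names defined in this file:

from docling.datamodel.asr_model_specs import (
    WHISPER_TURBO,
    WHISPER_TURBO_MLX,
    WHISPER_TURBO_NATIVE,
)

# WHISPER_TURBO is InlineAsrMlxWhisperOptions on Apple Silicon with mlx-whisper
# installed, and InlineAsrNativeWhisperOptions everywhere else; the _MLX and
# _NATIVE constants pin one implementation regardless of hardware.
print(type(WHISPER_TURBO).__name__)
print(type(WHISPER_TURBO_MLX).__name__)     # always InlineAsrMlxWhisperOptions
print(type(WHISPER_TURBO_NATIVE).__name__)  # always InlineAsrNativeWhisperOptions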