docling-2.57.0-py3-none-any.whl → docling-2.59.0-py3-none-any.whl
- docling/backend/abstract_backend.py +24 -3
- docling/backend/asciidoc_backend.py +3 -3
- docling/backend/docling_parse_v4_backend.py +15 -4
- docling/backend/html_backend.py +130 -20
- docling/backend/md_backend.py +27 -5
- docling/backend/msexcel_backend.py +121 -29
- docling/backend/mspowerpoint_backend.py +2 -2
- docling/backend/msword_backend.py +18 -18
- docling/backend/pdf_backend.py +9 -2
- docling/backend/pypdfium2_backend.py +12 -3
- docling/cli/main.py +104 -38
- docling/datamodel/asr_model_specs.py +408 -6
- docling/datamodel/backend_options.py +82 -0
- docling/datamodel/base_models.py +19 -2
- docling/datamodel/document.py +81 -48
- docling/datamodel/pipeline_options_asr_model.py +21 -1
- docling/datamodel/pipeline_options_vlm_model.py +1 -0
- docling/document_converter.py +37 -45
- docling/document_extractor.py +12 -11
- docling/models/api_vlm_model.py +5 -3
- docling/models/picture_description_vlm_model.py +5 -1
- docling/models/readingorder_model.py +6 -7
- docling/models/vlm_models_inline/hf_transformers_model.py +13 -3
- docling/models/vlm_models_inline/mlx_model.py +9 -3
- docling/models/vlm_models_inline/nuextract_transformers_model.py +13 -3
- docling/models/vlm_models_inline/vllm_model.py +42 -8
- docling/pipeline/asr_pipeline.py +149 -6
- docling/utils/api_image_request.py +20 -9
- docling/utils/layout_postprocessor.py +23 -24
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/METADATA +11 -8
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/RECORD +35 -34
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/WHEEL +0 -0
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/entry_points.txt +0 -0
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/top_level.txt +0 -0
docling/cli/main.py
CHANGED
@@ -32,13 +32,26 @@ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.asr_model_specs import (
     WHISPER_BASE,
+    WHISPER_BASE_MLX,
+    WHISPER_BASE_NATIVE,
     WHISPER_LARGE,
+    WHISPER_LARGE_MLX,
+    WHISPER_LARGE_NATIVE,
     WHISPER_MEDIUM,
+    WHISPER_MEDIUM_MLX,
+    WHISPER_MEDIUM_NATIVE,
     WHISPER_SMALL,
+    WHISPER_SMALL_MLX,
+    WHISPER_SMALL_NATIVE,
     WHISPER_TINY,
+    WHISPER_TINY_MLX,
+    WHISPER_TINY_NATIVE,
     WHISPER_TURBO,
+    WHISPER_TURBO_MLX,
+    WHISPER_TURBO_NATIVE,
     AsrModelType,
 )
+from docling.datamodel.backend_options import PdfBackendOptions
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -391,7 +404,10 @@ def convert(  # noqa: C901
     ] = None,
     pdf_backend: Annotated[
         PdfBackend, typer.Option(..., help="The PDF backend to use.")
-    ] = PdfBackend.
+    ] = PdfBackend.DLPARSE_V4,
+    pdf_password: Annotated[
+        Optional[str], typer.Option(..., help="Password for protected PDF documents")
+    ] = None,
     table_mode: Annotated[
         TableFormerMode,
         typer.Option(..., help="The mode to use in the table structure model."),
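Typer derives CLI flag names from parameter names, so the new pdf_password parameter should surface on the command line as --pdf-password. A minimal standalone sketch of the same Annotated/typer.Option pattern used above (a hypothetical app.py, not docling code; the option declaration mirrors the diff verbatim):

from typing import Annotated, Optional

import typer

app = typer.Typer()


@app.command()
def convert(
    source: Annotated[str, typer.Argument(help="Path of the document to convert.")],
    pdf_password: Annotated[
        Optional[str], typer.Option(..., help="Password for protected PDF documents")
    ] = None,
) -> None:
    # typer exposes this parameter on the command line as --pdf-password
    typer.echo(f"converting {source} (password supplied: {pdf_password is not None})")


if __name__ == "__main__":
    app()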
@@ -611,10 +627,14 @@ def convert(  # noqa: C901
         ocr_options.psm = psm
 
     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
+
     # pipeline_options: PaginatedPipelineOptions
     pipeline_options: PipelineOptions
 
     format_options: Dict[InputFormat, FormatOption] = {}
+    pdf_backend_options: Optional[PdfBackendOptions] = PdfBackendOptions(
+        password=pdf_password
+    )
 
     if pipeline == ProcessingPipeline.STANDARD:
         pipeline_options = PdfPipelineOptions(
@@ -645,8 +665,10 @@ def convert(  # noqa: C901
         backend: Type[PdfDocumentBackend]
         if pdf_backend == PdfBackend.DLPARSE_V1:
             backend = DoclingParseDocumentBackend
+            pdf_backend_options = None
         elif pdf_backend == PdfBackend.DLPARSE_V2:
             backend = DoclingParseV2DocumentBackend
+            pdf_backend_options = None
         elif pdf_backend == PdfBackend.DLPARSE_V4:
             backend = DoclingParseV4DocumentBackend  # type: ignore
         elif pdf_backend == PdfBackend.PYPDFIUM2:
@@ -657,6 +679,7 @@ def convert(  # noqa: C901
         pdf_format_option = PdfFormatOption(
             pipeline_options=pipeline_options,
             backend=backend,  # pdf_backend
+            backend_options=pdf_backend_options,
         )
 
         # METS GBS options
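The same plumbing suggests how library users could pass a password programmatically. A hedged sketch, assuming PdfFormatOption forwards backend_options to the backend exactly as wired above (the file name and password are placeholders):

from docling.datamodel.backend_options import PdfBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption

# Build a converter whose PDF backend receives the password; note the CLI
# above clears backend_options for the DLPARSE_V1/V2 backends, which do not take it.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            backend_options=PdfBackendOptions(password="s3cret"),
        )
    }
)
result = converter.convert("protected.pdf")  # placeholder path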
@@ -715,10 +738,15 @@ def convert(  # noqa: C901
 
                 pipeline_options.vlm_options = SMOLDOCLING_MLX
             except ImportError:
-                _log.warning(
-                    "To run SmolDocling faster, please install mlx-vlm:\n"
-                    "pip install mlx-vlm"
-                )
+                if sys.version_info < (3, 14):
+                    _log.warning(
+                        "To run SmolDocling faster, please install mlx-vlm:\n"
+                        "pip install mlx-vlm"
+                    )
+                else:
+                    _log.warning(
+                        "You can run SmolDocling faster with MLX support, but it is unfortunately not yet available on Python 3.14."
+                    )
 
         elif vlm_model == VlmModelType.GRANITEDOCLING:
             pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
@@ -728,10 +756,16 @@ def convert(  # noqa: C901
 
                 pipeline_options.vlm_options = GRANITEDOCLING_MLX
             except ImportError:
-                _log.warning(
-                    "To run GraniteDocling faster, please install mlx-vlm:\n"
-                    "pip install mlx-vlm"
-                )
+                if sys.version_info < (3, 14):
+                    _log.warning(
+                        "To run GraniteDocling faster, please install mlx-vlm:\n"
+                        "pip install mlx-vlm"
+                    )
+                else:
+                    _log.warning(
+                        "You can run GraniteDocling faster with MLX support, but it is unfortunately not yet available on Python 3.14."
+                    )
+
         elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
             pipeline_options.vlm_options = SMOLDOCLING_VLLM
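Both branches follow the same guarded-import pattern: try the MLX-specific options, and on ImportError log a message that depends on the Python version. A condensed, self-contained sketch of that pattern (function and variable names here are illustrative, not docling APIs):

import logging
import sys

_log = logging.getLogger(__name__)


def pick_vlm_options(mlx_options, fallback_options, model_name: str):
    """Prefer MLX options when mlx-vlm imports cleanly, else warn and fall back."""
    try:
        import mlx_vlm  # noqa: F401  # succeeds only where mlx-vlm is installed

        return mlx_options
    except ImportError:
        if sys.version_info < (3, 14):
            _log.warning(
                f"To run {model_name} faster, please install mlx-vlm:\npip install mlx-vlm"
            )
        else:
            _log.warning("MLX support is unfortunately not yet available on Python 3.14.")
        return fallback_options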
@@ -747,42 +781,74 @@ def convert(  # noqa: C901
             InputFormat.IMAGE: pdf_format_option,
         }
 
-
-
-
-
-
+    # Set ASR options
+    asr_pipeline_options = AsrPipelineOptions(
+        accelerator_options=AcceleratorOptions(
+            device=device,
+            num_threads=num_threads,
+        ),
+        # enable_remote_services=enable_remote_services,
+        # artifacts_path = artifacts_path
+    )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Auto-selecting models (choose best implementation for hardware)
+    if asr_model == AsrModelType.WHISPER_TINY:
+        asr_pipeline_options.asr_options = WHISPER_TINY
+    elif asr_model == AsrModelType.WHISPER_SMALL:
+        asr_pipeline_options.asr_options = WHISPER_SMALL
+    elif asr_model == AsrModelType.WHISPER_MEDIUM:
+        asr_pipeline_options.asr_options = WHISPER_MEDIUM
+    elif asr_model == AsrModelType.WHISPER_BASE:
+        asr_pipeline_options.asr_options = WHISPER_BASE
+    elif asr_model == AsrModelType.WHISPER_LARGE:
+        asr_pipeline_options.asr_options = WHISPER_LARGE
+    elif asr_model == AsrModelType.WHISPER_TURBO:
+        asr_pipeline_options.asr_options = WHISPER_TURBO
+
+    # Explicit MLX models (force MLX implementation)
+    elif asr_model == AsrModelType.WHISPER_TINY_MLX:
+        asr_pipeline_options.asr_options = WHISPER_TINY_MLX
+    elif asr_model == AsrModelType.WHISPER_SMALL_MLX:
+        asr_pipeline_options.asr_options = WHISPER_SMALL_MLX
+    elif asr_model == AsrModelType.WHISPER_MEDIUM_MLX:
+        asr_pipeline_options.asr_options = WHISPER_MEDIUM_MLX
+    elif asr_model == AsrModelType.WHISPER_BASE_MLX:
+        asr_pipeline_options.asr_options = WHISPER_BASE_MLX
+    elif asr_model == AsrModelType.WHISPER_LARGE_MLX:
+        asr_pipeline_options.asr_options = WHISPER_LARGE_MLX
+    elif asr_model == AsrModelType.WHISPER_TURBO_MLX:
+        asr_pipeline_options.asr_options = WHISPER_TURBO_MLX
+
+    # Explicit Native models (force native implementation)
+    elif asr_model == AsrModelType.WHISPER_TINY_NATIVE:
+        asr_pipeline_options.asr_options = WHISPER_TINY_NATIVE
+    elif asr_model == AsrModelType.WHISPER_SMALL_NATIVE:
+        asr_pipeline_options.asr_options = WHISPER_SMALL_NATIVE
+    elif asr_model == AsrModelType.WHISPER_MEDIUM_NATIVE:
+        asr_pipeline_options.asr_options = WHISPER_MEDIUM_NATIVE
+    elif asr_model == AsrModelType.WHISPER_BASE_NATIVE:
+        asr_pipeline_options.asr_options = WHISPER_BASE_NATIVE
+    elif asr_model == AsrModelType.WHISPER_LARGE_NATIVE:
+        asr_pipeline_options.asr_options = WHISPER_LARGE_NATIVE
+    elif asr_model == AsrModelType.WHISPER_TURBO_NATIVE:
+        asr_pipeline_options.asr_options = WHISPER_TURBO_NATIVE
 
-
+    else:
+        _log.error(f"{asr_model} is not known")
+        raise ValueError(f"{asr_model} is not known")
 
-    audio_format_option = AudioFormatOption(
-        pipeline_cls=AsrPipeline,
-        pipeline_options=pipeline_options,
-    )
+    _log.debug(f"ASR pipeline_options: {asr_pipeline_options}")
 
-
-
-
+    audio_format_option = AudioFormatOption(
+        pipeline_cls=AsrPipeline,
+        pipeline_options=asr_pipeline_options,
+    )
+    format_options[InputFormat.AUDIO] = audio_format_option
 
+    # Common options for all pipelines
     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
-
+        asr_pipeline_options.artifacts_path = artifacts_path
 
     doc_converter = DocumentConverter(
         allowed_formats=from_formats,
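The rewritten section builds the audio pipeline once and registers it for InputFormat.AUDIO. A hedged sketch of the equivalent library-level wiring, reusing the names from the hunk above (import paths assumed from the docling package layout; the audio file is a placeholder):

from docling.datamodel.asr_model_specs import WHISPER_TURBO_MLX
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.document_converter import AudioFormatOption, DocumentConverter
from docling.pipeline.asr_pipeline import AsrPipeline

asr_pipeline_options = AsrPipelineOptions()
# WHISPER_TURBO would auto-select MLX vs. native; the _MLX variant forces MLX.
asr_pipeline_options.asr_options = WHISPER_TURBO_MLX

converter = DocumentConverter(
    format_options={
        InputFormat.AUDIO: AudioFormatOption(
            pipeline_cls=AsrPipeline,
            pipeline_options=asr_pipeline_options,
        )
    }
)
result = converter.convert("meeting.wav")  # placeholder path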
docling/datamodel/asr_model_specs.py
CHANGED

@@ -10,13 +10,394 @@ from docling.datamodel.pipeline_options_asr_model import (
     # AsrResponseFormat,
     # ApiAsrOptions,
     InferenceAsrFramework,
+    InlineAsrMlxWhisperOptions,
     InlineAsrNativeWhisperOptions,
     TransformersModelType,
 )
 
 _log = logging.getLogger(__name__)
 
-WHISPER_TINY = InlineAsrNativeWhisperOptions(
+
+def _get_whisper_tiny_model():
+    """
+    Get the best Whisper Tiny model for the current hardware.
+
+    Automatically selects MLX Whisper Tiny for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Tiny.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-tiny-mlx",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="tiny",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_TINY = _get_whisper_tiny_model()
+
+
+def _get_whisper_small_model():
+    """
+    Get the best Whisper Small model for the current hardware.
+
+    Automatically selects MLX Whisper Small for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Small.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-small-mlx",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="small",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_SMALL = _get_whisper_small_model()
+
+
+def _get_whisper_medium_model():
+    """
+    Get the best Whisper Medium model for the current hardware.
+
+    Automatically selects MLX Whisper Medium for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Medium.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-medium-mlx-8bit",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="medium",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_MEDIUM = _get_whisper_medium_model()
+
+
+def _get_whisper_base_model():
+    """
+    Get the best Whisper Base model for the current hardware.
+
+    Automatically selects MLX Whisper Base for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Base.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-base-mlx",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="base",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_BASE = _get_whisper_base_model()
+
+
+def _get_whisper_large_model():
+    """
+    Get the best Whisper Large model for the current hardware.
+
+    Automatically selects MLX Whisper Large for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Large.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-large-mlx-8bit",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="large",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_LARGE = _get_whisper_large_model()
+
+
+def _get_whisper_turbo_model():
+    """
+    Get the best Whisper Turbo model for the current hardware.
+
+    Automatically selects MLX Whisper Turbo for Apple Silicon (MPS) if available,
+    otherwise falls back to native Whisper Turbo.
+    """
+    # Check if MPS is available (Apple Silicon)
+    try:
+        import torch
+
+        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
+    except ImportError:
+        has_mps = False
+
+    # Check if mlx-whisper is available
+    try:
+        import mlx_whisper  # type: ignore
+
+        has_mlx_whisper = True
+    except ImportError:
+        has_mlx_whisper = False
+
+    # Use MLX Whisper if both MPS and mlx-whisper are available
+    if has_mps and has_mlx_whisper:
+        return InlineAsrMlxWhisperOptions(
+            repo_id="mlx-community/whisper-turbo",
+            inference_framework=InferenceAsrFramework.MLX,
+            language="en",
+            task="transcribe",
+            word_timestamps=True,
+            no_speech_threshold=0.6,
+            logprob_threshold=-1.0,
+            compression_ratio_threshold=2.4,
+        )
+    else:
+        return InlineAsrNativeWhisperOptions(
+            repo_id="turbo",
+            inference_framework=InferenceAsrFramework.WHISPER,
+            verbose=True,
+            timestamps=True,
+            word_timestamps=True,
+            temperature=0.0,
+            max_new_tokens=256,
+            max_time_chunk=30.0,
+        )
+
+
+# Create the model instance
+WHISPER_TURBO = _get_whisper_turbo_model()
+
+# Explicit MLX Whisper model options for users who want to force MLX usage
+WHISPER_TINY_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-tiny-mlx",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_SMALL_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-small-mlx",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_MEDIUM_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-medium-mlx-8bit",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_BASE_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-base-mlx",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_LARGE_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-large-mlx-8bit",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+WHISPER_TURBO_MLX = InlineAsrMlxWhisperOptions(
+    repo_id="mlx-community/whisper-turbo",
+    inference_framework=InferenceAsrFramework.MLX,
+    language="en",
+    task="transcribe",
+    word_timestamps=True,
+    no_speech_threshold=0.6,
+    logprob_threshold=-1.0,
+    compression_ratio_threshold=2.4,
+)
+
+# Explicit Native Whisper model options for users who want to force native usage
+WHISPER_TINY_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="tiny",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
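The six _get_whisper_*_model() helpers added above repeat an identical MPS and mlx-whisper probe; only the repo ids differ. A hedged refactor sketch showing how the probe could be hoisted into one parametrized factory (an alternative shape under that assumption, not the shipped code):

def _pick_whisper_options(mlx_repo_id: str, native_repo_id: str):
    """Return MLX options on Apple Silicon with mlx-whisper installed, else native."""
    try:
        import torch

        has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
    except ImportError:
        has_mps = False
    try:
        import mlx_whisper  # type: ignore  # noqa: F401

        has_mlx_whisper = True
    except ImportError:
        has_mlx_whisper = False

    if has_mps and has_mlx_whisper:
        return InlineAsrMlxWhisperOptions(
            repo_id=mlx_repo_id,
            inference_framework=InferenceAsrFramework.MLX,
            language="en",
            task="transcribe",
            word_timestamps=True,
            no_speech_threshold=0.6,
            logprob_threshold=-1.0,
            compression_ratio_threshold=2.4,
        )
    return InlineAsrNativeWhisperOptions(
        repo_id=native_repo_id,
        inference_framework=InferenceAsrFramework.WHISPER,
        verbose=True,
        timestamps=True,
        word_timestamps=True,
        temperature=0.0,
        max_new_tokens=256,
        max_time_chunk=30.0,
    )

# e.g. WHISPER_TINY = _pick_whisper_options("mlx-community/whisper-tiny-mlx", "tiny")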
@@ -27,7 +408,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_SMALL = InlineAsrNativeWhisperOptions(
+WHISPER_SMALL_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="small",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -38,7 +419,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
+WHISPER_MEDIUM_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="medium",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -49,7 +430,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_BASE = InlineAsrNativeWhisperOptions(
+WHISPER_BASE_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="base",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -60,7 +441,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_LARGE = InlineAsrNativeWhisperOptions(
+WHISPER_LARGE_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="large",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -71,7 +452,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
-WHISPER_TURBO = InlineAsrNativeWhisperOptions(
+WHISPER_TURBO_NATIVE = InlineAsrNativeWhisperOptions(
     repo_id="turbo",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -82,11 +463,32 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
     max_time_chunk=30.0,
 )
 
+# Note: The main WHISPER_* models (WHISPER_TURBO, WHISPER_BASE, etc.) automatically
+# select the best implementation (MLX on Apple Silicon, Native elsewhere).
+# Use the explicit _MLX or _NATIVE variants if you need to force a specific implementation.
+
 
 class AsrModelType(str, Enum):
+    # Auto-selecting models (choose best implementation for hardware)
     WHISPER_TINY = "whisper_tiny"
     WHISPER_SMALL = "whisper_small"
     WHISPER_MEDIUM = "whisper_medium"
     WHISPER_BASE = "whisper_base"
     WHISPER_LARGE = "whisper_large"
     WHISPER_TURBO = "whisper_turbo"
+
+    # Explicit MLX models (force MLX implementation)
+    WHISPER_TINY_MLX = "whisper_tiny_mlx"
+    WHISPER_SMALL_MLX = "whisper_small_mlx"
+    WHISPER_MEDIUM_MLX = "whisper_medium_mlx"
+    WHISPER_BASE_MLX = "whisper_base_mlx"
+    WHISPER_LARGE_MLX = "whisper_large_mlx"
+    WHISPER_TURBO_MLX = "whisper_turbo_mlx"
+
+    # Explicit Native models (force native implementation)
+    WHISPER_TINY_NATIVE = "whisper_tiny_native"
+    WHISPER_SMALL_NATIVE = "whisper_small_native"
+    WHISPER_MEDIUM_NATIVE = "whisper_medium_native"
+    WHISPER_BASE_NATIVE = "whisper_base_native"
+    WHISPER_LARGE_NATIVE = "whisper_large_native"
+    WHISPER_TURBO_NATIVE = "whisper_turbo_native"